In [1]:
import pandas as pd
import json

# 🔧 Replace with your actual file path in Colab
file_path = '/content/aios_process_log_stream.json'

# ✅ Read JSON lines
data = []
with open(file_path, 'r') as f:
    for i, line in enumerate(f):
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping line {i} due to JSON error: {e}")
            continue

# ✅ Convert to DataFrame
df = pd.DataFrame(data)

# ✅ Display basic info
print("Columns in dataset:\n", df.columns.tolist())
print("\nDataset shape:", df.shape)

# ✅ Display first few rows to inspect structure
df.head()

# Install openpyxl if needed in Colab
!pip install openpyxl

# Save to Excel
df.to_excel('/content/structured_dataset.xlsx', index=False)
print ("saved")


Columns in dataset:
 ['Timestamp', 'PID', 'Name', 'State', 'PPid', 'Threads', 'VmRSS', 'Priority', 'Nice', 'CPU_Usage_%', 'Total_Time_Ticks', 'Elapsed_Time_sec', 'Voluntary_ctxt_switches', 'Nonvoluntary_ctxt_switches', 'Scheduling_Policy', 'Sched_Stats', 'IO_Stats', 'Cmdline']

Dataset shape: (109979, 18)
saved


In [4]:
import numpy as np
# 🔧 Coerce the metric columns to numeric dtype; values that cannot be parsed become NaN
numeric_columns = [
    'CPU_Usage_%',
    'Nice',
    'Priority',
    'Total_Time_Ticks',
    'Elapsed_Time_sec',
    'Voluntary_ctxt_switches',
    'Nonvoluntary_ctxt_switches',
]
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

# 🔧 Derive avg_cpu_time safely: a zero elapsed time is swapped for NaN first,
# so the quotient for those rows is NaN rather than the result of dividing by zero
elapsed_sec = df['Elapsed_Time_sec'].replace(0, np.nan)
df['avg_cpu_time'] = df['Total_Time_Ticks'] / elapsed_sec

import pandas as pd
import numpy as np

# ----------------------------------------
# 🔷 1. Resource Usage Classification
# ----------------------------------------

def classify_resource(row):
    """Label one process sample as 'CPU-bound', 'IO-bound' or 'Mixed'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'CPU_Usage_%', 'Voluntary_ctxt_switches' and
            'Nonvoluntary_ctxt_switches'.

    Returns:
        'CPU-bound' when CPU usage is above 50 and nonvoluntary switches
        exceed voluntary ones; 'IO-bound' when CPU usage is below 20 and
        voluntary switches exceed nonvoluntary ones; 'Mixed' otherwise.
    """
    cpu_pct = row['CPU_Usage_%']
    voluntary = row['Voluntary_ctxt_switches']
    nonvoluntary = row['Nonvoluntary_ctxt_switches']

    # Guard clauses: the first rule that matches decides the label.
    if cpu_pct > 50 and nonvoluntary > voluntary:
        return 'CPU-bound'
    if cpu_pct < 20 and voluntary > nonvoluntary:
        return 'IO-bound'
    # Everything else, including rows where a comparison is False because
    # a value is NaN, lands in the catch-all bucket.
    return 'Mixed'

# Run the classifier once per row (axis='columns' is the named form of axis=1).
df['Resource_Type'] = df.apply(classify_resource, axis='columns')

# ----------------------------------------
# 🔷 2. Interactivity Classification
# ----------------------------------------

def classify_interactivity(row):
    """Label one process sample by how interactive its workload looks.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Scheduling_Policy', 'Nice', 'CPU_Usage_%' and
            'Total_Time_Ticks'.

    Returns:
        The label of the first rule that matches, checked in this order:
        'Real-time'   - policy is SCHED_FIFO or SCHED_RR;
        'Interactive' - nice <= 0, CPU usage < 30 and ticks < 500;
        'Background'  - nice > 10;
        'Batch'       - CPU usage > 50 and ticks > 2000;
        'Other'       - none of the above.
    """
    sched_policy = row['Scheduling_Policy']
    niceness = row['Nice']
    cpu_pct = row['CPU_Usage_%']
    cpu_ticks = row['Total_Time_Ticks']

    # Rule order matters: earlier rules take precedence over later ones.
    if sched_policy in ('SCHED_FIFO', 'SCHED_RR'):
        return 'Real-time'
    if niceness <= 0 and cpu_pct < 30 and cpu_ticks < 500:
        return 'Interactive'
    if niceness > 10:
        return 'Background'
    if cpu_pct > 50 and cpu_ticks > 2000:
        return 'Batch'
    return 'Other'

# Run the classifier once per row (axis='columns' is the named form of axis=1).
df['Interactivity'] = df.apply(classify_interactivity, axis='columns')

# ----------------------------------------
# 🔷 3. Priority Classification
# ----------------------------------------

def classify_priority(row):
    """Map a row's 'Nice' value to a coarse priority label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing 'Nice'.

    Returns:
        'High' for a negative nice value, 'Medium' for zero, 'Low' for a
        positive one, and 'Unknown' when the value is missing (NaN/None).
    """
    nice = row['Nice']
    # A missing value fails both comparisons below and would otherwise be
    # reported as 'Low'; label it explicitly instead of guessing.
    if pd.isna(nice):
        return 'Unknown'
    if nice < 0:
        return 'High'
    elif nice == 0:
        return 'Medium'
    else:
        return 'Low'

# Run the classifier once per row (axis='columns' is the named form of axis=1).
df['Priority_Class'] = df.apply(classify_priority, axis='columns')

# ----------------------------------------
# 🔷 4. Execution Time Classification
# ----------------------------------------

def classify_execution_time(row):
    """Bucket a row's 'Total_Time_Ticks' into an execution-time class.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            'Total_Time_Ticks'.

    Returns:
        'Short' below 500 ticks, 'Medium' from 500 up to (not including)
        2000 ticks, 'Long' from 2000 ticks, and 'Unknown' when the value
        is missing (NaN/None).
    """
    ticks = row['Total_Time_Ticks']
    # A missing value fails both comparisons below and would otherwise be
    # reported as 'Long'; label it explicitly instead of guessing.
    if pd.isna(ticks):
        return 'Unknown'
    if ticks < 500:
        return 'Short'
    elif ticks < 2000:
        return 'Medium'
    else:
        return 'Long'

# Run the classifier once per row (axis='columns' is the named form of axis=1).
df['Execution_Time_Class'] = df.apply(classify_execution_time, axis='columns')

# ----------------------------------------
# ✅ Check distributions to ensure correctness

# Each entry pairs the heading to print with the label column it summarises.
distribution_reports = [
    ("\n🔷 Resource Usage Classification Distribution:", 'Resource_Type'),
    ("\n🔷 Interactivity Classification Distribution:", 'Interactivity'),
    ("\n🔷 Priority Classification Distribution:", 'Priority_Class'),
    ("\n🔷 Execution Time Classification Distribution:", 'Execution_Time_Class'),
]
for heading, label_column in distribution_reports:
    print(heading)
    print(df[label_column].value_counts())

# ✅ Save the labeled dataset for model training later
df.to_csv("classified_dataset.csv", index=False)



🔷 Resource Usage Classification Distribution:
Resource_Type
IO-bound     106917
Mixed          3060
CPU-bound         2
Name: count, dtype: int64

🔷 Interactivity Classification Distribution:
Interactivity
Interactive    90869
Other           8532
Real-time       7240
Background      3316
Batch             22
Name: count, dtype: int64

🔷 Priority Classification Distribution:
Priority_Class
Medium    84649
High      21290
Low        4040
Name: count, dtype: int64

🔷 Execution Time Classification Distribution:
Execution_Time_Class
Short     102063
Medium      4927
Long        2989
Name: count, dtype: int64


In [5]:
# Count the missing values in every column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)


Timestamp                     0
PID                           0
Name                          0
State                         0
PPid                          0
Threads                       0
VmRSS                         0
Priority                      0
Nice                          0
CPU_Usage_%                   0
Total_Time_Ticks              0
Elapsed_Time_sec              0
Voluntary_ctxt_switches       0
Nonvoluntary_ctxt_switches    0
Scheduling_Policy             0
Sched_Stats                   0
IO_Stats                      0
Cmdline                       0
avg_cpu_time                  0
Resource_Type                 0
Interactivity                 0
Priority_Class                0
Execution_Time_Class          0
dtype: int64
