In [1]:
import pandas as pd

# Location of the labelled dataset; change this if the file lives elsewhere
file_path = '/content/classified_dataset.csv'

# Read the whole CSV into a DataFrame
df = pd.read_csv(file_path)

# Report the dimensions and column names, then show the first rows
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()


Shape: (109979, 23)
Columns: ['Timestamp', 'PID', 'Name', 'State', 'PPid', 'Threads', 'VmRSS', 'Priority', 'Nice', 'CPU_Usage_%', 'Total_Time_Ticks', 'Elapsed_Time_sec', 'Voluntary_ctxt_switches', 'Nonvoluntary_ctxt_switches', 'Scheduling_Policy', 'Sched_Stats', 'IO_Stats', 'Cmdline', 'avg_cpu_time', 'Resource_Type', 'Interactivity', 'Priority_Class', 'Execution_Time_Class']


Unnamed: 0,Timestamp,PID,Name,State,PPid,Threads,VmRSS,Priority,Nice,CPU_Usage_%,...,Nonvoluntary_ctxt_switches,Scheduling_Policy,Sched_Stats,IO_Stats,Cmdline,avg_cpu_time,Resource_Type,Interactivity,Priority_Class,Execution_Time_Class
0,2025-07-20T02:27:49.367028,1,systemd,S (sleeping),0,1,13808 kB,20,0,0.031987,...,575,SCHED_OTHER,"systemd (1, #threads: 1)\n--------------------...",,/sbin/init splash,0.031987,IO-bound,Interactive,Medium,Short
1,2025-07-20T02:27:49.367161,2,kthreadd,S (sleeping),0,1,0 kB,20,0,0.002861,...,3,SCHED_OTHER,"kthreadd (2, #threads: 1)\n-------------------...",,,0.002861,IO-bound,Interactive,Medium,Short
2,2025-07-20T02:27:49.367255,3,pool_workqueue_release,S (sleeping),2,1,0 kB,20,0,0.0,...,0,SCHED_OTHER,"pool_workqueue_ (3, #threads: 1)\n------------...",,,0.0,IO-bound,Interactive,Medium,Short
3,2025-07-20T02:27:49.367346,4,kworker/R-rcu_gp,I (idle),2,1,0 kB,0,-20,0.0,...,0,SCHED_OTHER,"kworker/R-rcu_g (4, #threads: 1)\n------------...",,,0.0,IO-bound,Interactive,High,Short
4,2025-07-20T02:27:49.367428,5,kworker/R-sync_wq,I (idle),2,1,0 kB,0,-20,0.0,...,0,SCHED_OTHER,"kworker/R-sync_ (5, #threads: 1)\n------------...",,,0.0,IO-bound,Interactive,High,Short


In [2]:
# Positional, order-preserving split of the loaded frame:
# rows [0, 50000) -> train, [50000, 75000) -> test, the remainder -> validate.
train_raw = df.iloc[:50000]
test_raw = df.iloc[50000:75000]
validate_raw = df.iloc[75000:]

# Write to the same absolute locations the later cells read from
# ('/content/train_raw.csv', '/content/test_raw.csv') so the hand-off does not
# depend on the kernel's current working directory.
train_raw.to_csv("/content/train_raw.csv", index=False)
test_raw.to_csv("/content/test_raw.csv", index=False)
validate_raw.to_csv("/content/validate_raw.csv", index=False)


In [12]:
import pandas as pd

# Read the training split back from disk
train_raw = pd.read_csv('/content/train_raw.csv')

# Target columns that will each get their own classifier
label_columns = ['Resource_Type', 'Interactivity', 'Priority_Class', 'Execution_Time_Class']

# Print the class counts of every target, one heading per column
for col in label_columns:
    print(f"\n🔸 Distribution of '{col}':", train_raw[col].value_counts(), sep="\n")



🔸 Distribution of 'Resource_Type':
Resource_Type
IO-bound     48671
Mixed         1328
CPU-bound        1
Name: count, dtype: int64

🔸 Distribution of 'Interactivity':
Interactivity
Interactive    41295
Other           3870
Real-time       3300
Background      1535
Name: count, dtype: int64

🔸 Distribution of 'Priority_Class':
Priority_Class
Medium    38489
High       9646
Low        1865
Name: count, dtype: int64

🔸 Distribution of 'Execution_Time_Class':
Execution_Time_Class
Short     46461
Medium     2248
Long       1291
Name: count, dtype: int64


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

def preprocess_dataset(df, fit_encoders=True):
    """Clean a raw process-sample frame, add engineered features and encode categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns read below. It is not modified; a
        processed copy is returned.
    fit_encoders : bool, default True
        True  -> fit a new LabelEncoder for 'State' and 'Scheduling_Policy' and
                 save each one as ``le_<column lowercased>.pkl``.
        False -> load those saved encoders and apply them, so another split gets
                 exactly the same integer codes as the frame they were fitted on.
                 Categories that were not seen at fit time are encoded as -1.

    Returns
    -------
    pandas.DataFrame
        Processed copy of ``df`` without the identifier/free-text columns.
    """
    # Work on a copy so the caller's frame keeps its raw values.
    df = df.copy()

    # 🧼 Clean & convert
    # VmRSS arrives as text such as "13808 kB"; strip the unit and parse as float.
    df['VmRSS'] = df['VmRSS'].str.replace('kB', '', regex=False).astype(float)
    # Unparseable values become NaN and are then replaced by 0.
    for col in ['CPU_Usage_%', 'Nice', 'Priority', 'Total_Time_Ticks']:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    # Zero elapsed time is nudged to a tiny positive value so the ratio below
    # never divides by zero. NOTE: unparseable values stay NaN here (no fillna).
    df['Elapsed_Time_sec'] = pd.to_numeric(df['Elapsed_Time_sec'], errors='coerce').replace(0, 1e-5)
    for col in ['Voluntary_ctxt_switches', 'Nonvoluntary_ctxt_switches']:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # 🔧 Engineered features
    df['cpu_to_elapsed_ratio'] = df['CPU_Usage_%'] / df['Elapsed_Time_sec']
    # +1 in the denominator avoids division by zero when there are no involuntary switches.
    df['interactivity_score'] = df['Voluntary_ctxt_switches'] / (df['Nonvoluntary_ctxt_switches'] + 1)
    # Must be derived before 'State' is replaced by integer codes below.
    df['is_sleeping'] = df['State'].str.lower().str.contains('sleeping', na=False).astype(int)

    # 🔠 Encode categoricals
    for col in ['State', 'Scheduling_Policy']:
        encoder_path = f'le_{col.lower()}.pkl'  # lowercase filenames like le_state.pkl
        values = df[col].astype(str)
        if fit_encoders:
            le = LabelEncoder()
            df[col] = le.fit_transform(values)
            joblib.dump(le, encoder_path)
        else:
            # Reuse the fitted encoder instead of refitting: refitting on another
            # split can assign different integers to the same category and would
            # overwrite the saved encoder.
            le = joblib.load(encoder_path)
            code_of = {label: code for code, label in enumerate(le.classes_)}
            df[col] = values.map(code_of).fillna(-1).astype(int)

    # 🧹 Drop identifier / free-text columns that are not used as features
    drop_cols = ['Timestamp', 'PID', 'Name', 'PPid', 'Cmdline', 'Sched_Stats', 'IO_Stats']
    df = df.drop(columns=drop_cols, errors='ignore')

    return df

# ✨ Apply to both train and test
train_raw = pd.read_csv('/content/train_raw.csv')
test_raw = pd.read_csv('/content/test_raw.csv')

# Fit the encoders on the training split only, then reuse them for the test split.
train_processed = preprocess_dataset(train_raw)
test_processed = preprocess_dataset(test_raw, fit_encoders=False)

# 💾 Save cleaned versions
train_processed.to_csv('/content/train_processed.csv', index=False)
test_processed.to_csv('/content/test_processed.csv', index=False)

print("✅ Preprocessing done! Ready for SMOTE and model training.")


✅ Preprocessing done! Ready for SMOTE and model training.


In [5]:
# Reload the processed training split and print the class counts of every target
df = pd.read_csv('/content/train_processed.csv')

for label in ('Resource_Type', 'Interactivity', 'Priority_Class', 'Execution_Time_Class'):
    print(f"\n🔸 {label} distribution:\n", df[label].value_counts())



🔸 Resource_Type distribution:
 Resource_Type
IO-bound     48671
Mixed         1328
CPU-bound        1
Name: count, dtype: int64

🔸 Interactivity distribution:
 Interactivity
Interactive    41295
Other           3870
Real-time       3300
Background      1535
Name: count, dtype: int64

🔸 Priority_Class distribution:
 Priority_Class
Medium    38489
High       9646
Low        1865
Name: count, dtype: int64

🔸 Execution_Time_Class distribution:
 Execution_Time_Class
Short     46461
Medium     2248
Long       1291
Name: count, dtype: int64


In [6]:
from sklearn.utils import resample

df = pd.read_csv('/content/train_processed.csv')

# Separate the minority class.
cpu_mask = df['Resource_Type'] == 'CPU-bound'
cpu_df = df[cpu_mask]

# Duplicate (sample with replacement) the CPU-bound rows up to 10 when there are
# fewer than 6 of them -- presumably because SMOTE with its default
# k_neighbors=5 needs at least 6 rows per class (TODO confirm against the
# installed imbalanced-learn version).
# The `0 <` guard skips the step when the class is absent altogether, since
# resampling an empty frame raises instead of producing rows.
# NOTE(review): the upsampled rows are exact duplicates, so anything SMOTE
# synthesises for this class can only repeat the rows that already exist.
if 0 < len(cpu_df) < 6:
    cpu_df_upsampled = resample(cpu_df, replace=True, n_samples=10, random_state=42)
    df = pd.concat([df[~cpu_mask], cpu_df_upsampled])
    # Shuffle so the duplicated rows are not all at the end of the file.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df.to_csv('/content/train_smote_ready.csv', index=False)
print("✅ Train data ready for SMOTE!")


✅ Train data ready for SMOTE!


In [7]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO: pin the imbalanced-learn version once the one in use is known.
%pip install -U imbalanced-learn




In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

def apply_smote(df, label_col, feature_cols):
    """Oversample `df` with SMOTE for a single target column.

    The target is label-encoded before resampling and decoded again afterwards,
    so the returned frame carries the original label values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `feature_cols` and `label_col`.
    label_col : str
        Name of the target column to balance.
    feature_cols : list of str
        Columns passed to SMOTE as features.

    Returns
    -------
    (pandas.DataFrame, LabelEncoder)
        The resampled frame (features plus `label_col`) and the encoder that was
        fitted on the target.
    """
    print(f"\n🔹 Balancing for: {label_col}")

    features, labels = df[feature_cols], df[label_col]

    # SMOTE is fed integer class ids; keep the encoder to map them back.
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)

    sampler = SMOTE(random_state=42)
    X_balanced, y_balanced = sampler.fit_resample(features, encoded_labels)

    balanced = pd.DataFrame(X_balanced, columns=feature_cols)
    balanced[label_col] = encoder.inverse_transform(y_balanced)

    class_counts = balanced[label_col].value_counts().to_dict()
    print(f"✅ After SMOTE: {class_counts}")
    return balanced, encoder


import json

# Features shared by all four classifiers.
base_feats = ['VmRSS', 'CPU_Usage_%', 'Nice', 'Priority', 'Threads',
              'Voluntary_ctxt_switches', 'Nonvoluntary_ctxt_switches',
              'Scheduling_Policy', 'State', 'Total_Time_Ticks', 'Elapsed_Time_sec']

# Per-target feature sets: the shared base plus target-specific extras.
resource_feats     = base_feats + ['interactivity_score']
interactivity_feats = base_feats + ['interactivity_score', 'is_sleeping']
priority_feats     = base_feats + ['avg_cpu_time', 'cpu_to_elapsed_ratio']
execution_feats    = base_feats + ['avg_cpu_time']

# Persist each feature list as JSON. `with` closes (and flushes) every file
# deterministically instead of leaving the handle to the garbage collector.
for feature_file, feature_list in (
    ("resource_features.json", resource_feats),
    ("interactivity_features.json", interactivity_feats),
    ("priority_features.json", priority_feats),
    ("execution_features.json", execution_feats),
):
    with open(feature_file, "w") as fh:
        json.dump(feature_list, fh)



In [9]:
import joblib

# Training frame prepared for SMOTE by the upsampling step
train_df = pd.read_csv('/content/train_smote_ready.csv')

# Balance each target independently with its own feature set
resource_data, le_resource = apply_smote(train_df, 'Resource_Type', resource_feats)          # Resource Type
inter_data, le_inter = apply_smote(train_df, 'Interactivity', interactivity_feats)           # Interactivity
priority_data, le_priority = apply_smote(train_df, 'Priority_Class', priority_feats)         # Priority Class
execution_data, le_execution = apply_smote(train_df, 'Execution_Time_Class', execution_feats)  # Execution Time

# Write one balanced CSV per target
for balanced_frame, csv_path in (
    (resource_data, '/content/balanced_resource_data.csv'),
    (inter_data, '/content/balanced_interactivity_data.csv'),
    (priority_data, '/content/balanced_priority_data.csv'),
    (execution_data, '/content/balanced_execution_data.csv'),
):
    balanced_frame.to_csv(csv_path, index=False)

# Persist the target encoders; the last dump stays a bare expression so the
# cell still displays its return value.
for target_encoder, pkl_path in (
    (le_resource, 'le_resource.pkl'),
    (le_inter, 'le_inter.pkl'),
    (le_priority, 'le_priority.pkl'),
):
    joblib.dump(target_encoder, pkl_path)
joblib.dump(le_execution, 'le_execution.pkl')




🔹 Balancing for: Resource_Type
✅ After SMOTE: {'IO-bound': 48671, 'Mixed': 48671, 'CPU-bound': 48671}

🔹 Balancing for: Interactivity
✅ After SMOTE: {'Interactive': 41295, 'Other': 41295, 'Background': 41295, 'Real-time': 41295}

🔹 Balancing for: Priority_Class
✅ After SMOTE: {'Medium': 38498, 'High': 38498, 'Low': 38498}

🔹 Balancing for: Execution_Time_Class
✅ After SMOTE: {'Short': 46470, 'Long': 46470, 'Medium': 46470}


['le_execution.pkl']

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# ✨ Load feature sets
import json

# `with` closes each file as soon as it has been parsed instead of leaving the
# handle open until garbage collection.
with open("resource_features.json") as fh:
    resource_feats = json.load(fh)
with open("interactivity_features.json") as fh:
    interactivity_feats = json.load(fh)
with open("priority_features.json") as fh:
    priority_feats = json.load(fh)
with open("execution_features.json") as fh:
    execution_feats = json.load(fh)

# 🧠 Generic Training Function
def train_and_save_model(df_path, features, label_col, model_name):
    """Fit a random forest on one CSV, report its training-set fit and pickle it.

    Parameters
    ----------
    df_path : str
        Path of the CSV that holds both the features and the target column.
    features : list of str
        Feature columns used as model input.
    label_col : str
        Target column.
    model_name : str
        Stem of the output file; the model is written to ``<model_name>.pkl``.

    Returns
    -------
    RandomForestClassifier
        The fitted model.

    Note: the accuracy and report printed here are computed on the same rows the
    model was fitted on, so they describe the fit, not generalisation.
    """
    print(f"\n🔹 Training model for: {label_col}")
    frame = pd.read_csv(df_path)

    inputs, target = frame[features], frame[label_col]

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(inputs, target)

    # Score against the training rows themselves
    train_preds = forest.predict(inputs)
    print(f"✅ Accuracy on training set: {accuracy_score(target, train_preds):.4f}")
    print(classification_report(target, train_preds))

    output_file = f"{model_name}.pkl"
    joblib.dump(forest, output_file)
    print(f"💾 Model saved as {output_file}")

    return forest

# 📦 Train each model
# The balanced CSVs are written to '/content/...'; read them from the same
# absolute paths so training does not depend on the current working directory.
rf_resource_model = train_and_save_model("/content/balanced_resource_data.csv", resource_feats, 'Resource_Type', 'rf_resource_model')
rf_inter_model    = train_and_save_model("/content/balanced_interactivity_data.csv", interactivity_feats, 'Interactivity', 'rf_interactivity_model')
rf_priority_model = train_and_save_model("/content/balanced_priority_data.csv", priority_feats, 'Priority_Class', 'rf_priority_model')
rf_execution_model= train_and_save_model("/content/balanced_execution_data.csv", execution_feats, 'Execution_Time_Class', 'rf_execution_model')



🔹 Training model for: Resource_Type
✅ Accuracy on training set: 1.0000
              precision    recall  f1-score   support

   CPU-bound       1.00      1.00      1.00     48671
    IO-bound       1.00      1.00      1.00     48671
       Mixed       1.00      1.00      1.00     48671

    accuracy                           1.00    146013
   macro avg       1.00      1.00      1.00    146013
weighted avg       1.00      1.00      1.00    146013

💾 Model saved as rf_resource_model.pkl

🔹 Training model for: Interactivity
✅ Accuracy on training set: 1.0000
              precision    recall  f1-score   support

  Background       1.00      1.00      1.00     41295
 Interactive       1.00      1.00      1.00     41295
       Other       1.00      1.00      1.00     41295
   Real-time       1.00      1.00      1.00     41295

    accuracy                           1.00    165180
   macro avg       1.00      1.00      1.00    165180
weighted avg       1.00      1.00      1.00    165180

💾

In [11]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report
import json

# ✨ Load test data
test_df = pd.read_csv("/content/test_processed.csv")

# ✨ Load feature sets
# `with` closes each file as soon as it has been parsed instead of leaving the
# handle open until garbage collection.
with open("resource_features.json") as fh:
    resource_feats = json.load(fh)
with open("interactivity_features.json") as fh:
    interactivity_feats = json.load(fh)
with open("priority_features.json") as fh:
    priority_feats = json.load(fh)
with open("execution_features.json") as fh:
    execution_feats = json.load(fh)

# Restore the four pickled classifiers, in the order they are unpacked
(rf_resource_model,
 rf_inter_model,
 rf_priority_model,
 rf_execution_model) = map(joblib.load, (
    "rf_resource_model.pkl",
    "rf_interactivity_model.pkl",
    "rf_priority_model.pkl",
    "rf_execution_model.pkl",
))

# Restore the matching target LabelEncoders
(le_resource,
 le_inter,
 le_priority,
 le_execution) = map(joblib.load, (
    "le_resource.pkl",
    "le_inter.pkl",
    "le_priority.pkl",
    "le_execution.pkl",
))

# 🧪 Evaluation Function
from sklearn.metrics import classification_report

def evaluate_model(model, le, df, features, label_col):
    """Print a classification report for `model` on `df`.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    le : label encoder for the target; accepted for call-site symmetry but not
        used by this function.
    df : pandas.DataFrame
        Frame holding `features` and `label_col`.
    features : list of str
        Columns passed to ``model.predict``.
    label_col : str
        Column with the true labels.

    Only labels that occur in the true column are listed in the report.
    """
    inputs = df[features]
    actual = df[label_col].astype(str)  # compare as strings

    # Cast predictions to strings as well so both sides share one dtype
    predicted = pd.Series(model.predict(inputs)).astype(str)

    labels_in_truth = sorted(actual.unique())
    print(f"\n🔎 Evaluation for: {label_col}")
    report = classification_report(
        actual,
        predicted,
        labels=labels_in_truth,
        target_names=labels_in_truth,
    )
    print(report)


# Report held-out performance for each of the four classifiers
evaluate_model(model=rf_resource_model, le=le_resource, df=test_df,
               features=resource_feats, label_col="Resource_Type")
evaluate_model(model=rf_inter_model, le=le_inter, df=test_df,
               features=interactivity_feats, label_col="Interactivity")
evaluate_model(model=rf_priority_model, le=le_priority, df=test_df,
               features=priority_feats, label_col="Priority_Class")
evaluate_model(model=rf_execution_model, le=le_execution, df=test_df,
               features=execution_feats, label_col="Execution_Time_Class")



🔎 Evaluation for: Resource_Type
              precision    recall  f1-score   support

    IO-bound       1.00      1.00      1.00     24341
       Mixed       1.00      1.00      1.00       659

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000


🔎 Evaluation for: Interactivity
              precision    recall  f1-score   support

  Background       1.00      1.00      1.00       747
 Interactive       1.00      1.00      1.00     20534
       Other       1.00      1.00      1.00      2062
   Real-time       1.00      1.00      1.00      1657

    accuracy                           1.00     25000
   macro avg       1.00      1.00      1.00     25000
weighted avg       1.00      1.00      1.00     25000


🔎 Evaluation for: Priority_Class
              precision    recall  f1-score   support

        High       1.00      1.00      1.00      4817
         Low       1.00      