# DisTrack: Training and Evaluation of model  

### 1.Import 

In [104]:
import sys
sys.path.append("../..")
from utils.helper import save_file,cleaner

In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

### 2. Load and Prepare data 

In [106]:
df = pd.read_csv("../../data/1minDataset.csv")

In [107]:
df

Unnamed: 0,timestamp,hour,min,active_window,keystrokes,mouse_clicks,idle_time_sec,label
0,2025-08-17 12:44:05,12,44,(42) LIFE IN LIMBO - OIJAGE FEAT. NICK PAONA |...,0,3,52.141,Distracted
1,2025-08-17 12:45:05,12,45,(42) LIFE IN LIMBO - OIJAGE FEAT. NICK PAONA |...,0,0,15.235,Distracted
2,2025-08-17 12:46:05,12,46,(42) LIFE IN LIMBO - OIJAGE FEAT. NICK PAONA |...,0,0,75.235,Distracted
3,2025-08-17 12:49:25,12,49,(42) Durand cup imphal 🏆 | Neroca vs Real Kash...,0,3,3.031,Distracted
4,2025-08-17 12:50:25,12,50,(42) The Final That Stopped Our Hearts: Full H...,0,1,14.250,Distracted
...,...,...,...,...,...,...,...,...
192,2025-08-13 19:31:22,19,31,Untitled2 and 1 more page - Personal - Microso...,1,10,0.125,Focused
193,2025-08-13 19:32:22,19,32,Untitled2 and 1 more page - Personal - Microso...,50,0,3.578,Focused
194,2025-08-13 19:33:22,19,33,Untitled2 and 1 more page - Personal - Microso...,0,0,63.578,Focused
195,2025-08-13 19:34:22,19,34,Untitled2 and 1 more page - Personal - Microso...,0,0,123.578,Focused


In [108]:
label_map ={
    "Focused" : 0,
    "Neutral" : 1,
    "Distracted" : 2
}
df['active_window'] = df['active_window'].map(cleaner)
df["label_encoded"] = df["label"].map(label_map)

In [109]:
df = df.drop(columns = ["timestamp", "hour","min","label"])

In [110]:
df['keystroke_per_sec'] = df['keystrokes'] / 60
df['mouse_clicks_per_sec'] = df['mouse_clicks'] / 60
df['activity_rate'] = (df['keystrokes'] + df['mouse_clicks']) / (60 - df['idle_time_sec'] + 1)

df['idle_ratio'] = df['idle_time_sec'] / 60
df['key_mouse_ratio'] = df['keystrokes'] / (df['keystrokes'] + df['mouse_clicks'] + 1)
df['idle_to_active_ratio'] = df['idle_time_sec'] / (df['keystrokes'] + df['mouse_clicks'] + 1)

### 3.Train data

In [111]:
X = df.drop(columns = ["label_encoded"])
y = df["label_encoded"]

In [112]:
numeric_features =  [col for col in X.columns if col != "active_window"]

In [113]:
text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer())
                                  
 ])

In [114]:
preprocessor = ColumnTransformer(
    transformers = [
        ('text', text_transformer, 'active_window'),
        ('num', 'passthrough', numeric_features)
    ]
)

In [115]:
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier())
    ])

In [116]:
X_train,X_test,y_train,y_test = train_test_split(X, y,  test_size=0.2, random_state=42, stratify=y)

### 4.Model Training

In [117]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


### 5. Predict and Evaluation

In [118]:
pred = pipeline.predict(X_test)

In [119]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.71      0.77      0.74        13
           1       0.56      0.67      0.61        15
           2       0.62      0.42      0.50        12

    accuracy                           0.62        40
   macro avg       0.63      0.62      0.62        40
weighted avg       0.63      0.62      0.62        40



### 6. Save pipeline as pkl

In [120]:
save_file("pipeline.pkl", pipeline)