# DisTrack : Training and Evaluation of model

### 1. Imports

In [61]:
import sys
sys.path.append("..")
from utils.helper import save_file, cleaner

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### 2. Load and Prepare data

In [77]:
df

Unnamed: 0,active_window,keystrokes,mouse_clicks,idle_time_sec,label_encoded,keystroke_per_sec,mouse_clicks_per_sec,activity_rate,idle_ratio,key_mouse_ratio,idle_to_active_ratio
0,life in limbo oijage feat nick paona officia...,0,3,52.141,2,0.000000,0.050000,0.338639,0.869017,0.000000,13.035250
1,life in limbo oijage feat nick paona officia...,0,0,15.235,2,0.000000,0.000000,0.000000,0.253917,0.000000,15.235000
2,life in limbo oijage feat nick paona officia...,0,0,75.235,2,0.000000,0.000000,-0.000000,1.253917,0.000000,75.235000
3,durand cup imphal 🏆 neroca vs real kashmir ...,0,3,3.031,2,0.000000,0.050000,0.051752,0.050517,0.000000,0.757750
4,the final that stopped our hearts full highli...,0,1,14.250,2,0.000000,0.016667,0.021390,0.237500,0.000000,7.125000
...,...,...,...,...,...,...,...,...,...,...,...
192,untitled and more page personal microsoft​ ...,1,10,0.125,0,0.016667,0.166667,0.180698,0.002083,0.090909,0.010417
193,untitled and more page personal microsoft​ ...,50,0,3.578,0,0.833333,0.000000,0.870746,0.059633,50.000000,0.070157
194,untitled and more page personal microsoft​ ...,0,0,63.578,0,0.000000,0.000000,-0.000000,1.059633,0.000000,63.578000
195,untitled and more page personal microsoft​ ...,0,0,123.578,0,0.000000,0.000000,-0.000000,2.059633,0.000000,123.578000


In [64]:
df = pd.read_csv("../data/1minDataset.csv")

In [65]:
label_map = {
    "Focused" : 0,
    "Neutral" : 1,
    "Distracted" : 2
}

df['active_window'] = df['active_window'].map(cleaner)
df["label_encoded"] = df["label"].map(label_map)

In [66]:
df = df.drop(columns = ["timestamp", "hour", "min", "label"])
#keep till future development

In [67]:
df['keystroke_per_sec'] = df['keystrokes'] / 60
df['mouse_clicks_per_sec'] = df['mouse_clicks'] / 60
df['activity_rate'] = (df['keystrokes'] + df['mouse_clicks']) / (60 - df['idle_time_sec'] + 1)

df['idle_ratio'] = df['idle_time_sec'] / 60
df['key_mouse_ratio'] = df['keystrokes'] / (df['mouse_clicks'] + 1)
df['idle_to_active_ratio'] = df['idle_time_sec'] / (df['keystrokes'] + df['mouse_clicks'] + 1)

### 3. Train data

In [68]:
X = df.drop(columns = ["label_encoded"])
y = df["label_encoded"]

In [69]:
numeric_features = [col for col in X.columns if col != "active_window"]

In [70]:
text_transformer = Pipeline(steps=[
     ('tfidf', TfidfVectorizer())   
])

In [71]:
preprocessor = ColumnTransformer(
    transformers = [
        ('text', text_transformer, 'active_window'),
        ('num', 'passthrough', numeric_features)
    ]
)

In [72]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

### 4. Model Training

In [74]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### 4. Predict and Evaluation

In [75]:
pred = pipeline.predict(X_test)

In [76]:
sum(pred == y_test) / len(y_test)

0.875

### 5. Save pipeline as pkl

In [78]:
save_file("pipeline.pkl", pipeline)