# DisTrack : Training and Evaluation of model

### 1. Imports

In [1]:
import sys
sys.path.append("..")
from utils.helper import save_file

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

### 2. Load and Train data

In [35]:
df = pd.read_csv("../data/labeled_logs.csv")

In [42]:
df

Unnamed: 0,hour,min,active_window,keystrokes,mouse_clicks,idle__time_sec,label,label_encoded
0,20,39,Haule Haule - Full Song | Rab Ne Bana Di Jodi ...,12,0,51.0,Focused,0
1,20,40,logger.py - DisTrack - Visual Studio Code,3,1,49.156,Focused,0
2,20,41,WhatsApp,2,8,0.0,Focused,0
3,20,42,WhatsApp,0,3,0.031,Focused,0
4,20,43,Spider Solitaire,0,12,0.0,Focused,0
5,20,44,Spider Solitaire,0,19,0.765,Focused,0
6,20,45,Spider Solitaire,2,16,0.0,Focused,0
7,20,46,Spider Solitaire,0,17,0.468,Focused,0
8,20,47,Spider Solitaire,0,17,1.109,Focused,0
9,20,48,Spider Solitaire,0,12,0.234,Neutral,1


In [43]:
label_map = {
    "Focused" : 0,
    "Neutral" : 1,
    "Distracted" : 2
}

In [44]:
df["label_encoded"] = df["label"].map(label_map)

In [45]:
if "timestamp" in df.columns:
    df = df.drop(columns=["timestamp", "sl_no"])

In [46]:
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
df["minute_sin"] = np.sin(2 * np.pi * df["min"] / 60)
df["minute_cos"] = np.cos(2 * np.pi * df["min"] / 60)

df = df.drop(columns = ["hour", "min"])

In [51]:
X = df.drop(columns = ["label", "label_encoded"])
y = df["label"]

In [52]:
X

Unnamed: 0,active_window,keystrokes,mouse_clicks,idle__time_sec,hour_sin,hour_cos,minute_sin,minute_cos
0,Haule Haule - Full Song | Rab Ne Bana Di Jodi ...,12,0,51.0,-0.866025,0.5,-0.809017,-0.5877853
1,logger.py - DisTrack - Visual Studio Code,3,1,49.156,-0.866025,0.5,-0.866025,-0.5
2,WhatsApp,2,8,0.0,-0.866025,0.5,-0.913545,-0.4067366
3,WhatsApp,0,3,0.031,-0.866025,0.5,-0.951057,-0.309017
4,Spider Solitaire,0,12,0.0,-0.866025,0.5,-0.978148,-0.2079117
5,Spider Solitaire,0,19,0.765,-0.866025,0.5,-0.994522,-0.1045285
6,Spider Solitaire,2,16,0.0,-0.866025,0.5,-1.0,-1.83697e-16
7,Spider Solitaire,0,17,0.468,-0.866025,0.5,-0.994522,0.1045285
8,Spider Solitaire,0,17,1.109,-0.866025,0.5,-0.978148,0.2079117
9,Spider Solitaire,0,12,0.234,-0.866025,0.5,-0.951057,0.309017


In [48]:
categorial_features = ["active_window"]
numeric_features = [col for col in X.columns if col != "active_window"]

In [49]:
preprocessor = ColumnTransformer(
    transformers = [
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorial_features),
        ('num', 'passthrough', numeric_features)
    ]
)

In [10]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

### 3. Model Training

In [12]:
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### 4. Predict and Evaluation

In [13]:
pred = pipeline.predict(X_test)

In [14]:
sum(pred == y_test) / len(y_test)

1.0

### 5. Save pipeline as pkl

In [15]:
save_file("pipeline.pkl", pipeline)

In [50]:
df

Unnamed: 0,active_window,keystrokes,mouse_clicks,idle__time_sec,label,label_encoded,hour_sin,hour_cos,minute_sin,minute_cos
0,Haule Haule - Full Song | Rab Ne Bana Di Jodi ...,12,0,51.0,Focused,0,-0.866025,0.5,-0.809017,-0.5877853
1,logger.py - DisTrack - Visual Studio Code,3,1,49.156,Focused,0,-0.866025,0.5,-0.866025,-0.5
2,WhatsApp,2,8,0.0,Focused,0,-0.866025,0.5,-0.913545,-0.4067366
3,WhatsApp,0,3,0.031,Focused,0,-0.866025,0.5,-0.951057,-0.309017
4,Spider Solitaire,0,12,0.0,Focused,0,-0.866025,0.5,-0.978148,-0.2079117
5,Spider Solitaire,0,19,0.765,Focused,0,-0.866025,0.5,-0.994522,-0.1045285
6,Spider Solitaire,2,16,0.0,Focused,0,-0.866025,0.5,-1.0,-1.83697e-16
7,Spider Solitaire,0,17,0.468,Focused,0,-0.866025,0.5,-0.994522,0.1045285
8,Spider Solitaire,0,17,1.109,Focused,0,-0.866025,0.5,-0.978148,0.2079117
9,Spider Solitaire,0,12,0.234,Neutral,1,-0.866025,0.5,-0.951057,0.309017


In [54]:
df = {"active_window":"WhatsApp", 'keystrokes': 12,  'mouse_clicks': 5, 'idle__time_sec': 51.00, 'hour_sin' : -0.866025,	'hour_cos' : 0.500000	,	'minute_sin' : -0.809017,	'minute_cos' : -5.877853e-01}
pipeline.predict(df)

ValueError: Expected 2D array, got scalar array instead:
array={'active_window': 'WhatsApp', 'keystrokes': 12, 'mouse_clicks': 5, 'idle__time_sec': 51.0, 'hour_sin': -0.866025, 'hour_cos': 0.5, 'minute_sin': -0.809017, 'minute_cos': -0.5877853}.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.