# Distrack: Training and Evaluation of the Model

### 1.Imports 

In [2]:
import sys
sys.path.append("../..")
from utils.helper import save_file, cleaner

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer

### 2.Load and Prepare data

In [4]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
DATASET_CSV = "../../data/1minDataset.csv"
df = pd.read_csv(DATASET_CSV)

In [19]:
# Display the frame for a visual sanity check.
# NOTE(review): this cell carries execution count 19 while its neighbours are 4 and 6, and the
# saved output already shows label_encoded and the engineered ratio columns that are only
# created by later cells. It was run out of order; under Restart & Run All it will show the
# raw columns (including timestamp, hour, min and label) instead.
df

Unnamed: 0,active_window,keystrokes,mouse_clicks,idle_time_sec,label_encoded,keystroke_per_sec,mouse_clicks_per_sec,activity_rate,idle_ratio,key_mouse_ratio,idle_to_active_ratio
0,life in limbo oijage feat nick paona officia...,0,3,52.141,2,0.000000,0.050000,0.338639,0.869017,0.000000,13.035250
1,life in limbo oijage feat nick paona officia...,0,0,15.235,2,0.000000,0.000000,0.000000,0.253917,0.000000,15.235000
2,life in limbo oijage feat nick paona officia...,0,0,75.235,2,0.000000,0.000000,-0.000000,1.253917,0.000000,75.235000
3,durand cup imphal 🏆 neroca vs real kashmir ...,0,3,3.031,2,0.000000,0.050000,0.051752,0.050517,0.000000,0.757750
4,the final that stopped our hearts full highli...,0,1,14.250,2,0.000000,0.016667,0.021390,0.237500,0.000000,7.125000
...,...,...,...,...,...,...,...,...,...,...,...
192,untitled and more page personal microsoft​ ...,1,10,0.125,0,0.016667,0.166667,0.180698,0.002083,0.090909,0.010417
193,untitled and more page personal microsoft​ ...,50,0,3.578,0,0.833333,0.000000,0.870746,0.059633,50.000000,0.070157
194,untitled and more page personal microsoft​ ...,0,0,63.578,0,0.000000,0.000000,-0.000000,1.059633,0.000000,63.578000
195,untitled and more page personal microsoft​ ...,0,0,123.578,0,0.000000,0.000000,-0.000000,2.059633,0.000000,123.578000


In [6]:
# Integer code assigned to each label string; the classifier is trained on these codes.
label_map = {
    "Focused" : 0,
    "Neutral" : 1,
    "Distracted" : 2
}

# Run every window title through the project's `cleaner` helper before vectorisation.
# NOTE(review): presumably this normalises the title text - confirm in utils.helper.
df['active_window'] = df['active_window'].map(cleaner)

# Series.map yields NaN for any value that is not a key of label_map (including missing
# labels). Those NaNs would otherwise only surface later as an obscure failure in the
# stratified split or in fit, so stop here with a message naming the offending values.
encoded_labels = df["label"].map(label_map)
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    bad_values = sorted(df.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a label outside {list(label_map)}: {bad_values}"
    )
df['label_encoded'] = encoded_labels

In [7]:
# Remove the columns that are not fed to the model; the label survives as label_encoded.
df = df.drop(["timestamp", "hour", "min", "label"], axis="columns")

In [8]:
# Window length in seconds used to turn counts into rates.
# NOTE(review): presumably one sample per minute (the file is named 1minDataset) - confirm.
WINDOW_SEC = 60

# Combined input events in the window; shared by two of the features below.
total_events = df['keystrokes'] + df['mouse_clicks']

# idle_time_sec can exceed the window length (the dataset contains values such as 75.2 and
# 123.6). The previous denominator `60 - idle_time_sec + 1` therefore became zero at exactly
# 61 s (division by zero -> NaN/inf, which GaussianNB rejects) and negative beyond it.
# Clamp the active time at 0 before adding the +1 smoothing term; rows with
# idle_time_sec <= 60 are unaffected.
active_sec = (WINDOW_SEC - df['idle_time_sec']).clip(lower=0)

df['keystroke_per_sec'] = df['keystrokes'] / WINDOW_SEC
df['mouse_clicks_per_sec'] = df['mouse_clicks'] / WINDOW_SEC
df['activity_rate'] = total_events / (active_sec + 1)

df['idle_ratio'] = df['idle_time_sec'] / WINDOW_SEC
# The +1 in the denominators below avoids dividing by zero when a count is 0.
df['key_mouse_ratio'] = df['keystrokes'] / (df['mouse_clicks'] + 1)
df['idle_to_active_ratio'] = df['idle_time_sec'] / (total_events + 1)

### 3. Build features, preprocessing pipeline and train/test split

In [9]:
# Separate the encoded target from the model inputs.
target_col = "label_encoded"
y = df[target_col]
X = df.drop(target_col, axis="columns")

In [10]:
# Every input column except the free-text window title is handled as a numeric feature.
numeric_features = X.columns[X.columns != "active_window"].tolist()

In [11]:
# TF-IDF encoding of the window title, wrapped in its own single-step pipeline.
# The step name is spelled 'tfdif'; it is the key used by named_steps / get_params,
# so it is kept exactly as is.
tfidf_step = ('tfdif', TfidfVectorizer())
text_transformer = Pipeline(steps=[tfidf_step])

In [12]:
# Text branch: 'active_window' is given as a bare string (not a list) so the vectoriser
# receives a 1-D column of documents.
text_branch = ('text', text_transformer, 'active_window')

# Numeric branch: the engineered columns are forwarded unchanged.
numeric_branch = ('num', 'passthrough', numeric_features)

preprocessor = ColumnTransformer(transformers=[text_branch, numeric_branch])

In [13]:
def sparse_to_dense(x):
    """Return `x` as a dense NumPy array.

    ColumnTransformer only emits a sparse matrix while the combined output density
    stays below its `sparse_threshold`; otherwise it already returns a dense array,
    on which an unconditional `.toarray()` raises AttributeError. Both cases are
    handled here so the step works regardless of vocabulary size.

    Parameters
    ----------
    x : scipy sparse matrix or array-like
        Output of the preceding pipeline step.

    Returns
    -------
    numpy.ndarray
        Dense array with the same contents as `x`.
    """
    # Sparse containers expose toarray(); anything else is treated as array-like.
    if hasattr(x, "toarray"):
        return x.toarray()
    return np.asarray(x)

In [14]:
# Full model: preprocess -> densify -> Gaussian naive Bayes.
# The densify step exists because GaussianNB does not accept sparse input.
model_steps = [
    ('preprocessor', preprocessor),
    ("to_dense", FunctionTransformer(sparse_to_dense, accept_sparse=True)),
    ('classifier', GaussianNB()),
]
pipeline = Pipeline(steps=model_steps)

In [15]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class balance
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### 4. Train the model

In [16]:
# Fit the whole pipeline (vectoriser and classifier) on the training split only.
# fit() returns the fitted pipeline; as the cell's last expression it is rendered below.
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('to_dense', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,func,<function spa...00150489CE160>
,inverse_func,
,validate,False
,accept_sparse,True
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,priors,
,var_smoothing,1e-09


### 5.Predict and Evaluate

In [17]:
# Predict encoded labels for the held-out rows using the fitted pipeline.
pred = pipeline.predict(X_test)

In [18]:
# Per-class precision / recall / F1 on the held-out split.
# Class ids follow label_map: 0 = Focused, 1 = Neutral, 2 = Distracted.
report_text = classification_report(y_test, pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.93      0.93      0.93        15
           2       0.92      0.92      0.92        12

    accuracy                           0.95        40
   macro avg       0.95      0.95      0.95        40
weighted avg       0.95      0.95      0.95        40



### 6.Save pipeline as pkl

In [20]:
# Persist the fitted pipeline through the project's save_file helper.
# NOTE(review): if save_file relies on pickle/joblib, the `sparse_to_dense` function defined
# in this notebook is stored by reference only, so whatever loads "pipeline.pkl" must have a
# function with the same name importable from the same module path - confirm against the loader.
save_file("pipeline.pkl", pipeline)