### Lectura de datos ###

In [None]:
import pandas as pd
# Load the raw log records from the JSON file (path relative to the working directory) into a DataFrame.
df = pd.read_json("raw_logs.json")

### Filtrado de datos ###

Filtra los logs que no posean nivel

In [None]:
# Replace missing values with 0 and keep only the logs whose level is positive.
# NOTE: fillna(0) is applied to every column of the frame, not only to "level".
df_filtrado = df.fillna(0)
df_filtrado = df_filtrado[df_filtrado["level"] > 0]

# Summary statistics of the level-filtered logs; as the last expression of the
# cell this is the value the notebook displays.
df_filtrado.describe()

Filtrado de outliers de forma visual

In [None]:
# Exclusive upper bounds per column; a row survives only if it is strictly
# below every bound. Keeping them in one table makes the thresholds easy to
# review and tune.
UPPER_BOUNDS = {
    "avgFocusDuration": 2500,
    "avgPauseDuration": 8000,
    "avgSpeedPerTest": 0.23,
    "clicksPerTest": 70,
    "timePerTest": 400000,
    "totalPauses": 24,
    "avgScrollSpeed": 0.13,
    "avgIntratecla": 2000,
    "totalBorrado": 15,
}

# Start again from the raw frame so that this cell can be re-run on its own.
df_filtrado = df.fillna(0)
df_filtrado = df_filtrado[df_filtrado["level"] > 0]

# Besides its upper bound, the average pause duration must be strictly positive.
df_filtrado = df_filtrado[df_filtrado["avgPauseDuration"] > 0]

for bounded_column, upper_bound in UPPER_BOUNDS.items():
    df_filtrado = df_filtrado[df_filtrado[bounded_column] < upper_bound]

# Printed explicitly: a notebook only displays the last expression of a cell,
# so a bare describe() in the middle of the cell would be silently discarded.
print(df_filtrado.describe())

pd.plotting.scatter_matrix(df_filtrado, figsize=(50,50))

In [None]:
import matplotlib.pyplot as plt

# One standalone box plot per column of the filtered frame, skipping 'name'.
column_names = list(df_filtrado.columns)
column_names.remove('name')

for plotted_column in column_names:
    df_filtrado.boxplot(column=[plotted_column], figsize=(10, 10))
    plt.show()

Análisis de outliers con distancia de Mahalanobis

In [None]:
import numpy as np
from scipy.stats import chi2

# Feature matrix: every column of the visually filtered frame except 'name',
# converted to a NumPy array (one row per log).
df_mahalanobis = df_filtrado.copy()
df_mahalanobis.pop("name")
df_mahalanobis = df_mahalanobis.to_numpy()

# Covariance matrix (rowvar=False: columns are the variables, rows the samples)
covariance = np.cov(df_mahalanobis, rowvar=False)

# Inverse of the covariance matrix
covariance_pm1 = np.linalg.inv(covariance)

# Centre point: per-column mean
centerpoint = np.mean(df_mahalanobis, axis=0)

# Squared Mahalanobis distance of every row to the centre point, computed for
# all rows at once: d_i = (x_i - c)^T . S^-1 . (x_i - c)
deltas = df_mahalanobis - centerpoint
distances = np.einsum("ij,jk,ik->i", deltas, covariance_pm1, deltas)

# Chi-square cut-off (95th percentile, one degree of freedom per column) used
# to flag anomalies; it is compared against the squared distances above.
cutoff = chi2.ppf(0.95, df_mahalanobis.shape[1])
print(cutoff)

# Indices of the outliers
is_outlier = distances > cutoff
outlierIndexes = np.where(is_outlier)

print('--- Indices de outliers ---')
print(outlierIndexes)

print('--- Muestras encontradas como outliers ---')
print(df_mahalanobis[is_outlier, :])

# Filtered logs without the rows flagged above. The mask is positional: row i
# of the array comes from row i of df_filtrado. Nothing else in the visible
# notebook consumes this frame yet.
df_mahalanobis_filtrado = df_filtrado[~is_outlier]

df_mahalanobis

Análisis y normalización de velocidades

In [None]:
def remove_outliers(df, value):
    """Return the rows of ``df`` whose ``value`` column lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; both ends are
    inclusive. The input frame is left untouched.

    Args:
        df: DataFrame to filter.
        value: label of the column the fences are computed on.

    Returns:
        A new DataFrame with the rows that fall within the fences.
    """
    # Work on a private copy so the result is detached from the caller's frame.
    frame = df.copy()
    column = frame[value]

    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    inside_fences = (column >= lower_fence) & (column <= upper_fence)
    return frame.loc[inside_fences]

# Keep only the three speed metrics plus 'level'; column_names ends up holding
# every other column, which is then dropped from the working frame.
column_names = df_filtrado.columns.values.tolist()
for kept_column in ('avgSpeedPerTest', 'avgIntratecla', 'avgScrollSpeed', 'level'):
    column_names.remove(kept_column)

df_normalized = df_filtrado.drop(columns=column_names)

# Only logs with a strictly positive scroll speed are considered.
df_normalized = df_normalized[df_normalized["avgScrollSpeed"] > 0]

# IQR-based outlier removal, applied one speed column after another.
for speed_column in ('avgSpeedPerTest', 'avgIntratecla', 'avgScrollSpeed'):
    df_normalized = remove_outliers(df_normalized, speed_column)

# Scale every column except 'level' by its largest absolute value.
for column in df_normalized.columns:
    if column != "level":
        df_normalized[column] = df_normalized[column] / df_normalized[column].abs().max()

print(df_normalized.describe())
df_normalized.boxplot(figsize=(15, 15))

import os
from pathlib import Path

# Persist the normalized frame as CSV in the current working directory.
filepath = os.path.join(Path().absolute(), 'df_normalized.csv')
df_normalized.to_csv(filepath)

### Elección de métricas ###

Ejecutar solo uno de los siguientes códigos para elegir qué datos se utilizarán en los entrenamientos siguientes

- Elegir datos filtrados visualmente

In [None]:
# Features: every column of the visually filtered frame except 'name' and the
# target column 'level'.
feature_cols = df_filtrado.columns.values.tolist()
feature_cols.remove('name')
feature_cols.remove('level')

X_df = df_filtrado.loc[:, feature_cols]
y = df_filtrado['level']

# Printed explicitly: a bare `.shape` expression in the middle of a cell is
# never displayed, so the sanity check would otherwise be invisible.
print(X_df.shape, y.shape)

# NumPy views used by the training cells.
X = X_df.to_numpy()
y = y.to_numpy()

- Elegir solo velocidades normalizadas

In [None]:
# Features: every column of the normalized speed frame except the target
# column 'level'.
feature_cols = df_normalized.columns.values.tolist()
feature_cols.remove('level')

X_df = df_normalized.loc[:, feature_cols]
y = df_normalized['level']

# Printed explicitly: a bare `.shape` expression in the middle of a cell is
# never displayed, so the sanity check would otherwise be invisible.
print(X_df.shape, y.shape)

# NumPy views used by the training cells.
X = X_df.to_numpy()
y = y.to_numpy()

- Elegir datos filtrados por Mahalanobis (próximamente)

### Definición de ModelWrapper y entrenamiento KFold ###

In [None]:
from abc import ABC, abstractclassmethod, abstractmethod

import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler

class ModelWrapper(ABC):
    """Interface every model must implement to be evaluated by KFoldTraining.

    All four methods are instance methods and must be overridden; the base
    class itself cannot be instantiated.
    """

    @abstractmethod
    def evaluate(self, x, y):
        """Score the fitted model on ``x`` against the expected targets ``y``.

        Returns:
            A mapping from metric name to value. It must contain at least
            every name returned by ``metrics()``.
        """

    @abstractmethod
    def metrics(self):
        """Return the names of the metrics that are aggregated across folds."""

    @abstractmethod
    def fit(self, x_train, y_train):
        """Train the wrapped model on the given training data."""

    @abstractmethod
    def remake(self):
        """Replace the wrapped model with a fresh, unfitted instance."""
        
def KFoldTraining(model, X, y, n_splits=10, n_repeats=3, random_state=None):
    """Evaluate ``model`` with repeated stratified k-fold cross-validation.

    For every fold the features are standardized with a scaler fitted on the
    training part only, the model is rebuilt through ``model.remake()`` and
    fitted, and both the training and the testing scores are printed. After
    the last fold, the mean and standard deviation of every testing metric
    are printed.

    Args:
        model: object exposing ``metrics()``, ``remake()``, ``fit(x, y)`` and
            ``evaluate(x, y)``; ``evaluate`` must return a mapping containing
            every name listed by ``metrics()``.
        X: feature array, indexable with the index arrays produced by the
            splitter.
        y: target array aligned with ``X``; also used to stratify the folds.
        n_splits: number of folds per repetition (default 10).
        n_repeats: number of repetitions (default 3).
        random_state: seed forwarded to ``RepeatedStratifiedKFold``. ``None``
            (the default) keeps the splits different on every call; pass an
            integer to make the evaluation reproducible.

    Returns:
        None. All results are printed.
    """
    cv = RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )

    # Ask the model for its metric names once and reuse them everywhere.
    metric_names = list(model.metrics())
    metrics = {metric: [] for metric in metric_names}

    for fold_number, (train, test) in enumerate(cv.split(X, y), start=1):
        x_train, y_train = X[train], y[train]
        x_test, y_test = X[test], y[test]

        # The scaler only sees the training part, so no statistics from the
        # testing part leak into the fit.
        sd = StandardScaler()
        x_train = sd.fit_transform(x_train)
        x_test = sd.transform(x_test)

        # Start every fold from an unfitted model.
        model.remake()
        model.fit(x_train, y_train)

        print("------------------------------------------")
        print("FOLD NUMBER " + str(fold_number))

        training_result = model.evaluate(x_train, y_train)
        print("Training evaluate results")
        print(training_result)

        testing_result = model.evaluate(x_test, y_test)
        print("Testing evaluate results")
        print(testing_result)

        # Only the testing scores take part in the final summary.
        for metric in metric_names:
            metrics[metric].append(testing_result[metric])

        print("------------------------------------------")
        print("")

    for metric in metric_names:
        print('%s: %.3f (%.3f)' % (metric, np.mean(metrics[metric]), np.std(metrics[metric])))

### Entrenamiento con LinearRegression ###

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

class LinearRegressionWrapper(ModelWrapper):
    """ModelWrapper around scikit-learn's LinearRegression with regression scores."""

    def __init__(self):
        self.model = LinearRegression()

    def remake(self):
        """Swap the wrapped estimator for a fresh, unfitted LinearRegression."""
        self.model = LinearRegression()

    def fit(self, x_train, y_train):
        """Fit the wrapped estimator on the training data."""
        self.model.fit(x_train, y_train)

    def metrics(self):
        """Names of the scores produced by ``evaluate``."""
        return ['MAE', 'MSE', 'R^2']

    def evaluate(self, x, y):
        """Predict on ``x`` and score the predictions against ``y``.

        Returns:
            dict with the keys 'MAE', 'MSE' and 'R^2'.
        """
        predicted = self.model.predict(x)
        # `metrics` here is the module-level sklearn.metrics import, not the
        # method of the same name defined on this class.
        scorers = (
            ('MAE', metrics.mean_absolute_error),
            ('MSE', metrics.mean_squared_error),
            ('R^2', metrics.r2_score),
        )
        return {label: scorer(y, predicted) for label, scorer in scorers}

# Cross-validate the plain linear regression on the X / y arrays chosen in the metric-selection cells.
KFoldTraining(LinearRegressionWrapper(), X, y)

### Entrenamiento con SelectFromModel para LinearRegression ###

In [None]:
from sklearn.feature_selection import SelectFromModel

class SLFLinearRegressionWrapper(LinearRegressionWrapper):
    """LinearRegression trained only on the features kept by SelectFromModel.

    ``self.model`` is the SelectFromModel selector and ``self.regressor`` is a
    second LinearRegression fitted on the selected columns. Predicting with
    the selector's inner ``estimator_`` instead would use every feature,
    because that estimator is fitted on the full input, and would therefore
    reproduce the scores of the plain LinearRegressionWrapper.
    """

    def __init__(self):
        self.remake()

    def fit(self, x_train, y_train):
        """Fit the selector, then fit the regressor on the selected columns."""
        self.model.fit(x_train, y_train)
        self.regressor.fit(self.model.transform(x_train), y_train)

    def evaluate(self, x, y):
        """Score the regressor on the selected columns of ``x`` against ``y``.

        Returns:
            dict with the keys 'MAE', 'MSE' and 'R^2'.
        """
        y_pred = self.regressor.predict(self.model.transform(x))
        return {
            'MAE': metrics.mean_absolute_error(y, y_pred),
            'MSE': metrics.mean_squared_error(y, y_pred),
            'R^2': metrics.r2_score(y, y_pred)
        }

    def remake(self):
        """Replace both the selector and the regressor with unfitted instances."""
        self.model = SelectFromModel(LinearRegression())
        self.regressor = LinearRegression()

# Cross-validate the SelectFromModel-based linear regression on the X / y arrays chosen in the metric-selection cells.
KFoldTraining(SLFLinearRegressionWrapper(), X, y)

### Entrenamiento con DecisionTreeClassifier ###

In [None]:
from sklearn import tree


class DecisionTreeClassifierWrapper(ModelWrapper):
    """ModelWrapper around scikit-learn's DecisionTreeClassifier.

    Args:
        random_state: optional seed forwarded to the classifier every time it
            is (re)built. ``None`` (the default) leaves the tree unseeded.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        self.remake()

    def evaluate(self, x, y):
        """Predict on ``x`` and score the predictions against ``y``.

        Returns:
            dict with micro-averaged precision / recall / F1, the accuracy and
            the confusion matrix (a 2-D array, reported per fold only).
        """
        # NOTE(review): for single-label targets the micro-averaged precision,
        # recall and F1 coincide with the accuracy; consider average='macro'
        # if per-class performance matters.
        y_pred = self.model.predict(x)
        return {
            'accuracy_score': metrics.accuracy_score(y, y_pred),
            'precision_score': metrics.precision_score(y, y_pred, average='micro'),
            'recall_score': metrics.recall_score(y, y_pred, average='micro'),
            'f1_score': metrics.f1_score(y, y_pred, average='micro'),
            'confusion_matrix': metrics.confusion_matrix(y, y_pred)
        }

    def metrics(self):
        """Names of the scalar scores that are averaged across folds.

        The confusion matrix is left out on purpose: it is a 2-D array, so a
        mean / standard deviation over the folds is not a meaningful score,
        and its shape may differ between folds.
        """
        return ['accuracy_score', 'precision_score', 'recall_score', 'f1_score']

    def fit(self, x_train, y_train):
        """Fit the wrapped classifier on the training data."""
        self.model.fit(x_train, y_train)

    def remake(self):
        """Replace the wrapped classifier with a fresh, unfitted instance."""
        self.model = tree.DecisionTreeClassifier(random_state=self.random_state)

# Cross-validate the decision-tree classifier on the X / y arrays chosen in the metric-selection cells.
KFoldTraining(DecisionTreeClassifierWrapper(), X, y)

### Pruebas de feature selection ###

In [None]:
# Fit a plain LinearRegression on the whole X / y selection (no cross-validation).
m = LinearRegression()
m.fit(X,y)
# NOTE(review): the disabled call below refers to an `evaluate` helper that is
# not defined anywhere in the visible notebook.
#evaluate(m, X, y)


In [None]:
# https://scikit-learn.org/stable/modules/feature_selection.html

from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation around a LinearRegression,
# fitted on the whole X / y selection. Rebinds `m`, which the next cells inspect.
m = RFECV(LinearRegression())
m.fit(X,y)
# NOTE(review): the disabled call below refers to an `evaluate` helper that is
# not defined anywhere in the visible notebook.
#evaluate(m, X, y)

In [None]:
# Mask of the features kept by the selector currently bound to `m`
# (the RFECV from the previous cell when the cells are run in order).
m.support_

In [None]:
# Feature ranking computed by the selector currently bound to `m`
# (the RFECV fitted above when the cells are run in order).
m.ranking_

In [None]:
# Integer positions (rather than a boolean mask) of the selected features.
m.get_support(indices=True)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Fit SelectFromModel around a LinearRegression on the whole X / y selection.
m = SelectFromModel(LinearRegression())
m.fit(X,y)

# Printed explicitly: a notebook only displays the last expression of a cell,
# so these intermediate results would otherwise be silently discarded.
print(m.transform(X).shape)
print(m.get_support())

# Names of the columns the selector keeps (cell output).
column_names = X_df.columns[m.get_support()]
column_names

In [None]:
# Fit SelectFromModel around a DecisionTreeClassifier on the whole X / y selection.
# NOTE(review): the tree is built without a random_state, so the selected
# columns may vary between runs.
m = SelectFromModel(tree.DecisionTreeClassifier())
m.fit(X,y)

# Printed explicitly: a notebook only displays the last expression of a cell,
# so these intermediate results would otherwise be silently discarded.
print(m.transform(X).shape)
print(m.get_support())

# Names of the columns the selector keeps (cell output).
column_names = X_df.columns[m.get_support()]
column_names