# MARATONA BEHIND THE CODE 2020

## DESAFIO 6 - ANAHUAC

## Installing Libs

In [None]:
!pip install scikit-learn --upgrade
!pip install seaborn --upgrade

<hr>

## Loading the .csv dataset from GitHub

In [147]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download ForTraining.csv into the working directory, load it with pandas and
# preview its first rows.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# this download — confirm it is really needed in the target environment.
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/vanderlei-test/dataset2/master/datasets/ForTraining.csv
df_base_for_training = pd.read_csv(r'ForTraining.csv')
df_base_for_training.head()

In [None]:
# Download OrdenMaterias.csv into the working directory, load it with pandas and
# preview its first rows.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# this download — confirm it is really needed in the target environment.
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/vanderlei-test/dataset2/master/datasets/OrdenMaterias.csv
df_orden_materias = pd.read_csv(r'OrdenMaterias.csv')
df_orden_materias.head()

In [None]:
# Download TablaConexiones.csv into the working directory, load it with pandas
# and preview its first rows.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# this download — confirm it is really needed in the target environment.
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/vanderlei-test/dataset2/master/datasets/TablaConexiones.csv
df_tabla_conexiones = pd.read_csv(r'TablaConexiones.csv')
df_tabla_conexiones.head()

In [None]:
# Download TablaTareas.csv into the working directory, load it with pandas and
# preview its first rows.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# this download — confirm it is really needed in the target environment.
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/vanderlei-test/dataset2/master/datasets/TablaTareas.csv
df_tabla_tareas = pd.read_csv(r'TablaTareas.csv')
df_tabla_tareas.head()

## Uniendo DataFrames en Pandas

In [None]:
# Build the working table by inner-joining the training table with the homework
# table and then with the connections table. Both joins use the composite key
# (``studentId``, ``ciclo``), so only key pairs present in all three tables remain.
df = (
    df_base_for_training
    .merge(df_tabla_tareas, how='inner', on=['studentId', 'ciclo'])
    .merge(df_tabla_conexiones, how='inner', on=['studentId', 'ciclo'])
)

df.tail()

## Pre-procesando el dataset antes de entrenar

In [None]:
# Show the number of missing values per column of the merged table, before any
# rows are dropped.
print(f"Valores nulos antes de la transformación DropNA: \n\n{df.isna().sum()}\n")

In [155]:
# Keep only the rows that have a value in ``Graduado``; rows where it is NaN are discarded.
df2 = df[df['Graduado'].notna()]

In [None]:
# Show the number of missing values per column that remain after the row drop,
# i.e. what the imputer will have to fill.
print(f"Valores nulos antes de la transformación SimpleImputer: \n\n{df2.isna().sum()}\n")

### Procesando valores NaN con SimpleImputer de sklearn

In [157]:
# Imputer that replaces every NaN with the constant 0.
# The `verbose` keyword is deliberately not passed: scikit-learn deprecated it in
# 1.1 and removed it in 1.3, so passing it raises TypeError on current releases.
# Omitting it keeps the former default behaviour on older releases.
impute_zeros = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=0,
    copy=True
)

In [None]:
# Missing values per column going into the imputer.
print(f"Valores nulos antes de transformación SimpleImputer: \n\n{df2.isna().sum()}\n")

# Fit the imputer on the table and immediately apply it. The imputer returns a
# plain array, so the DataFrame is rebuilt with the original column labels.
df2 = pd.DataFrame.from_records(
    data=impute_zeros.fit(X=df2).transform(X=df2),
    columns=df2.columns
)

# Missing values per column coming out of the imputer.
print(f"Valores nulos del dataset despues de la transformación SimpleImputer: \n\n{df2.isna().sum()}\n")

### Manejando variables categóricas

In [159]:
class ScalerOrdenMaterias(BaseEstimator, TransformerMixin):
    """Rewrite every column after the first of a table as a numeric score.

    Each row is processed independently, left to right, starting at column 1
    (column 0 is never modified):

    * a cell equal to ``'Sin clase'`` becomes ``1``;
    * any other cell receives the row's current score, which starts at ``1`` and
      is then lowered by ``1 / (number_of_columns - number_of_'Sin clase'_cells)``.

    The previous cell is also compared with ``'Sin clase'``, but it has already
    been rewritten by the time it is read, so that check can only trigger for
    column 1 (when column 0 itself holds ``'Sin clase'``).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new object-dtype DataFrame with the scored rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Table to score. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Same shape, column labels and index as ``X``.
        """
        total_cicles = X.shape[1]
        scored_rows = []

        # Rows are addressed by position and collected into a fresh frame. The
        # result therefore does not depend on `.values` handing back a writable
        # view of the frame (it is read-only under pandas Copy-on-Write), nor on
        # the index being 0..n-1.
        for position in range(X.shape[0]):
            modified_row = X.iloc[position].tolist()

            without_class = modified_row.count('Sin clase')
            remaining = total_cicles - without_class
            # When every cell is 'Sin clase' the decrement is never used; avoid 1/0.
            decreased_possibility = 1 / remaining if remaining else 0

            possibility_of_passing = 1
            for cicle in range(1, total_cicles):
                if modified_row[cicle] == 'Sin clase' or modified_row[cicle - 1] == 'Sin clase':
                    modified_row[cicle] = 1
                else:
                    modified_row[cicle] = possibility_of_passing
                    possibility_of_passing -= decreased_possibility

            scored_rows.append(modified_row)

        return pd.DataFrame(scored_rows, columns=X.columns, index=X.index, dtype=object)

class cicleProbability(BaseEstimator, TransformerMixin):
    """Replace a column of ``X`` with values looked up in a second table ``Y``.

    Parameters
    ----------
    columns : sequence of two labels
        ``columns[0]`` is a column present in both ``X`` and ``Y`` and is used to
        pick the matching row of ``Y``. ``columns[1]`` is the column of ``X``
        whose current values name the column of ``Y`` to read; it is overwritten
        with the values found.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, Y):
        """Return a copy of ``X`` with ``columns[1]`` replaced by looked-up values.

        For each row of ``X`` the first row of ``Y`` whose ``columns[0]`` value
        matches is selected, and the cell under the column named by the row's
        ``columns[1]`` value is taken.

        Raises
        ------
        IndexError
            If no row of ``Y`` matches a row of ``X``.
        KeyError
            If a ``columns[1]`` value of ``X`` is not a column of ``Y``.
        """
        key_column, cicle_column = self.columns[0], self.columns[1]
        data = X.copy()

        # Iterate over the values themselves rather than `.loc[0..n-1]`, so the
        # lookup does not depend on X having a default integer index.
        new_cicle = [
            Y.loc[Y[key_column] == key, cicle].values[0]
            for key, cicle in zip(data[key_column], data[cicle_column])
        ]

        # Attach the results using X's own index; a Series with a default index
        # would be aligned by label and could produce NaN or shuffled values.
        data[cicle_column] = pd.Series(new_cicle, index=data.index)
        return data

class compressHomework(BaseEstimator, TransformerMixin):
    """Add a ``'%_tareas_entregadas'`` column holding the ratio of two columns.

    Parameters
    ----------
    columns : sequence of two labels
        ``columns[0]`` is the numerator column and ``columns[1]`` the denominator.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` extended with the ratio column."""
        numerator, denominator = self.columns[0], self.columns[1]
        ratio = X[numerator] / X[denominator]
        return X.assign(**{'%_tareas_entregadas': ratio})

In [None]:
# Score the subject-order table.
# `transform` already returns a DataFrame with the input's column labels, so its
# result is used directly instead of being passed through
# `DataFrame.from_records`, which is deprecated for DataFrame input since pandas 2.1.
# `reset_index(drop=True)` reproduces the fresh 0..n-1 index that the rebuild gave.
new_scaler = ScalerOrdenMaterias()

df_new_orden_materias = (
    new_scaler
    .fit(X=df_orden_materias)
    .transform(X=df_orden_materias)
    .reset_index(drop=True)
)

df_new_orden_materias.tail()

In [None]:
# Two feature-engineering steps applied in sequence:
#   1. replace ``ciclo`` with the value looked up in the scored subject-order
#      table (rows matched on ``reducido``);
#   2. add ``%_tareas_entregadas`` = Tareas_Puntuales / Total_Tareas.
cicle_transform = cicleProbability(columns=['reducido','ciclo'])
compress_tareas = compressHomework(columns=['Tareas_Puntuales','Total_Tareas'])

cicle_transform.fit(X=df2)
compress_tareas.fit(X=df2)

# Both transformers return DataFrames that already carry every column, so the
# result is used as-is. Passing a DataFrame to `DataFrame.from_records` is
# deprecated since pandas 2.1; `reset_index(drop=True)` keeps the 0..n-1 index
# that the former rebuild produced.
df3 = compress_tareas.transform(
    X=cicle_transform.transform(
        X=df2,
        Y=df_new_orden_materias
    )
).reset_index(drop=True)

df3.head()

### Eliminando columnas no deseadas

In [None]:
# Preview the last rows of the table before any column is removed.
df3.tail()

In [163]:
# Remove the two identifier-like columns and the four raw homework counters;
# `drop` returns a new frame, so `df3` itself is left untouched.
df4 = df3.drop(
    columns=[
        'studentId', 'reducido',
        'Tareas_Puntuales', 'Tareas_No_Entregadas', 'Tareas_Retrasadas', 'Total_Tareas',
    ]
)

In [None]:
# Preview the last rows of the table after the column removal.
df4.tail()

<hr>

## Entrenando un clasificador basado en un Bosque Aleatorio (Random Forest)

### Seleccionando FEATURES y definiendo la variable TARGET

In [None]:
# List the columns available for choosing the features and the target.
df4.columns

In [166]:
# Model inputs: the six columns listed here. Model output: the ``Graduado`` column.
features = df4.loc[
    :,
    [
        'ciclo',
        'Calificacion_Promedio',
        'Dias_Conectado',
        'Minutos_Promedio',
        'Minutos_Total',
        '%_tareas_entregadas',
    ],
]
target = df4.loc[:, 'Graduado']

### Dividiendo nuestro dataset en set de Entrenamiento y Pruebas

In [167]:
# Hold out 30% of the rows for evaluation; the fixed `random_state` makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=133)

### Entrenando un modelo ``RandomForestClassifier()``

In [168]:
# Train the classifier. Despite the variable name `dtc`, the estimator is a
# random forest of 1000 trees, each limited to depth 8.
#
# Two keywords differ from the historical call so that it also runs on current
# scikit-learn releases without changing the model:
#   * `max_features='sqrt'` — for classifiers this is what 'auto' meant; the
#     'auto' spelling was removed in scikit-learn 1.3.
#   * `min_impurity_split` is no longer passed — it was removed in scikit-learn
#     1.0, and its former value here (None) was the default.
dtc = RandomForestClassifier(bootstrap=True,
                             ccp_alpha=0.0,
                             class_weight=None,
                             criterion='gini',
                             max_depth=8,
                             max_features='sqrt',
                             max_leaf_nodes=None,
                             max_samples=None,
                             min_impurity_decrease=0.0,
                             min_samples_leaf=2,
                             min_samples_split=2,
                             min_weight_fraction_leaf=0.0,
                             n_estimators=1000,
                             n_jobs=None,
                             oob_score=False,
                             random_state=0,
                             verbose=0,
                             warm_start=False).fit(X_train, y_train)

### Haciendo predicciones del Sample Test

In [169]:
# Predict the target for the held-out rows; compared against `y_test` below.
y_pred = dtc.predict(X_test)

### Analice la calidad del modelo a través de la matriz de confusión

In [None]:
# Confusion matrix of the held-out predictions, drawn as an annotated heat map.
# scikit-learn puts true labels on the rows and predicted labels on the columns,
# so for two classes the flattened order is TN, FP, FN, TP. The metrics below
# take the class at index 1 as the positive class, and the cell captions follow
# that same convention.
cf_matrix = confusion_matrix(y_test, y_pred)
group_names = ['Verdadero Negativo', 'Falso Positivo', 'Falso Negativo', 'Verdadero Positivo']
group_counts = ['{0:0.0f}'.format(value) for value in cf_matrix.flatten()]
group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2,2)  # assumes a binary target

# Summary metrics derived from the matrix.
accuracy  = np.trace(cf_matrix) / float(np.sum(cf_matrix))
precision = cf_matrix[1,1] / cf_matrix[:,1].sum()   # TP / (TP + FP)
recall    = cf_matrix[1,1] / cf_matrix[1,:].sum()   # TP / (TP + FN)
f1_score  = 2*precision*recall / (precision + recall)

sns.heatmap(cf_matrix, annot=labels, fmt="")
# All four metrics are printed with three decimals.
stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(accuracy, precision, recall, f1_score)
plt.ylabel('True label')
plt.xlabel('Predicted label' + stats_text)
plt.show()

<hr>

## Scoring de la data requerida para hacer la entrega de la solución

In [None]:
# Download ToBePredicted.csv into the working directory, load it with pandas and
# preview its last rows.
# NOTE(review): `--no-check-certificate` disables TLS certificate validation for
# this download — confirm it is really needed in the target environment.
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/vanderlei-test/dataset2/master/for_submission/ToBePredicted.csv
df_to_be_predicted = pd.read_csv(r'ToBePredicted.csv')
df_to_be_predicted.tail()

In [None]:
# Join the rows to be scored with the homework and connections tables on the
# composite key (``studentId``, ``ciclo``).
# NOTE(review): these are inner joins, so a row of `df_to_be_predicted` with no
# match in either table is dropped and gets no prediction — confirm that every
# row to be submitted is expected to have a match.
df = (
    df_to_be_predicted
    .merge(df_tabla_tareas, how='inner', on=['studentId', 'ciclo'])
    .merge(df_tabla_conexiones, how='inner', on=['studentId', 'ciclo'])
)

# Pre-processing
## Imputation: refit the constant imputer on this table and fill NaN with 0.
## The imputer returns an array, so the DataFrame is rebuilt with the column labels.
impute_zeros.fit(X=df)

df2 = pd.DataFrame.from_records(
    data=impute_zeros.transform(
        X=df
    ),
    columns=df.columns
)

## Categorical handling: same two transformers that were applied to the training table.
cicle_transform.fit(X=df2)
compress_tareas.fit(X=df2)

# The transformers already return DataFrames with every column, so the result is
# used directly; passing a DataFrame to `DataFrame.from_records` is deprecated
# since pandas 2.1. `reset_index(drop=True)` keeps the 0..n-1 index the former
# rebuild produced.
df3 = compress_tareas.transform(
    X=cicle_transform.transform(
        X=df2,
        Y=df_new_orden_materias
    )
).reset_index(drop=True)

# Remove the identifier-like columns and the raw homework counters.
df4 = df3.drop(columns=['studentId', 'reducido', 'Tareas_Puntuales', 'Tareas_No_Entregadas', 'Tareas_Retrasadas', 'Total_Tareas'])

df4.tail()

In [None]:
# Predict with the trained model, passing the same six feature columns, in the
# same order, that were selected for training; then print the predicted labels.
y_pred = dtc.predict(df4[
    [
       'ciclo', 'Calificacion_Promedio', 'Dias_Conectado',
       'Minutos_Promedio', 'Minutos_Total', '%_tareas_entregadas'
    ]
])
print(y_pred)