In [None]:
#!pip install pandas_profiling

In [43]:
# Imports
import warnings

warnings.simplefilter(action='ignore')

from copy import copy, deepcopy
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import LeaveOneOut, cross_val_score, train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, make_scorer, f1_score, recall_score, matthews_corrcoef, precision_score, classification_report, confusion_matrix
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
import pandas_profiling
import category_encoders as ce
import xgboost as xgb

## 1. Preparing the data

In [2]:
#Importing the data
CSD2016 = pd.read_csv('CDS_2016_va', encoding = 'latin-1')
CSD2017 = pd.read_csv('CDS_2017_va', encoding = 'latin-1')
CSD2018 = pd.read_csv('CDS_2018_va', encoding = 'latin-1')
CSD2019 = pd.read_csv('CDS_2019_NO_LABEL', encoding = 'latin-1')
clientes = pd.read_csv("Clientes.csv")

In [3]:
#Renaming columns for all dataframes
CSD2016.rename(columns={'Año Natural': 'Año_Natural', 'Tipo Material Educativo': 'Tipo_Material_Educativo', 'Grupo Editorial': 'Grupo_Editorial', 'Tipo Soporte Actual': 'Tipo_Soporte_Actual', 'Variable 1': 'Variable_1', 'Variable 2': 'Variable_2'}, inplace=True)
CSD2019.rename(columns={'Año natural': 'Año_Natural', 'Tipo Material Educativo': 'Tipo_Material_Educativo', 'Grupo Editorial': 'Grupo_Editorial', 'Tipo Soporte Actual': 'Tipo_Soporte_Actual', 'Variable1': 'Variable_1', 'Variable2': 'Variable_2'}, inplace=True)
CSD2017.columns = CSD2016.columns
CSD2018.columns = CSD2016.columns
clientes.rename(columns={'Comunidad Autónoma': 'Comunidad_Autónoma'}, inplace=True)

In [4]:
#Cleaning variables from 2018 dataset
CSD2018["Curso"] = CSD2018["Curso"].str.replace("c","")
CSD2018 = CSD2018.replace({"Año_Natural": 18}, {"Año_Natural": 2018}, regex=True)
CSD2018["Curso"] = CSD2018["Curso"].astype('int64')
CSD2019["Variable_2"] = CSD2019["Variable_2"] * 100

In [5]:
#Creating a unique identifier for each record
CSD2016["Unique_Id"] = (CSD2016["Id_Cliente"].astype(str) + CSD2016["Curso"].astype(str) + CSD2016["Asignatura"].astype(str) + CSD2016["Tipo_Material_Educativo"].astype(str) + CSD2016["Lengua"].astype(str) + CSD2016["Tipo_Soporte_Actual"].astype(str))
CSD2017["Unique_Id"] = (CSD2017["Id_Cliente"].astype(str) + CSD2017["Curso"].astype(str) + CSD2017["Asignatura"].astype(str) + CSD2017["Tipo_Material_Educativo"].astype(str) + CSD2017["Lengua"].astype(str) + CSD2017["Tipo_Soporte_Actual"].astype(str))
CSD2018["Unique_Id"] = (CSD2018["Id_Cliente"].astype(str) + CSD2018["Curso"].astype(str) + CSD2018["Asignatura"].astype(str) + CSD2018["Tipo_Material_Educativo"].astype(str) + CSD2018["Lengua"].astype(str) + CSD2018["Tipo_Soporte_Actual"].astype(str))
CSD2019["Unique_Id"] = (CSD2019["Id_Cliente"].astype(str) + CSD2019["Curso"].astype(str) + CSD2019["Asignatura"].astype(str) + CSD2019["Tipo_Material_Educativo"].astype(str) + CSD2019["Lengua"].astype(str) + CSD2019["Tipo_Soporte_Actual"].astype(str))

In [6]:
#Checking for 2016
CSD2016.duplicated().sum()

0

In [7]:
#Checking for 2017
CSD2017.duplicated().sum()

30712

In [8]:
#Checking for 2018
CSD2018.duplicated().sum()

0

In [9]:
#Checking for 2019
CSD2019.duplicated().sum()

0

In [10]:
#Dropping duplicates
CSD2017.sort_values("Unique_Id", inplace=True)
CSD2017 = CSD2017.drop_duplicates()

In [11]:
#Changing Grupo Editiorial as string
CSD2016["Grupo_Editorial"] = CSD2016["Grupo_Editorial"].astype("str")
CSD2017["Grupo_Editorial"] = CSD2017["Grupo_Editorial"].astype("str")
CSD2018["Grupo_Editorial"] = CSD2018["Grupo_Editorial"].astype("str")

In [12]:
#Defining a function to return the number of years of the course
def courseyears(s):
    if (pd.isnull(s["Años_Curso_PY"]) == True):
        return 1
    else:
        return s["Años_Curso_PY"] + 1

In [13]:
#Adding column "Groupo_Editorial" from the previous Year and the Number of year of the course
CSD2016["Años_Curso"] = 1

CSD2016_1 = CSD2016[["Unique_Id","Grupo_Editorial","Años_Curso"]]
CSD2016_1.columns = ["Unique_Id","Grupo_Editorial_PY","Años_Curso_PY"]
CSD2017 = pd.merge(CSD2017, CSD2016_1, how = "left", on='Unique_Id')
CSD2017["Años_Curso"] = CSD2017.apply(courseyears, axis=1)
CSD2017 = CSD2017.drop(["Años_Curso_PY"], axis = 1)

CSD2017_1 = CSD2017[["Unique_Id","Grupo_Editorial","Años_Curso"]]
CSD2017_1.columns = ["Unique_Id","Grupo_Editorial_PY","Años_Curso_PY"]
CSD2018 = pd.merge(CSD2018, CSD2017_1,how = "left", on='Unique_Id')
CSD2018["Años_Curso"] = CSD2018.apply(courseyears, axis=1)
CSD2018 = CSD2018.drop(["Años_Curso_PY"], axis = 1)

CSD2018_1 = CSD2018[["Unique_Id","Grupo_Editorial","Años_Curso"]]
CSD2018_1.columns = ["Unique_Id","Grupo_Editorial_PY","Años_Curso_PY"]
CSD2019 = pd.merge(CSD2019, CSD2018_1, how = "left", on='Unique_Id')
CSD2019["Años_Curso"] = CSD2019.apply(courseyears, axis=1)
CSD2019 = CSD2019.drop(["Años_Curso_PY"], axis = 1)

print("Number of courses in 2016: " + CSD2016["Unique_Id"].count().astype(str))
print("Number of courses in 2017: " + CSD2017["Unique_Id"].count().astype(str))
print("Number of courses in 2018: " + CSD2018["Unique_Id"].count().astype(str))
print("Number of courses in 2019: " + CSD2019["Unique_Id"].count().astype(str))

Number of courses in 2016: 612727
Number of courses in 2017: 615630
Number of courses in 2018: 619854
Number of courses in 2019: 617860


In [14]:
#Concatenating the dataframes
AllCDS = pd.concat([CSD2016, CSD2017, CSD2018], axis = 0, join = 'outer', ignore_index = False)
print("Number of courses in All Years: " + AllCDS["Unique_Id"].count().astype(str))

Number of courses in All Years: 1848211


In [15]:
#Defining function to return the type of change from one year to another 
def changes(df):
    if (pd.isnull(df["Grupo_Editorial_PY"]) == True) and (df["Grupo_Editorial"] == "1") :
        return "New_course_SM"
    elif (pd.isnull(df["Grupo_Editorial_PY"]) == True) and (df["Grupo_Editorial"] == "90") :
        return "New_course_No-Use"
    elif (pd.isnull(df["Grupo_Editorial_PY"]) == True) and (df["Grupo_Editorial"] != "1") and (df["Grupo_Editorial"] != "90"):
        return "New_course_Editorial"
    elif (df["Grupo_Editorial_PY"] == "1" ) and (df["Grupo_Editorial"] == "1" ):
        return "SM_to_SM"
    elif (df["Grupo_Editorial_PY"] != "1" ) and (df["Grupo_Editorial_PY"] != "90" ) and (df["Grupo_Editorial"] != "1") and (df["Grupo_Editorial"] != "90"):
        return "Editorial_to_Editorial"
    elif (df["Grupo_Editorial_PY"] == "90" ) and (df["Grupo_Editorial"] == "90"):
        return "No-Use_to_No-Use"
    elif (df["Grupo_Editorial_PY"] == "1" ) and (df["Grupo_Editorial"] != "1" ) and (df["Grupo_Editorial"] != "90" ):
        return "SM_to_Editorial"
    elif (df["Grupo_Editorial_PY"] != "1" ) and (df["Grupo_Editorial"] == "1" ) and (df["Grupo_Editorial"] != "90" ):
        return "Editorial_to_SM"
    elif (df["Grupo_Editorial_PY"] == "90" ) and (df["Grupo_Editorial"] == "1"):
        return "No-Use_to_SM"
    elif (df["Grupo_Editorial_PY"] == "90" ) and (df["Grupo_Editorial"] != "1") and (df["Grupo_Editorial"] != "90"):
        return "No-Use_to_Editorial"
    elif (df["Grupo_Editorial_PY"] == "1" ) and (df["Grupo_Editorial"] == "90"):
        return "SM_to_No-Use"
    elif (df["Grupo_Editorial_PY"] != "1" ) and (df["Grupo_Editorial_PY"] != "90" ) and (df["Grupo_Editorial"] == "90"):
        return "Editorial_to_No-Use"
    else:
        return "FAIL"

In [16]:
#Creating a new column applying the function already created
AllCDS["Change"] = AllCDS.apply(changes, axis=1)

In [17]:
#Listing the changes from one year to another by type
AllCDS["Change"].value_counts()

Editorial_to_Editorial    572905
New_course_Editorial      478953
No-Use_to_No-Use          291535
New_course_No-Use         228419
SM_to_SM                  105810
New_course_SM             105106
Editorial_to_No-Use        33053
No-Use_to_Editorial        14240
SM_to_No-Use                6926
Editorial_to_SM             5966
SM_to_Editorial             5298
Name: Change, dtype: int64

When looking all years we can observe: 
- 30% of the courses reimain using educational material from one year to another from editorials that are not SM. 
- 25% of the courses are new courses that use material from other editorials. 
- 15% of the courses have not use any material for at least two consequtive years (considering the present year).
- 12% are **new courses that do not use any material.** 
- When **strictly looking the change "To No Use",** only 2.16% of the courses are in that category, **where SM represent the 0.3% of that total.**

**Because of that, for the target variable we are going to define the change "To No-Use" including:**
- Courses with no use of any material for two consequtive years.
- New courses that did not decide to use any material.
The above because capturing those courses implies to capture a business oportunity for SM.

In [18]:
AllCDS[AllCDS["Año_Natural"]==2018]["Change"].value_counts()

Editorial_to_Editorial    286360
No-Use_to_No-Use          153262
New_course_Editorial       53926
SM_to_SM                   51327
New_course_No-Use          29335
Editorial_to_No-Use        15483
New_course_SM              14225
No-Use_to_Editorial         6626
SM_to_No-Use                3334
SM_to_Editorial             3061
Editorial_to_SM             2915
Name: Change, dtype: int64

In [19]:
# Defining a function to return the name of the "Grupo_Editorial"
def grupoeditorialnames(df):
    if (pd.isnull(df["Grupo_Editorial_PY"]) == True):
        return "New_Course"
    elif (df["Grupo_Editorial_PY"] == "1"):
        return "SM"
    elif (df["Grupo_Editorial_PY"] == "90"):
        return "No-Use"
    else:
        return "Editorial"

In [20]:
# Applying the function to both dataframes and creating a new column with the names of the "Grupo_Editorial"
AllCDS["Grupo_Editorial_Nombre"] = AllCDS.apply(grupoeditorialnames, axis=1)
CSD2019["Grupo_Editorial_Nombre"] = CSD2019.apply(grupoeditorialnames, axis=1)

In [21]:
AllCDS["Grupo_Editorial_Nombre"].value_counts()

New_Course    812478
Editorial     609924
No-Use        307775
SM            118034
Name: Grupo_Editorial_Nombre, dtype: int64

In [22]:
CSD2019["Grupo_Editorial_Nombre"].value_counts()

Editorial     304421
No-Use        178203
New_Course     81022
SM             54214
Name: Grupo_Editorial_Nombre, dtype: int64

In [23]:
# Defining the function to return 1 or 0 in the target variable according to the type of change
def targetvariable(df):
    if (df["Change"]) in {"No-Use_to_No-Use","SM_to_No-Use","Editorial_to_No-Use", "New_course_No-Use"}:
        return 1
    else:
        return 0

In [24]:
# Applying the function to create the target variable
AllCDS["Target"] = AllCDS.apply(targetvariable, axis=1)

In [25]:
# Checking the change to no-use for 2017
AllCDS[AllCDS["Año_Natural"] == 2017]["Target"].value_counts()

0    433324
1    182306
Name: Target, dtype: int64

In [26]:
# Checking the change to no-use for 2018
AllCDS[AllCDS["Año_Natural"] == 2018]["Target"].value_counts()

0    421774
1    198080
Name: Target, dtype: int64

In [27]:
# Creating a new column to capture the change of previous year (Target)
CDS2016_2 = AllCDS[AllCDS["Año_Natural"]==2016]

CDS2017_2 = AllCDS[AllCDS["Año_Natural"]==2017]
temp = AllCDS[AllCDS["Año_Natural"]==2016][["Unique_Id", "Target"]]
temp.columns= ["Unique_Id", "Change_PY"]
CDS2017_2 = pd.merge(CDS2017_2, temp, on="Unique_Id", how = "left")
 
CDS2018_2 = AllCDS[AllCDS["Año_Natural"]==2018]
temp = AllCDS[AllCDS["Año_Natural"]==2017][["Unique_Id", "Target"]]
temp.columns= ["Unique_Id", "Change_PY"]
CDS2018_2 = pd.merge(CDS2018_2, temp, on="Unique_Id", how = "left")

AllCDS = pd.concat([CDS2016_2, CDS2017_2, CDS2018_2], axis=0, join='outer', ignore_index=False)

temp = AllCDS[AllCDS["Año_Natural"]==2018][["Unique_Id", "Target"]]
temp.columns= ["Unique_Id", "Change_PY"]
CSD2019 = pd.merge(CSD2019, temp, how = "left", on='Unique_Id')

In [28]:
# Merging the dataframes to add information of the schools
AllData = pd.merge(AllCDS, clientes, on="Id_Cliente", how = "left")
AllCSD2019 = pd.merge(CSD2019, clientes, on="Id_Cliente", how = "left")

In [67]:
# Reordering columns
AllData = AllData[['Unique_Id','Id_Cliente', 'Año_Natural', 'Años_Curso','Curso','Asignatura','Tipo_Material_Educativo','Lengua','Tipo_Soporte_Actual', 'Variable_1','Variable_2','Latitud','Longitud', 'Comunidad_Autónoma', 'Id_Asociación', 'Id_Subasociación', 'Titularidad', 'Grupo_Editorial','Grupo_Editorial_PY', 'Grupo_Editorial_Nombre', 'Change', 'Change_PY','Target']]
AllCSD2019 = AllCSD2019[['Unique_Id','Id_Cliente', 'Año_Natural','Años_Curso','Curso','Asignatura','Tipo_Material_Educativo','Lengua','Tipo_Soporte_Actual', 'Variable_1','Variable_2','Latitud','Longitud', 'Comunidad_Autónoma', 'Id_Asociación', 'Id_Subasociación', 'Titularidad','Grupo_Editorial_PY', 'Grupo_Editorial_Nombre','Change_PY']]

# 2. Creating the Model

### Defining the functions

In [31]:
Classification_results = {}

def evaluates(X_train, X_test, y_train, y_test, estimator,Report=False):
    train_scores=[]
    test_scores=[]

    estimator.fit(X_train, y_train)
    train_metric = cross_val_score(estimator, X_train, y_train, cv=10, scoring="f1")
    train_scores.append(np.median(train_metric))
        
    y_pred = estimator.predict(X_test)
    test_score = f1_score(y_test, y_pred)
    test_scores.append(test_score)
        
    if Report is True:
        print(classification_report(y_test,y_pred))
        print(confusion_matrix(y_test,y_pred))
    
    return train_scores, test_scores

## 2.1. Baseline

In [32]:
#Making a copy of the original data
AllTrain = copy(AllData)

In [33]:
#Dropping some columns
AllTrain = AllTrain.drop(["Unique_Id", "Change", "Grupo_Editorial", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)

In [34]:
#Checking for missing values
AllTrain.isnull().sum()

Año_Natural                      0
Años_Curso                       0
Curso                            0
Asignatura                       0
Tipo_Material_Educativo          0
Lengua                           0
Tipo_Soporte_Actual              0
Variable_1                       0
Variable_2                       0
Latitud                          0
Longitud                         0
Comunidad_Autónoma               0
Id_Subasociación           1399275
Titularidad                      0
Grupo_Editorial_Nombre           0
Change_PY                   812478
Target                           0
dtype: int64

In [35]:
#Replacing missing values in Subasociación
AllTrain["Id_Subasociación"].fillna("901", inplace = True)
AllTrain["Change_PY"].fillna("0", inplace = True)

In [36]:
#Converting some columns to categorical variables
AllTrain["Curso"] = AllTrain["Curso"].astype(str)
AllTrain["Asignatura"] = AllTrain["Asignatura"].astype(str)
AllTrain["Tipo_Material_Educativo"] = AllTrain["Tipo_Material_Educativo"].astype(str)
AllTrain["Lengua"] = AllTrain["Lengua"].astype(str)
AllTrain["Tipo_Soporte_Actual"] = AllTrain["Tipo_Soporte_Actual"].astype(str)
AllTrain["Id_Subasociación"] = AllTrain["Id_Subasociación"].astype(str)
AllTrain["Change_PY"] = AllTrain["Change_PY"].astype(str)

In [37]:
#Dropping data of the first year
AllTrain = AllTrain.drop(AllTrain[AllTrain["Año_Natural"] == 2016].index)

In [38]:
#Binnary encoding
encodeCol = AllTrain.columns[AllTrain.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
AllTrainTemp = Binary.fit_transform(AllTrain[encodeCol])
AllTrain = pd.concat([AllTrain,AllTrainTemp], axis=1)
AllTrain = AllTrain.drop(encodeCol, axis=1)

In [39]:
#Splitting the data
splits = np.array([0.8, 0.2])

#Shuffle your input
AllTrain = AllTrain.sample(frac=1)

#Split into 2 parts
AllTrainTrain, AllTrainTest = np.array_split(
    AllTrain, (splits[:-1].cumsum() * len(AllTrain)).astype(int))

In [40]:
#Creating features and target variables for the train
x_train = AllTrainTrain.loc[:, AllTrainTrain.columns != 'Target']
y_train = AllTrainTrain['Target']

#Creating features and target variables for the test
x_test = AllTrainTest.loc[:, AllTrainTest.columns != 'Target']
y_test = AllTrainTest['Target']

In [41]:
#Transforming to dataframe
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(x_test)
y_test = pd.DataFrame(y_test)

In [42]:
#Defining the model
tree = DecisionTreeClassifier(random_state=0, max_depth=15) 

#Evaluating the model
tree_train, tree_test =evaluates(x_train, x_test, y_train, y_test, tree, Report = True)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95    170769
           1       0.92      0.85      0.88     76328

    accuracy                           0.93    247097
   macro avg       0.93      0.91      0.92    247097
weighted avg       0.93      0.93      0.93    247097

[[164808   5961]
 [ 11325  65003]]


In [44]:
#Getting the best features so we can build on top of that
pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(tree.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:15]

Unnamed: 0,variable,importance
48,Change_PY_2,0.825527
1,Años_Curso,0.041882
22,Tipo_Material_Educativo_2,0.023035
5,Longitud,0.012727
4,Latitud,0.012237
3,Variable_2,0.012109
45,Grupo_Editorial_Nombre_2,0.010425
14,Asignatura_2,0.007384
17,Asignatura_5,0.006456
18,Asignatura_6,0.00528


## 2.2. Outliers

In [45]:
#Defining the baseline
def baselinedataset(data):
    data = copy(AllData)
    data = data.drop(["Unique_Id", "Change", "Grupo_Editorial", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)
    data["Id_Subasociación"].fillna("901", inplace = True)
    data["Change_PY"].fillna("0", inplace = True)
    data["Curso"] = data["Curso"].astype(str)
    data["Asignatura"] = data["Asignatura"].astype(str)
    data["Tipo_Material_Educativo"] = data["Tipo_Material_Educativo"].astype(str)
    data["Lengua"] = data["Lengua"].astype(str)
    data["Tipo_Soporte_Actual"] = data["Tipo_Soporte_Actual"].astype(str)
    data["Id_Subasociación"] = data["Id_Subasociación"].astype(str)
    data["Change_PY"] = data["Change_PY"].astype(str)
    data = data.drop(data[data["Año_Natural"] == 2016].index)
    return data

In [46]:
#Making a copy of the original data
AllTrain = copy(AllData)

In [47]:
#Making the previous transformations
AllTrain = baselinedataset(AllTrain)

In [48]:
#Setting Latitude and Longitude as categorical so they are not detected as outliers
AllTrain["Latitud"] = AllTrain["Latitud"].astype(str)
AllTrain["Longitud"] = AllTrain["Longitud"].astype(str)

In [49]:
#Detecting outliers
Q1 = AllTrain.quantile(0.25)
Q3 = AllTrain.quantile(0.75)
IQR = Q3 - Q1

((AllTrain < (Q1 - 1.5 * IQR)) |(AllTrain > (Q3 + 1.5 * IQR))).sum()

Asignatura                     0
Año_Natural                    0
Años_Curso                     0
Change_PY                      0
Comunidad_Autónoma             0
Curso                          0
Grupo_Editorial_Nombre         0
Id_Subasociación               0
Latitud                        0
Lengua                         0
Longitud                       0
Target                         0
Tipo_Material_Educativo        0
Tipo_Soporte_Actual            0
Titularidad                    0
Variable_1                 64966
Variable_2                 60995
dtype: int64

In [50]:
#Removing outliers
AllTrain = AllTrain[~((AllTrain < (Q1 - 1.5 * IQR)) |(AllTrain > (Q3 + 1.5 * IQR))).any(axis=1)]

In [51]:
#Setting Laitude and Longitud as float again
AllTrain["Latitud"] = AllTrain["Latitud"].astype(float)
AllTrain["Longitud"] = AllTrain["Longitud"].astype(float)

In [52]:
#Binnary encoding
encodeCol = AllTrain.columns[AllTrain.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
AllTrainTemp = Binary.fit_transform(AllTrain[encodeCol])
AllTrain = pd.concat([AllTrain,AllTrainTemp], axis=1)
AllTrain = AllTrain.drop(encodeCol, axis=1)

In [53]:
#Splitting the data
splits = np.array([0.8, 0.2])

#Shuffle your input
AllTrain = AllTrain.sample(frac=1)

#Split into 2 parts
AllTrainTrain, AllTrainTest = np.array_split(
    AllTrain, (splits[:-1].cumsum() * len(AllTrain)).astype(int))

In [54]:
#Creating features and target variables for the train
x_train = AllTrainTrain.loc[:, AllTrainTrain.columns != 'Target']
y_train = AllTrainTrain['Target']

#Creating features and target variables for the test
x_test = AllTrainTest.loc[:, AllTrainTest.columns != 'Target']
y_test = AllTrainTest['Target']

In [55]:
#Transforming to dataframe
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(x_test)
y_test = pd.DataFrame(y_test)

In [56]:
#Defining the model
tree = DecisionTreeClassifier(random_state=0, max_depth=15) 

#Evaluating the model
tree_train, tree_test =evaluates(x_train, x_test, y_train, y_test, tree, Report = True)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95    156992
           1       0.92      0.86      0.89     72832

    accuracy                           0.93    229824
   macro avg       0.93      0.91      0.92    229824
weighted avg       0.93      0.93      0.93    229824

[[151219   5773]
 [ 10369  62463]]


In [57]:
#Getting the best features so we can build on top of that
pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(tree.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:15]

Unnamed: 0,variable,importance
48,Change_PY_2,0.819346
1,Años_Curso,0.043671
22,Tipo_Material_Educativo_2,0.02332
5,Longitud,0.013707
4,Latitud,0.012774
3,Variable_2,0.010782
45,Grupo_Editorial_Nombre_2,0.010651
19,Asignatura_7,0.007132
17,Asignatura_5,0.006487
15,Asignatura_3,0.006469


## 2.3. Feature Engineering

In [58]:
#Defining the baseline
def outliersdataset(data):
    data = copy(AllData)
    data = data.drop(["Unique_Id", "Change", "Grupo_Editorial", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)
    data["Id_Subasociación"].fillna("901", inplace = True)
    data["Change_PY"].fillna("0", inplace = True)
    data["Curso"] = data["Curso"].astype(str)
    data["Asignatura"] = data["Asignatura"].astype(str)
    data["Tipo_Material_Educativo"] = data["Tipo_Material_Educativo"].astype(str)
    data["Lengua"] = data["Lengua"].astype(str)
    data["Tipo_Soporte_Actual"] = data["Tipo_Soporte_Actual"].astype(str)
    data["Id_Subasociación"] = data["Id_Subasociación"].astype(str)
    data["Change_PY"] = data["Change_PY"].astype(str)
    data = data.drop(data[data["Año_Natural"] == 2016].index)
    data["Latitud"] = data["Latitud"].astype(str)
    data["Longitud"] = data["Longitud"].astype(str)
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    data = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]
    data["Latitud"] = data["Latitud"].astype(float)
    data["Longitud"] = data["Longitud"].astype(float)
    return data

In [59]:
#Making a copy of the original data
AllTrain = copy(AllData)

In [60]:
#Making the previous transformations
AllTrain = outliersdataset(AllTrain)

### 2.3.1. Creating new columns

In [61]:
#Creating a function to return categories of the courses
def gruposcurso(df):    
    if np.isin(df["Curso"], ["20", "21", "22", "23", "24","25"]):
            return "Infantil"
    elif np.isin(df["Curso"], ["26", "27", "28", "29", "30", "31"]):
            return "Primaria"
    elif np.isin(df["Curso"], ["32", "33", "34", "35"]):
            return "Secundaria"
    elif np.isin(df["Curso"], ["36", "37"]):
            return "Bachillerato"
    else:
         return "Misc"

In [62]:
#Creating a new column with the groups of Curso
AllTrain["Grupo_Curso"] = AllTrain.apply(gruposcurso, axis=1)

In [63]:
#Creating a new column Variable 3 using Variable 1 and 2 to get the value of each 
AllTrain["Variable_3"] = AllTrain['Variable_2'] / AllTrain['Variable_1']

In [64]:
#Checking for nulls in the new variable
AllTrain.isnull().sum()

Año_Natural                   0
Años_Curso                    0
Curso                         0
Asignatura                    0
Tipo_Material_Educativo       0
Lengua                        0
Tipo_Soporte_Actual           0
Variable_1                    0
Variable_2                    0
Latitud                       0
Longitud                      0
Comunidad_Autónoma            0
Id_Subasociación              0
Titularidad                   0
Grupo_Editorial_Nombre        0
Change_PY                     0
Target                        0
Grupo_Curso                   0
Variable_3                 3372
dtype: int64

In [65]:
#Replacing outliers for 0
AllTrain["Variable_3"].fillna(0, inplace = True)

In [66]:
#Binnary encoding
encodeCol = AllTrain.columns[AllTrain.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
AllTrainTemp = Binary.fit_transform(AllTrain[encodeCol])
AllTrain = pd.concat([AllTrain,AllTrainTemp], axis=1)
AllTrain = AllTrain.drop(encodeCol, axis=1)

In [68]:
#Splitting the data
splits = np.array([0.8, 0.2])

#Shuffle your input
AllTrain = AllTrain.sample(frac=1)

#Split into 2 parts
AllTrainTrain, AllTrainTest = np.array_split(
    AllTrain, (splits[:-1].cumsum() * len(AllTrain)).astype(int))

In [69]:
#Creating features and target variables for the train
x_train = AllTrainTrain.loc[:, AllTrainTrain.columns != 'Target']
y_train = AllTrainTrain['Target']

#Creating features and target variables for the test
x_test = AllTrainTest.loc[:, AllTrainTest.columns != 'Target']
y_test = AllTrainTest['Target']

In [70]:
#Defining the model
tree = DecisionTreeClassifier(random_state=0, max_depth=15) 

#Evaluating the model
tree_train, tree_test =evaluates(x_train, x_test, y_train, y_test, tree, Report = True)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95    156364
           1       0.92      0.86      0.89     73460

    accuracy                           0.93    229824
   macro avg       0.93      0.91      0.92    229824
weighted avg       0.93      0.93      0.93    229824

[[151205   5159]
 [ 10629  62831]]


In [71]:
#Getting the best features so we can build on top of that
pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(tree.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:15]

Unnamed: 0,variable,importance
49,Change_PY_2,0.815985
1,Años_Curso,0.042878
23,Tipo_Material_Educativo_2,0.022936
6,Variable_3,0.017269
4,Latitud,0.014024
5,Longitud,0.012097
15,Asignatura_2,0.011321
46,Grupo_Editorial_Nombre_2,0.010534
3,Variable_2,0.005989
27,Tipo_Soporte_Actual_0,0.005053


### 2.3.2. Optimizing the depth of the tree

In [72]:
min_depth = 1
max_depth = 30
parameters = {'max_depth':range(min_depth, max_depth)}
grid_tree = GridSearchCV(DecisionTreeClassifier(), scoring='f1', 
                         param_grid=parameters)
grid_tree.fit(x_train, y_train)
best_tree = grid_tree.best_estimator_


print('Optimal F1: {:.4f}, for max_depth={}'.format(
    grid_tree.best_score_, grid_tree.best_params_['max_depth']))

Optimal F1: 0.8901, for max_depth=19


In [73]:
#Defining the model with the optimal depth
tree = DecisionTreeClassifier(random_state=0, max_depth=19) 

#Evaluating the model
tree_train, tree_test =evaluates(x_train, x_test, y_train, y_test, tree, Report = True)

              precision    recall  f1-score   support

           0       0.94      0.97      0.95    156364
           1       0.92      0.86      0.89     73460

    accuracy                           0.93    229824
   macro avg       0.93      0.91      0.92    229824
weighted avg       0.93      0.93      0.93    229824

[[151129   5235]
 [ 10085  63375]]


In [74]:
#Getting the best features so we can build on top of that
pd.concat((pd.DataFrame(x_train.columns, columns = ['variable']), 
           pd.DataFrame(tree.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:15]

Unnamed: 0,variable,importance
49,Change_PY_2,0.76355
1,Años_Curso,0.040806
4,Latitud,0.030451
5,Longitud,0.027437
23,Tipo_Material_Educativo_2,0.021462
6,Variable_3,0.0204
3,Variable_2,0.013657
15,Asignatura_2,0.010756
46,Grupo_Editorial_Nombre_2,0.009858
2,Variable_1,0.00814


When optimizing the depth of the tree the model does not improves, therefore we decide to try a different algorithm to see if we can capture more changes to no-use in the courses

## 2.4. Randome Forest

In [75]:
#Defining the final transformations
def featuredataset(data):
    data = copy(AllData)
    data = data.drop(["Unique_Id", "Change", "Grupo_Editorial", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)
    data["Id_Subasociación"].fillna("901", inplace = True)
    data["Change_PY"].fillna("0", inplace = True)
    data["Curso"] = data["Curso"].astype(str)
    data["Asignatura"] = data["Asignatura"].astype(str)
    data["Tipo_Material_Educativo"] = data["Tipo_Material_Educativo"].astype(str)
    data["Lengua"] = data["Lengua"].astype(str)
    data["Tipo_Soporte_Actual"] = data["Tipo_Soporte_Actual"].astype(str)
    data["Id_Subasociación"] = data["Id_Subasociación"].astype(str)
    data["Change_PY"] = data["Change_PY"].astype(str)
    data = data.drop(data[data["Año_Natural"] == 2016].index)
    data["Latitud"] = data["Latitud"].astype(str)
    data["Longitud"] = data["Longitud"].astype(str)
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    data = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]
    data["Latitud"] = data["Latitud"].astype(float)
    data["Longitud"] = data["Longitud"].astype(float)
    data["Grupo_Curso"] = data.apply(gruposcurso, axis=1)
    data["Variable_3"] = data['Variable_2'] / data['Variable_1']
    data["Variable_3"].fillna(0, inplace = True)
    return data

In [76]:
#Making a copy of the original data
AllTrain = copy(AllData)

In [77]:
#Making the previous transformations
AllTrain = featuredataset(AllTrain)

In [78]:
#Binnary encoding
encodeCol = AllTrain.columns[AllTrain.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
AllTrainTemp = Binary.fit_transform(AllTrain[encodeCol])
AllTrain = pd.concat([AllTrain,AllTrainTemp], axis=1)
AllTrain = AllTrain.drop(encodeCol, axis=1)

In [79]:
#Splitting the data
splits = np.array([0.8, 0.2])

#Shuffle your input
AllTrain = AllTrain.sample(frac=1)

#Split into 2 parts
AllTrainTrain, AllTrainTest = np.array_split(
    AllTrain, (splits[:-1].cumsum() * len(AllTrain)).astype(int))

In [80]:
#Creating features and target variables for the train
x_train = AllTrainTrain.loc[:, AllTrainTrain.columns != 'Target']
y_train = AllTrainTrain['Target']

#Creating features and target variables for the test
x_test = AllTrainTest.loc[:, AllTrainTest.columns != 'Target']
y_test = AllTrainTest['Target']

In [81]:
#Defining the model again but this time using Random Forest
rf = RandomForestClassifier(criterion= 'gini',
                                n_estimators=500, 
                                max_features='auto',
                                oob_score = True, 
                                random_state=1,
                                n_jobs = -1)

#Using the evaluate function to evaluate the model over our data
rf_train, rf_test = evaluates(x_train, x_test, y_train, y_test, rf, Report = True)

              precision    recall  f1-score   support

           0       0.94      0.97      0.96    156825
           1       0.93      0.88      0.90     72999

    accuracy                           0.94    229824
   macro avg       0.93      0.92      0.93    229824
weighted avg       0.94      0.94      0.94    229824

[[151665   5160]
 [  9103  63896]]


## 2.5. XGBoost

In [83]:
#Making a copy of the original data
AllTrain = copy(AllData)

In [84]:
#Making the previous transformations
AllTrain = featuredataset(AllTrain)

In [85]:
#Binnary encoding
encodeCol = AllTrain.columns[AllTrain.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
AllTrainTemp = Binary.fit_transform(AllTrain[encodeCol])
AllTrain = pd.concat([AllTrain,AllTrainTemp], axis=1)
AllTrain = AllTrain.drop(encodeCol, axis=1)

In [86]:
#Splitting the data
splits = np.array([0.8, 0.2])

#Shuffle your input
AllTrain = AllTrain.sample(frac=1)

#Split into 2 parts
AllTrainTrain, AllTrainTest = np.array_split(
    AllTrain, (splits[:-1].cumsum() * len(AllTrain)).astype(int))

In [87]:
#Creating features and target variables for the train
x_train = AllTrainTrain.loc[:, AllTrainTrain.columns != 'Target']
y_train = AllTrainTrain['Target']

#Creating features and target variables for the test
x_test = AllTrainTest.loc[:, AllTrainTest.columns != 'Target']
y_test = AllTrainTest['Target']

In [88]:
#Defining the model again but this time using XGBoost
xgb = xgb.XGBClassifier(learning_rate=0.1,n_estimators=100,
                                random_state=0,
                                  max_depth=20,
                                n_jobs=-1)

xgb_train, xgb_test = evaluates(x_train, x_test, y_train, y_test, xgb, Report = True)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96    156411
           1       0.94      0.90      0.92     73413

    accuracy                           0.95    229824
   macro avg       0.95      0.94      0.94    229824
weighted avg       0.95      0.95      0.95    229824

[[152095   4316]
 [  7515  65898]]


Considering the results from this model, we decide that this is the final model we are going to use to predict.

The model has a F1 score of 92%. 
- 94% of precision that means that for the courses I predicted to be No-Use (1), my model fail to correctly predict 6% of the cases.
- 92% of recall that basically indicates that my model fails to capture 8% of the courses that actually become No-Use. 

This model was trained considering the following transformations on the data:
- Handling N/As
- Dropping outliers
- Feature Engineering to reduce dimensionality (Groups of Educational Levels for the courses)
- Feature Creation to capture changes to No-Use of previous years and to understand the diferent kind of changes of the educational material year to year
- Binnary Encoding to reduce dimensionality in variables with high cardinality.

Probably there is still so much room to improve the model going deep in the feature engineering.

The model probably can be improve with the use of location variables. Also with more feature creation making use of the different information regarding the school, etc.

Finally, considering that we are dealing with a pretty unbalanced data, undersampling could also be use to capture the strict change to no use in the prediction.

# 3. Final Predicition

In [117]:
#Defining the final transformations for our training 
def finaltranformationtrain():
    data = copy(AllData)
    data = data.drop(["Unique_Id", "Change", "Grupo_Editorial", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)
    data["Id_Subasociación"].fillna("901", inplace = True)
    data["Change_PY"].fillna("0", inplace = True)
    data["Curso"] = data["Curso"].astype(str)
    data["Asignatura"] = data["Asignatura"].astype(str)
    data["Tipo_Material_Educativo"] = data["Tipo_Material_Educativo"].astype(str)
    data["Lengua"] = data["Lengua"].astype(str)
    data["Tipo_Soporte_Actual"] = data["Tipo_Soporte_Actual"].astype(str)
    data["Id_Subasociación"] = data["Id_Subasociación"].astype(str)
    data["Change_PY"] = data["Change_PY"].astype(str)
    data = data.drop(data[data["Año_Natural"] == 2016].index)
    data["Latitud"] = data["Latitud"].astype(str)
    data["Longitud"] = data["Longitud"].astype(str)
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    data = data[~((data < (Q1 - 1.5 * IQR)) |(data > (Q3 + 1.5 * IQR))).any(axis=1)]
    data["Latitud"] = data["Latitud"].astype(float)
    data["Longitud"] = data["Longitud"].astype(float)
    data["Grupo_Curso"] = data.apply(gruposcurso, axis=1)
    data["Variable_3"] = data['Variable_2'] / data['Variable_1']
    data["Variable_3"].fillna(0, inplace = True)
    return data

In [118]:
#Defining the final transformations for our test 
def finaltransformationtest():
    data = copy(AllCSD2019)
    data = data.drop(["Unique_Id", "Id_Asociación", "Grupo_Editorial_PY", "Id_Cliente"], axis = 1)
    data["Id_Subasociación"].fillna("901", inplace = True)
    data["Change_PY"].fillna("0", inplace = True)
    data["Curso"] = data["Curso"].astype(str)
    data["Asignatura"] = data["Asignatura"].astype(str)
    data["Tipo_Material_Educativo"] = data["Tipo_Material_Educativo"].astype(str)
    data["Lengua"] = data["Lengua"].astype(str)
    data["Tipo_Soporte_Actual"] = data["Tipo_Soporte_Actual"].astype(str)
    data["Id_Subasociación"] = data["Id_Subasociación"].astype(str)
    data["Change_PY"] = data["Change_PY"].astype(str)
    data = data.drop(data[data["Año_Natural"] == 2016].index)
    data["Grupo_Curso"] = data.apply(gruposcurso, axis=1)
    data["Variable_3"] = data['Variable_2'] / data['Variable_1']
    data["Variable_3"].fillna(0, inplace = True)
    return data

In [119]:
#Transforming both train and test
TransformedTrain = finaltranformationtrain()
TransformedTest = finaltransformationtest()

In [120]:
#Separating my target variables from the features
TransformedTrainFeatures = TransformedTrain.drop(["Target"], axis = 1)
TransformedTrainTarget = TransformedTrain["Target"]

In [121]:
#Joining both dataframes to make the binnary encoding
FinalCSD = pd.concat([TransformedTrainFeatures, TransformedTest], axis=0, join='outer', ignore_index=False)

In [122]:
#Binnary encoding
encodeCol = FinalCSD.columns[FinalCSD.dtypes==object].tolist()
Binary = ce.BinaryEncoder(cols=encodeCol)
FinalCSDTemp = Binary.fit_transform(FinalCSD[encodeCol])
FinalCSD = pd.concat([FinalCSD,FinalCSDTemp], axis=1)
FinalCSD = FinalCSD.drop(encodeCol, axis=1)

TBETrainFeatures = FinalCSD[FinalCSD["Año_Natural"] != 2019]
TBETest = FinalCSD[FinalCSD["Año_Natural"] == 2019]

In [123]:
#Predicting

#Training the model with all the data
xgb.fit(TBETrainFeatures, TransformedTrainTarget)

#Predicting
y_pred = xgb.predict(TBETest)

In [124]:
CSD2019["Target"] = y_pred
CSD2019.to_csv('CDS_2019_va.csv',index=False)