# Model Performance Transformations

Lets practice some basic data transformation for ML performance enhancement

In [1]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Categorical data analyser

def cat_var(df, cols):
    '''
    Return: a Pandas dataframe object with the following columns:
        - "categorical_variable" => every categorical variable include as an input parameter (string).
        - "number_of_possible_values" => the amount of unique values that can take a given categorical variable (integer).
        - "values" => a list with the posible unique values for every categorical variable (list).

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
    '''
    cat_list = []
    for col in cols:
        cat = df[col].unique()
        cat_num = len(cat)
        cat_dict = {"categorical_variable":col,
                    "number_of_possible_values":cat_num,
                    "values":cat}
        cat_list.append(cat_dict)
    df = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values", ascending=False)
    return df.reset_index(drop=True)

In [3]:
# Weather dataset (https://www.kaggle.com/jsphyg/weather-dataset-rattle-package)

weather = pd.read_csv('../data/weatherAUS.csv')
print(weather.shape)
weather.head()

(145460, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
col_weather = list(weather.columns)
col_weather

['Date',
 'Location',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday',
 'RainTomorrow']

In [5]:
cat_weather = cat_var(weather, col_weather)
cat_weather

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,Date,3436,"[2008-12-01, 2008-12-02, 2008-12-03, 2008-12-0..."
1,Rainfall,682,"[0.6, 0.0, 1.0, 0.2, 1.4, 2.2, 15.6, 3.6, nan,..."
2,Pressure3pm,550,"[1007.1, 1007.8, 1008.7, 1012.8, 1006.0, 1005...."
3,Pressure9am,547,"[1007.7, 1010.6, 1007.6, 1017.6, 1010.8, 1009...."
4,MaxTemp,506,"[22.9, 25.1, 25.7, 28.0, 32.3, 29.7, 25.0, 26...."
5,Temp3pm,503,"[21.8, 24.3, 23.2, 26.5, 29.7, 28.9, 24.6, 25...."
6,Temp9am,442,"[16.9, 17.2, 21.0, 18.1, 17.8, 20.6, 16.3, 18...."
7,MinTemp,390,"[13.4, 7.4, 12.9, 9.2, 17.5, 14.6, 14.3, 7.7, ..."
8,Evaporation,359,"[nan, 12.0, 14.8, 12.6, 10.8, 11.4, 11.2, 13.0..."
9,Sunshine,146,"[nan, 12.3, 13.0, 13.3, 10.6, 12.2, 8.4, 0.0, ..."


In [6]:
weather.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,143975.0,144199.0,142199.0,82670.0,75625.0,135197.0,143693.0,142398.0,142806.0,140953.0,130395.0,130432.0,89572.0,86102.0,143693.0,141851.0
mean,12.194034,23.221348,2.360918,5.468232,7.611178,40.03523,14.043426,18.662657,68.880831,51.539116,1017.64994,1015.255889,4.447461,4.50993,16.990631,21.68339
std,6.398495,7.119049,8.47806,4.193704,3.785483,13.607062,8.915375,8.8098,19.029164,20.795902,7.10653,7.037414,2.887159,2.720357,6.488753,6.93665
min,-8.5,-4.8,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,977.1,0.0,0.0,-7.2,-5.4
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.6,1015.2,5.0,5.0,16.7,21.1
75%,16.9,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,371.0,145.0,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7


## Scaling

Some ML algorithms have problems performing well whenever the data scale differ greatly between features. In those cases scaling the data is your best option.

- [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)

- [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

Try both options and see what happens with performance (i.e.: AUC).

<img src="../images/scaling.png" alt="Drawing" style="width: 500px;"/>

In [7]:
# Uluru weather (numerical features)

weather = weather[weather['Location'].isin(['Uluru'])].reset_index(drop = True)
weather = weather[weather['RainToday'].isin(['No','Yes'])].reset_index(drop = True)
weather = weather[weather['RainTomorrow'].isin(['No','Yes'])]
weather = weather[['MinTemp',
                   'MaxTemp',
                   'Rainfall',
                   'WindSpeed9am',
                   'WindSpeed3pm',
                   'Humidity9am',
                   'Humidity3pm',
                   'Pressure9am',
                   'Pressure3pm',
                   'Temp9am',
                   'Temp3pm',
                   'RainTomorrow']]
weather = weather.dropna().reset_index(drop = True)
col_weather = list(weather.columns)
print(col_weather)
print(weather.shape)
print(weather.describe())
weather.head()

['MinTemp', 'MaxTemp', 'Rainfall', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainTomorrow']
(1479, 12)
           MinTemp      MaxTemp     Rainfall  WindSpeed9am  WindSpeed3pm   
count  1479.000000  1479.000000  1479.000000   1479.000000   1479.000000  \
mean     14.368627    30.402299     0.716700     17.613928     17.050710   
std       7.432857     7.624058     4.208585      7.887082      6.893016   
min      -1.900000    11.300000     0.000000      0.000000      0.000000   
25%       8.100000    23.800000     0.000000     11.000000     11.000000   
50%      14.900000    31.200000     0.000000     17.000000     17.000000   
75%      20.800000    37.100000     0.000000     24.000000     22.000000   
max      31.000000    44.400000    83.800000     41.000000     48.000000   

       Humidity9am  Humidity3pm  Pressure9am  Pressure3pm      Temp9am   
count  1479.000000  1479.000000  1479.000000  1479.000000  1479.0

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow
0,19.7,30.0,0.8,30.0,24.0,76.0,54.0,1010.6,1007.5,21.7,28.4,No
1,21.6,33.1,0.0,22.0,11.0,44.0,33.0,1010.5,1006.5,24.6,31.3,No
2,21.3,36.1,0.0,24.0,13.0,39.0,27.0,1006.9,1002.7,27.6,34.5,No
3,22.9,37.7,0.0,28.0,13.0,35.0,22.0,1006.0,1002.1,28.7,35.4,No
4,24.0,39.0,0.0,20.0,19.0,33.0,21.0,1006.9,1003.5,29.9,37.3,No


In [8]:
# Features + target
#features
X = weather[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]
#target
y = pd.get_dummies(weather['RainTomorrow'], drop_first = True)['Yes']
print(X.shape, y.shape)

(1479, 11) (1479,)


In [9]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [10]:
# Scaling standard
scaler = StandardScaler()
scaling_X_train = scaler.fit_transform(X_train)
scaling_X_test = scaler.fit_transform(X_test)
scaled_X_train = pd.DataFrame(scaling_X_train)
scaled_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.861467,-1.291213,-0.172964,0.298824,1.021273,1.043379,1.116760,1.408494,1.555798,-1.065644,-1.347235
1,-1.821583,-1.409177,-0.172964,-1.095940,-0.871947,0.814233,-0.052270,1.084717,1.157436,-1.466209,-1.428600
2,1.748425,1.644760,-0.172964,-1.095940,0.001847,-0.881446,-0.606021,-1.705931,-1.753670,1.764156,1.446322
3,-1.808060,-1.461605,-0.172964,0.172028,0.293111,0.722574,0.255369,1.084717,1.050184,-1.427444,-1.591332
4,0.747741,1.041836,-0.172964,1.947182,0.730008,-1.385567,-0.975189,-1.150885,-0.757766,1.454041,0.985250
...,...,...,...,...,...,...,...,...,...,...,...
1178,-0.780330,-0.334401,-0.172964,-1.095940,3.497022,0.676745,0.070786,-0.272062,-0.497298,-0.600471,-0.194553
1179,1.221037,0.923873,-0.172964,-0.081566,-0.580683,-0.285667,0.009258,-1.382154,-1.294022,1.260219,0.971689
1180,-1.104877,-0.164010,-0.172964,-0.588753,-1.600109,-0.194009,-0.606021,0.714686,0.437320,-0.626314,-0.235236
1181,-1.280672,-1.264999,-0.172964,-0.081566,1.021273,-0.194009,-0.790605,1.531837,1.617084,-1.375759,-1.252308


In [11]:
scaled_X_test = pd.DataFrame(scaling_X_test)
scaled_X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.339316,1.515163,-0.168327,0.317912,0.242960,-1.030095,-0.779035,-0.726011,-0.808638,1.703820,1.566997
1,1.497833,1.422887,-0.099698,1.333858,-0.903116,-0.699853,-0.407309,-1.065070,-1.178810,1.485032,1.375546
2,-0.589306,0.051928,-0.168327,-1.459993,-0.330078,0.762650,0.088327,0.242731,0.028274,-0.368225,0.158465
3,-0.206224,1.383340,-0.168327,-1.459993,-0.330078,-1.407515,-1.150762,-1.032779,-0.744260,0.545534,1.225120
4,1.735608,0.223298,-0.168327,-0.571040,1.245777,-1.077273,1.017644,-0.177057,0.076557,0.970239,-0.128712
...,...,...,...,...,...,...,...,...,...,...,...
291,1.537462,1.053782,-0.168327,0.190919,0.386220,-0.086545,0.398099,-1.630169,-1.484605,1.343464,0.896918
292,-1.566827,-1.239938,-0.168327,-0.063068,0.242960,0.715472,0.522008,1.647406,1.573342,-1.397813,-1.222718
293,1.127960,-0.224901,-0.168327,-0.571040,-0.043559,1.847732,2.876278,-0.290077,-0.036104,0.313876,-0.306488
294,0.229698,-0.897198,-0.168327,2.222810,0.672739,-0.086545,0.460054,-0.887467,0.060463,0.017870,-1.195367


In [12]:
# Scaling robust
scaler_r = RobustScaler()
scaling_X_train_r = scaler_r.fit_transform(X_train)
scaling_X_test_r = scaler_r.fit_transform(X_test)
scaled_X_train_r = pd.DataFrame(scaling_X_train_r)
scaled_X_train_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.535433,-0.785185,0.0,0.076923,0.636364,0.84375,1.3750,0.948454,1.072917,-0.632727,-0.824903
1,-1.094488,-0.851852,0.0,-0.769231,-0.545455,0.68750,0.1875,0.731959,0.802083,-0.858182,-0.871595
2,0.984252,0.874074,0.0,-0.769231,0.000000,-0.46875,-0.3750,-1.134021,-1.177083,0.960000,0.778210
3,-1.086614,-0.881481,0.0,0.000000,0.181818,0.62500,0.5000,0.731959,0.729167,-0.836364,-0.964981
4,0.401575,0.533333,0.0,1.076923,0.454545,-0.81250,-0.7500,-0.762887,-0.500000,0.785455,0.513619
...,...,...,...,...,...,...,...,...,...,...,...
1178,-0.488189,-0.244444,0.0,-0.769231,2.181818,0.59375,0.3125,-0.175258,-0.322917,-0.370909,-0.163424
1179,0.677165,0.466667,0.0,-0.153846,-0.363636,-0.06250,0.2500,-0.917526,-0.864583,0.676364,0.505837
1180,-0.677165,-0.148148,0.0,-0.461538,-1.000000,0.00000,-0.3750,0.484536,0.312500,-0.385455,-0.186770
1181,-0.779528,-0.770370,0.0,-0.153846,0.636364,0.00000,-0.5625,1.030928,1.114583,-0.807273,-0.770428


In [13]:
scaled_X_test_r = pd.DataFrame(scaling_X_test_r)
scaled_X_test_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.730924,0.877339,0.0,0.230769,0.222222,-0.645161,-0.473684,-0.572271,-0.607450,1.017682,0.900862
1,0.827309,0.819127,0.2,0.846154,-0.666667,-0.419355,-0.157895,-0.820059,-0.871060,0.884086,0.780172
2,-0.441767,-0.045738,0.0,-0.846154,-0.222222,0.580645,0.263158,0.135693,-0.011461,-0.247544,0.012931
3,-0.208835,0.794179,0.0,-0.846154,-0.222222,-0.903226,-0.789474,-0.796460,-0.561605,0.310413,0.685345
4,0.971888,0.062370,0.0,-0.307692,1.000000,-0.677419,1.052632,-0.171091,0.022923,0.569745,-0.168103
...,...,...,...,...,...,...,...,...,...,...,...
291,0.851406,0.586279,0.0,0.153846,0.333333,0.000000,0.526316,-1.233038,-1.088825,0.797642,0.478448
292,-1.036145,-0.860707,0.0,0.000000,0.222222,0.548387,0.631579,1.162242,1.088825,-0.876228,-0.857759
293,0.602410,-0.220374,0.0,-0.307692,0.000000,1.322581,2.631579,-0.253687,-0.057307,0.168959,-0.280172
294,0.056225,-0.644491,0.0,1.384615,0.555556,0.000000,0.578947,-0.690265,0.011461,-0.011788,-0.840517


In [14]:
# Linear model
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC is: {linear_auc}")

Linear model AUC is: 0.7296218886566597


In [15]:
#linear model scaled standard
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(scaled_X_train, y_train)
linear_pred = linear_model.predict(scaled_X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC scaled standard is: {linear_auc}")

Linear model AUC scaled standard is: 0.6787953638609159


In [16]:
#linear model scaled robust
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(scaled_X_train_r, y_train)
linear_pred = linear_model.predict(scaled_X_test_r)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC scaled robust is: {linear_auc}")

Linear model AUC scaled robust is: 0.6542846285388563


In [17]:
# Ensemble model
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC is: {ensemble_auc}")

Ensemble model AUC is: 0.6942808284248527


In [18]:
# Ensemble model scaled standard
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(scaled_X_train, y_train)
ensemble_pred = ensemble_model.predict(scaled_X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC scaled standard is: {ensemble_auc}")

Ensemble model AUC scaled standard is: 0.6924757742732283


In [19]:
# Ensemble model scaled robust
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(scaled_X_train_r, y_train)
ensemble_pred = ensemble_model.predict(scaled_X_test_r)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC scaled robust is: {ensemble_auc}")

Ensemble model AUC scaled robust is: 0.6488694660839825


---

## Enconding

ML algorithms do not support categorical data. Therefore you need to find a way to transform categorical data into numerical. You must compare the results using both techniques: __One Hot Encoding__ or __Label Encoding__

- [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)

<img src="../images/encoding.png" alt="Drawing" style="width: 500px;"/>

In [20]:
# Mushrooms dataset (https://www.kaggle.com/uciml/mushroom-classification)
mushrooms = pd.read_csv('../data/mushrooms.csv')
col_mushrooms = list(mushrooms.columns)
print(mushrooms.shape)
mushrooms.head()

(8124, 23)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [39]:
col_mushrooms

['class',
 'cap-shape',
 'cap-surface',
 'cap-color',
 'bruises',
 'odor',
 'gill-attachment',
 'gill-spacing',
 'gill-size',
 'gill-color',
 'stalk-shape',
 'stalk-root',
 'stalk-surface-above-ring',
 'stalk-surface-below-ring',
 'stalk-color-above-ring',
 'stalk-color-below-ring',
 'veil-type',
 'veil-color',
 'ring-number',
 'ring-type',
 'spore-print-color',
 'population',
 'habitat']

In [21]:
# Features analysis
cat_mushrooms = cat_var(mushrooms, col_mushrooms)
cat_mushrooms

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,gill-color,12,"[k, n, g, p, w, h, u, e, b, r, y, o]"
1,cap-color,10,"[n, y, w, g, e, p, b, u, c, r]"
2,spore-print-color,9,"[k, n, u, h, w, r, o, y, b]"
3,odor,9,"[p, a, l, n, f, c, y, s, m]"
4,stalk-color-below-ring,9,"[w, p, g, b, n, e, y, o, c]"
5,stalk-color-above-ring,9,"[w, g, p, n, b, e, o, c, y]"
6,habitat,7,"[u, g, m, d, p, w, l]"
7,cap-shape,6,"[x, b, s, f, k, c]"
8,population,6,"[s, n, a, v, y, c]"
9,ring-type,5,"[p, e, l, f, n]"


In [22]:
"""
# Features + target (encoding). IMPORTANT: you may pick any of the 2-labeled features as you target (choose wisely!!!)
X_columns = mushrooms[['gill-color',
              'cap-color',
              'spore-print-color',
              'odor',
              'stalk-color-below-ring', 
              'stalk-color-above-ring',
             'habitat',
             'cap-shape',
             'cap-shape',
             'population',
             'ring-type',
             'stalk-root',
             'stalk-surface-above-ring',
             'stalk-surface-below-ring',
             'veil-color',
             'cap-surface',
             'ring-number',
             'stalk-shape',
             'gill-size',
             'gill-spacing',
             'gill-attachment',
             'bruises',
             'veil-type']]

y_columns = mushrooms['class']

X = pd.get_dummies(X_columns)
y = pd.get_dummies(y_columns, drop_first = True).astype(int)

y
"""

"\n# Features + target (encoding). IMPORTANT: you may pick any of the 2-labeled features as you target (choose wisely!!!)\nX_columns = mushrooms[['gill-color',\n              'cap-color',\n              'spore-print-color',\n              'odor',\n              'stalk-color-below-ring', \n              'stalk-color-above-ring',\n             'habitat',\n             'cap-shape',\n             'cap-shape',\n             'population',\n             'ring-type',\n             'stalk-root',\n             'stalk-surface-above-ring',\n             'stalk-surface-below-ring',\n             'veil-color',\n             'cap-surface',\n             'ring-number',\n             'stalk-shape',\n             'gill-size',\n             'gill-spacing',\n             'gill-attachment',\n             'bruises',\n             'veil-type']]\n\ny_columns = mushrooms['class']\n\nX = pd.get_dummies(X_columns)\ny = pd.get_dummies(y_columns, drop_first = True).astype(int)\n\ny\n"

In [23]:
# Features + target (encoding). IMPORTANT: you may pick any of the 2-labeled features as you target (choose wisely!!!)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

labelencoder = LabelEncoder()

for i in col_mushrooms: 
    mushrooms[i] = labelencoder.fit_transform(mushrooms[i])

mushrooms

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [24]:
corr = mushrooms.corr()
corr.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
class,1.0,0.052951,0.178446,-0.031384,-0.50153,-0.093552,0.1292,-0.348387,0.540024,-0.530566,-0.102019,-0.379361,-0.334593,-0.298801,-0.154003,-0.14673,,0.145142,-0.214366,-0.411771,0.171961,0.298686,0.217179
cap-shape,0.052951,1.0,-0.050454,-0.048203,-0.035374,-0.021935,0.078865,0.013196,0.05405,-0.006039,0.063794,0.030191,-0.030417,-0.032591,-0.031659,-0.03039,,0.07256,-0.106534,-0.025457,-0.073416,0.063413,-0.042221
cap-surface,0.178446,-0.050454,1.0,-0.019402,0.070228,0.045233,-0.03418,-0.282306,0.2081,-0.161017,-0.014123,-0.126245,0.08909,0.107965,0.06605,0.068885,,-0.016603,-0.026147,-0.106407,0.230364,0.021555,0.163887
cap-color,-0.031384,-0.048203,-0.019402,1.0,-0.000764,-0.387121,0.041436,0.144259,-0.169464,0.084659,-0.456496,0.321274,-0.060837,-0.04771,0.002364,0.008057,,0.03613,-0.005822,0.162513,-0.293523,-0.14477,0.033925
bruises,-0.50153,-0.035374,0.070228,-0.000764,1.0,-0.061825,0.137359,-0.299473,-0.369596,0.52712,0.099364,0.244188,0.460824,0.458983,0.083538,0.092874,,0.11977,0.056788,0.692973,-0.285008,0.088137,-0.075095
odor,-0.093552,-0.021935,0.045233,-0.387121,-0.061825,1.0,-0.05959,0.063936,0.310495,-0.129213,0.459766,-0.205215,0.118617,0.06182,0.174532,0.169407,,-0.057747,0.111905,-0.281387,0.469055,-0.043623,-0.02661
gill-attachment,0.1292,0.078865,-0.03418,0.041436,0.137359,-0.05959,1.0,0.071489,0.108984,-0.128567,0.186485,0.144063,-0.088916,-0.116177,0.099299,0.09716,,0.897518,0.093236,-0.146689,-0.029524,0.165575,-0.030304
gill-spacing,-0.348387,0.013196,-0.282306,0.144259,-0.299473,0.063936,0.071489,1.0,-0.108333,0.100193,0.080895,0.350548,-0.212359,-0.213775,0.274574,0.253505,,0.073363,0.243014,-0.195897,0.047323,-0.529253,-0.15468
gill-size,0.540024,0.05405,0.2081,-0.169464,-0.369596,0.310495,0.108984,-0.108333,1.0,-0.516736,0.214576,-0.344345,0.05631,0.010894,0.296548,0.278708,,0.103809,-0.171362,-0.460872,0.622991,0.147682,0.161418
gill-color,-0.530566,-0.006039,-0.161017,0.084659,0.52712,-0.129213,-0.128567,0.100193,-0.516736,1.0,-0.175699,0.31508,0.224287,0.257224,-0.058299,-0.074781,,-0.097583,0.096054,0.629398,-0.416135,-0.03409,-0.202972


In [25]:
X = mushrooms[['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type',
       'spore-print-color', 'population', 'habitat']]

y = mushrooms['class']

In [26]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (6499, 21), X_test: (1625, 21), y_train: (6499,), y_test: (1625,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [27]:
# Scaling standard
scaler = StandardScaler()
scaling_X_train = scaler.fit_transform(X_train)
scaling_X_test = scaler.fit_transform(X_test)
scaled_X_train = pd.DataFrame(scaling_X_train)
scaled_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-0.231089,0.136921,-0.982598,-0.842237,1.357976,0.162899,-0.438473,1.498210,-1.354131,0.869217,...,0.688059,-0.887615,0.099260,0.632627,0.141309,-0.258254,-1.261850,1.419717,0.287342,-0.877177
1,1.019408,0.136921,-0.199676,-0.842237,-1.020476,0.162899,-0.438473,1.498210,-1.354131,0.869217,...,-0.918730,0.593034,0.623297,0.632627,0.141309,-0.258254,-1.261850,1.419717,0.287342,1.443749
2,-0.856337,0.951054,-0.982598,-0.842237,1.833666,0.162899,-0.438473,1.498210,-1.354131,0.869217,...,0.688059,0.593034,0.099260,0.632627,0.141309,-0.258254,-1.261850,1.419717,0.287342,0.283286
3,-0.856337,-1.491346,-0.199676,1.187314,0.406595,0.162899,-0.438473,-0.667463,1.177739,0.869217,...,0.688059,0.593034,-1.472853,0.109809,0.141309,-0.258254,0.954123,-0.255816,0.287342,-0.877177
4,-2.106834,0.951054,1.757630,1.187314,-0.544786,0.162899,-0.438473,-0.667463,-0.228856,-1.150461,...,0.688059,0.593034,0.623297,0.632627,0.141309,-0.258254,0.954123,-0.255816,-1.301290,0.863517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,1.019408,0.951054,-0.199676,-0.842237,-1.020476,0.162899,-0.438473,1.498210,-1.354131,0.869217,...,-0.918730,0.593034,0.099260,0.632627,0.141309,-0.258254,-1.261850,1.419717,0.287342,1.443749
6495,-0.231089,0.951054,-0.982598,1.187314,0.406595,0.162899,-0.438473,-0.667463,1.459058,-1.150461,...,0.688059,0.593034,0.623297,-1.981466,0.141309,3.382510,-1.261850,1.419717,-2.095607,2.604212
6496,-0.856337,0.951054,-0.199676,1.187314,-0.544786,0.162899,-0.438473,-0.667463,1.459058,-1.150461,...,0.688059,2.073683,0.623297,0.632627,0.141309,-0.258254,0.954123,-0.255816,1.081659,1.443749
6497,-0.231089,0.136921,-0.982598,-0.842237,-1.020476,0.162899,-0.438473,1.498210,-1.354131,0.869217,...,0.688059,0.593034,0.099260,0.109809,0.141309,-0.258254,-1.261850,1.419717,0.287342,1.443749


In [28]:
scaled_X_test = pd.DataFrame(scaling_X_test)
scaled_X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-0.778772,-1.465717,-0.192479,-0.847207,0.406433,0.162886,2.270529,-0.675347,-0.516865,0.890841,...,0.666732,-2.390245,0.619093,0.629504,0.145841,-0.247625,-1.314966,-0.228965,-0.545182,-0.290838
1,-0.778772,0.152897,-0.989364,-0.847207,1.828803,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,0.666732,0.560114,0.085675,0.098931,0.145841,-0.247625,-1.314966,1.464180,0.272088,0.295529
2,1.072283,0.962204,-0.192479,-0.847207,-1.015937,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,-0.952759,0.560114,0.619093,0.098931,0.145841,-0.247625,-1.314966,1.464180,0.272088,0.295529
3,-0.778772,0.962204,-0.590921,1.180350,0.406433,0.162886,-0.440426,-0.675347,0.057626,0.890841,...,0.666732,0.560114,-1.514579,0.098931,0.145841,-0.247625,0.924300,-0.228965,1.089358,-0.877205
4,-0.778772,0.152897,-0.989364,-0.847207,1.354680,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,0.666732,0.560114,0.085675,0.098931,0.145841,-0.247625,-1.314966,1.464180,0.272088,0.295529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,-0.161754,0.152897,-0.989364,-0.847207,-1.015937,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,-0.952759,0.560114,0.085675,0.629504,0.145841,-0.247625,-1.314966,1.464180,0.272088,1.468263
1621,-0.778772,0.152897,-0.192479,-0.847207,0.406433,0.162886,2.270529,-0.675347,-0.516865,0.890841,...,-2.572251,0.560114,0.619093,0.629504,0.145841,-0.247625,-1.314966,-0.652252,-2.996992,-0.290838
1622,1.072283,0.962204,-0.192479,-0.847207,-1.015937,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,-0.952759,-0.915065,0.085675,0.098931,0.145841,-0.247625,-1.314966,1.464180,0.272088,1.468263
1623,-0.161754,0.962204,-0.192479,-0.847207,1.828803,0.162886,-0.440426,1.480721,-1.378601,0.890841,...,0.666732,-0.915065,0.085675,0.098931,0.145841,-0.247625,-1.314966,1.464180,0.272088,1.468263


In [29]:
# Scaling robust
scaler_r = RobustScaler()
scaling_X_train_r = scaler_r.fit_transform(X_train)
scaling_X_test_r = scaler_r.fit_transform(X_test)
scaled_X_train_r = pd.DataFrame(scaling_X_train_r)
scaled_X_train_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.000000,0.000000,-0.4,0.0,0.666667,0.0,0.0,1.0,-1.0,0.0,...,0.0,-1.0,-1.0,0.0,0.0,0.0,-0.5,0.8,0.0,-0.5
1,0.666667,0.000000,0.0,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.8,0.0,1.5
2,-0.333333,0.333333,-0.4,0.0,1.000000,0.0,0.0,1.0,-1.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,-0.5,0.8,0.0,0.5
3,-0.333333,-0.666667,0.0,1.0,0.000000,0.0,0.0,0.0,0.8,0.0,...,0.0,0.0,-4.0,-1.0,0.0,0.0,0.5,0.0,0.0,-0.5
4,-1.000000,0.333333,1.0,1.0,-0.666667,0.0,0.0,0.0,-0.2,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,-2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,0.666667,0.333333,0.0,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,0.0,0.0,-0.5,0.8,0.0,1.5
6495,0.000000,0.333333,-0.4,1.0,0.000000,0.0,0.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,-5.0,0.0,1.0,-0.5,0.8,-3.0,2.5
6496,-0.333333,0.333333,0.0,1.0,-0.666667,0.0,0.0,0.0,1.0,-1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,1.5
6497,0.000000,0.000000,-0.4,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,-0.5,0.8,0.0,1.5


In [30]:
scaled_X_test_r = pd.DataFrame(scaling_X_test_r)
scaled_X_test_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,-0.333333,-0.666667,0.0,0.0,0.000000,0.0,1.0,0.0,-0.4,0.0,...,0.0,-2.0,0.0,0.0,0.0,0.0,-1.0,0.0,-1.0,0.0
1,-0.333333,0.000000,-0.4,0.0,1.000000,0.0,0.0,1.0,-1.0,0.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,0.8,0.0,0.5
2,0.666667,0.333333,0.0,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,-1.0,0.0,0.0,-1.0,0.0,0.0,-1.0,0.8,0.0,0.5
3,-0.333333,0.333333,-0.2,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-4.0,-1.0,0.0,0.0,0.0,0.0,1.0,-0.5
4,-0.333333,0.000000,-0.4,0.0,0.666667,0.0,0.0,1.0,-1.0,0.0,...,0.0,0.0,-1.0,-1.0,0.0,0.0,-1.0,0.8,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1620,0.000000,0.000000,-0.4,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.8,0.0,1.5
1621,-0.333333,0.000000,0.0,0.0,0.000000,0.0,1.0,0.0,-0.4,0.0,...,-2.0,0.0,0.0,0.0,0.0,0.0,-1.0,-0.2,-4.0,0.0
1622,0.666667,0.333333,0.0,0.0,-1.000000,0.0,0.0,1.0,-1.0,0.0,...,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.8,0.0,1.5
1623,0.000000,0.333333,0.0,0.0,1.000000,0.0,0.0,1.0,-1.0,0.0,...,0.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.8,0.0,1.5


In [31]:
# Linear model
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC is: {linear_auc}")

Linear model AUC is: 0.9476878945915361


In [32]:
# Linear model standard scaling
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(scaled_X_train, y_train)
linear_pred = linear_model.predict(scaled_X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC standard scaling is: {linear_auc}")

Linear model AUC standard scaling is: 0.9561849805681207


In [33]:
# Linear model robust scaling
linear_model = LogisticRegression(max_iter = 1000)
linear_param = linear_model.fit(scaled_X_train_r, y_train)
linear_pred = linear_model.predict(scaled_X_test_r)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC robust scaling is: {linear_auc}")

Linear model AUC robust scaling is: 0.9102712878436227


In [34]:
# Ensemble model
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC is: {ensemble_auc}")

Ensemble model AUC is: 1.0


In [35]:
# Ensemble model standard scaling
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(scaled_X_train, y_train)
ensemble_pred = ensemble_model.predict(scaled_X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC standard scaling is: {ensemble_auc}")

Ensemble model AUC standard scaling is: 1.0


In [36]:
# Ensemble model robust scaling
ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(scaled_X_train_r, y_train)
ensemble_pred = ensemble_model.predict(scaled_X_test_r)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble model AUC robust scaling is: {ensemble_auc}")

Ensemble model AUC robust scaling is: 1.0


---

## Bonus

Now that you can grasp the potential of pre-processing your data...what would you do about the following dataset?

<img src="../images/bonus.jpg" alt="Drawing" style="width: 500px;"/>

In [37]:
# Netflix dataset (https://www.kaggle.com/shivamb/netflix-shows)

netflix = pd.read_csv('../data/netflix_titles.csv')
col_netflix = list(netflix.columns)
print(netflix.shape)
netflix.head()

(7787, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [40]:
col_netflix

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

In [38]:
# ML workflow -> ¿what would you do?










---