In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, recall_score, precision_score, plot_precision_recall_curve, mean_squared_error, r2_score
import seaborn as sns
import category_encoders as ce
#import xgboost
from sklearn.neural_network import MLPClassifier
np.random.seed(123)


import xgboost as xgb
pd.options.display.max_columns = None
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn import metrics

Wczytanie danych

In [188]:
df = pd.read_csv("students-all.csv")
# remove rownames
df = df.iloc[:,1:]
df['romantic'] = np.where(df['romantic']=='yes', 1, 0)
df_dummies = pd.get_dummies(df, drop_first=True)

Zmodyfikowana fun encodująca dane. Beware: target musi być wynikiem encodingu, jeśli romantic znajduje się jako kolumna do zenkodowania -> "romantic_yes"/"romantic_1" itp, dlatego lepiej zamienić "romanitc" ifelsem ręcznie na 0 i 1 dać jako target.

In [187]:
def train_fast(df, max_depth, target, encoder = None):
    
    """
    From Standard cross validation differs in data partitnion. Due to optimizing model on validation set,
    we get final AUC score from equally big test set.
    """
    
    # shuffling index
    df = df.sample(frac=1).reset_index(drop=True)
    n = len(df)
    
    # 5 fold cross validation
    kf = KFold(n_splits=5)
        
    
    ret = []
    tpr_arr = []
    fpr_arr = []
    
    # we will make test 10% and validation 10%
    for train_index, test_index in kf.split(df):
        
        train = df.iloc[train_index,:]
        if encoder:
            encoder.fit(train, train[target])
            train = encoder.transform(train)
        
        v_ind = test_index[0:(len(test_index)//2)] 
        t_ind = test_index[(len(test_index)//2):len(test_index)]
        
        validation = df.iloc[v_ind,:]
        if encoder: validation = encoder.transform(validation)
        
        test = df.iloc[t_ind,:]
        if encoder: test = encoder.transform(test)
        
        label_train = train[target]
        label_val = validation[target]
        label_test = test[target]

        df_train = train.drop([target], axis = 1)
        df_test  = test.drop([target], axis = 1)
        df_val  = validation.drop([target], axis = 1)
        
        dtrain = xgb.DMatrix(data = df_train, label=label_train)
        dval = xgb.DMatrix(data = df_val, label=label_val)
        dtest = xgb.DMatrix(data = df_test, label=label_val)
        
        param = {'max_depth': max_depth, 'objective': 'binary:logistic', 'eval_metric':'auc'}
        
        evallist = [(dval, 'eval'), (dtrain, 'train')]
        
        num_round = 100
        
        bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=0)
        y_pred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

        fpr, tpr, thresholds = metrics.roc_curve(label_test.to_numpy(), y_pred)

        
        ret.append(metrics.auc(fpr, tpr))
        
        fpr_arr.append(fpr)
        tpr_arr.append(tpr)
        
    # fpr and tpr arrays are problematic, we will get them from median AUC score
    # note that we are returning mean auc score, so it is not ideal 
    index = np.where(np.mean(np.array(ret))== np.mean(np.array(ret)))[0][0]
    
    
    return np.array(ret).mean(), bst, fpr_arr[index],tpr_arr[index]

In [192]:
auc, m, _,_ = train_fast(df_dummies, 19, "romantic")

In [195]:
X = df_dummies.drop(['romantic'], axis = 1)
X = xgb.DMatrix(X)
y = df_dummies['romantic']

In [197]:
y_pred = m.predict(X)

In [198]:
roc_auc_score(y, y_pred)

0.9939723569486109

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 66)

In [29]:
def measure_encoder(encoder, model):
    """
    Funkcja działająca jako pipeline w następujących krokach:
    * kodowanie
    * dopasowanie
    * liczenie rmse i r2
    """
    #logreg = LogisticRegression(*args)
    
    X_train, X_test, y_train, y_test = train_test_split(df.drop(['romantic'], axis = 1),y, test_size = 0.3, random_state = 66)
    
    encoder.fit(X_train, y_train)
    
    X_train = encoder.transform(X_train)
    X_test = encoder.transform(X_test)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    #mse = mean_squared_error(y_test, y_pred)
    #rmse = np.sqrt(mse)
    #r2 = r2_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    return auc

In [22]:
columns= ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'major']

In [53]:
enc = [ce.BackwardDifferenceEncoder(cols=columns),
ce.BaseNEncoder(cols=columns),
ce.BinaryEncoder(cols=columns),
ce.CatBoostEncoder(cols=columns),
ce.HashingEncoder(cols=columns),
ce.HelmertEncoder(cols=columns),
ce.JamesSteinEncoder(cols=columns),
ce.LeaveOneOutEncoder(cols=columns),
ce.MEstimateEncoder(cols=columns),
ce.OneHotEncoder(cols=columns),
ce.OrdinalEncoder(cols=columns),
ce.SumEncoder(cols=columns),
ce.PolynomialEncoder(cols=columns),
ce.TargetEncoder(cols=columns),
ce.WOEEncoder(cols=columns)]

In [54]:
def measure_encodesrs(model):
    res = pd.DataFrame()
    names = ['BackwardDifferenceEncoder',
            'BaseNEncoder',
            'BinaryEncoder',
            'CatBoostEncoder',
             'HashingEncoder',
             'HelmertEncoder',
             'JamesSteinEncoder',
             'LeaveOneOutEncoder',
             'MEstimateEncoder',
             'OneHotEncoder',
             'OrdinalEncoder',
             'SumEncoder',
             'PolynomialEncoder',
             'TargetEncoder',
             'WOEEncoder'
            ]
    for i in range(len(enc)):
        auc = measure_encoder(enc[i], model)
        res = res.append({"Name": names[i], "auc": auc}, ignore_index = True)
    return res

In [63]:
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
models = [#AdaBoostClassifier(), 
          BaggingClassifier(), 
          ExtraTreesClassifier(),
          RandomForestClassifier(), 
          GradientBoostingClassifier(),
         XGBClassifier(),
    XGBRFClassifier()]

In [64]:
r = []
for m in models:
    print(measure_encodesrs(m))

                         Name       auc
0   BackwardDifferenceEncoder  0.618635
1                BaseNEncoder  0.648426
2               BinaryEncoder  0.655366
3             CatBoostEncoder  0.641973
4              HashingEncoder  0.554234
5              HelmertEncoder  0.644935
6           JamesSteinEncoder  0.639984
7          LeaveOneOutEncoder  0.659300
8            MEstimateEncoder  0.614215
9               OneHotEncoder  0.627564
10             OrdinalEncoder  0.651874
11                 SumEncoder  0.652847
12          PolynomialEncoder  0.663764
13              TargetEncoder  0.628580
14                 WOEEncoder  0.619165
                         Name       auc
0   BackwardDifferenceEncoder  0.700451
1                BaseNEncoder  0.717822
2               BinaryEncoder  0.695014
3             CatBoostEncoder  0.717822
4              HashingEncoder  0.573064
5              HelmertEncoder  0.736165
6           JamesSteinEncoder  0.724275
7          LeaveOneOutEncoder  0.738154


In [125]:
auc_res = []
for e in enc:
    auc,_, _,_ = train_fast(df, 19, 'romantic', e)
    auc_res.append(auc)

In [126]:
pd.DataFrame({"encoder": ['BackwardDifferenceEncoder',
            'BaseNEncoder',
            'BinaryEncoder',
            'CatBoostEncoder',
             'HashingEncoder',
             'HelmertEncoder',
             'JamesSteinEncoder',
             'LeaveOneOutEncoder',
             'MEstimateEncoder',
             'OneHotEncoder',
             'OrdinalEncoder',
             'SumEncoder',
             'PolynomialEncoder',
             'TargetEncoder',
             'WOEEncoder'], 
             "auc": auc_res})

Unnamed: 0,encoder,auc
0,BackwardDifferenceEncoder,0.827959
1,BaseNEncoder,0.82584
2,BinaryEncoder,0.876437
3,CatBoostEncoder,0.79972
4,HashingEncoder,0.648465
5,HelmertEncoder,0.809558
6,JamesSteinEncoder,0.836771
7,LeaveOneOutEncoder,0.811583
8,MEstimateEncoder,0.804641
9,OneHotEncoder,0.837627


In [132]:
auc_res = []
for e in enc:
    auc,_, _,_ = train_fast(df, 19, 'romantic', e)
    auc_res.append(auc)

In [134]:
pd.DataFrame({"encoder": ['BackwardDifferenceEncoder',
            'BaseNEncoder',
            'BinaryEncoder',
            'CatBoostEncoder',
             'HashingEncoder',
             'HelmertEncoder',
             'JamesSteinEncoder',
             'LeaveOneOutEncoder',
             'MEstimateEncoder',
             'OneHotEncoder',
             'OrdinalEncoder',
             'SumEncoder',
             'PolynomialEncoder',
             'TargetEncoder',
             'WOEEncoder'], 
             "auc": auc_res})

Unnamed: 0,encoder,auc
0,BackwardDifferenceEncoder,0.831567
1,BaseNEncoder,0.858728
2,BinaryEncoder,0.829732
3,CatBoostEncoder,0.824604
4,HashingEncoder,0.627361
5,HelmertEncoder,0.85301
6,JamesSteinEncoder,0.817299
7,LeaveOneOutEncoder,0.848909
8,MEstimateEncoder,0.837989
9,OneHotEncoder,0.846375


In [220]:
encoder = ce.OneHotEncoder(cols=columns)
X_train, X_test, y_train, y_test = train_test_split(df.drop(['romantic'], axis=1),df['romantic'], test_size = 0.2, random_state = 666)
X_y_train = X_train.copy()
X_y_train['romantic'] = y_train
X_test_trans = encoder.fit_transform(X_test)

In [221]:
auc,model, _,_ = train_fast(X_y_train, 19, 'romantic', ce.OneHotEncoder(cols=columns))

In [222]:
X_test_trans_xgb = xgb.DMatrix(X_test_trans)
y_pred = model.predict(X_test_trans_xgb)

In [223]:
roc_auc_score(y_test, y_pred)

0.5726495726495726

In [210]:
X_train, X_test, y_train, y_test = train_test_split(df_dummies.drop(['romantic'], axis=1),
                                                    df['romantic'], test_size = 0.2, random_state = 666)
X_y_train = X_train.copy()
X_y_train['romantic'] = y_train
#X_test_trans = encoder.fit_transform(X_test)

In [211]:
auc,model, _,_ = train_fast(X_y_train, 19, 'romantic')

In [212]:
X_test_xgb = xgb.DMatrix(X_test)
y_pred = model.predict(X_test_xgb)

In [213]:
roc_auc_score(y_test, y_pred)

0.756184103811841