In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import category_encoders as ce
import xgboost as xgb
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw student table and discard the 'Unnamed: 0' column
# (presumably an index that was saved with the CSV -- confirm against the file).
df_0 = pd.read_csv('./students-all.csv').drop(columns='Unnamed: 0')

In [7]:
# Work on a new frame so df_0 stays untouched; the target becomes 1 for 'yes'
# and 0 for every other value.
df = df_0.assign(romantic=(df_0['romantic'] == 'yes').astype(int))
# Features are every column except the target; the target is kept as a one-column frame.
X = df.drop(columns='romantic')
y = df[['romantic']]
X.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,major
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,4,3,4,1,1,3,6,5,6,6,mat
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,5,3,3,1,1,3,4,5,5,6,mat
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,4,3,2,2,3,3,10,7,8,10,mat
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15,mat
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,4,3,2,1,2,5,4,6,10,10,mat


In [5]:
# Columns to be expanded by the Helmert encoder; any column not listed here
# is left to the encoder's pass-through behaviour.
# NOTE(review): 'Dalc' is listed but 'Walc' is not -- confirm this is intentional.
helmert_cols = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
    'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
    'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
    'higher', 'internet', 'famrel', 'freetime', 'goout', 'Dalc',
    'health', 'major',
]
encoder = ce.HelmertEncoder(cols=helmert_cols)

In [8]:
# Learn the category mappings from df, then apply them to that same frame.
encoder.fit(df)
df_encoded = encoder.transform(df)
# Inspect the resulting column names.
df_encoded.columns

Index(['intercept', 'school_0', 'sex_0', 'age', 'address_0', 'famsize_0',
       'Pstatus_0', 'Medu_0', 'Medu_1', 'Medu_2', 'Medu_3', 'Fedu_0', 'Fedu_1',
       'Fedu_2', 'Fedu_3', 'Mjob_0', 'Mjob_1', 'Mjob_2', 'Mjob_3', 'Fjob_0',
       'Fjob_1', 'Fjob_2', 'Fjob_3', 'reason_0', 'reason_1', 'reason_2',
       'guardian_0', 'guardian_1', 'traveltime_0', 'traveltime_1',
       'traveltime_2', 'studytime_0', 'studytime_1', 'studytime_2',
       'failures_0', 'failures_1', 'failures_2', 'schoolsup_0', 'famsup_0',
       'paid_0', 'activities_0', 'nursery_0', 'higher_0', 'internet_0',
       'romantic', 'famrel_0', 'famrel_1', 'famrel_2', 'famrel_3',
       'freetime_0', 'freetime_1', 'freetime_2', 'freetime_3', 'goout_0',
       'goout_1', 'goout_2', 'goout_3', 'Dalc_0', 'Dalc_1', 'Dalc_2', 'Dalc_3',
       'Walc', 'health_0', 'health_1', 'health_2', 'health_3', 'absences',
       'G1', 'G2', 'G3', 'major_0'],
      dtype='object')

In [10]:
def train_fast(df, label_col="romantic", seed=None):
    """Shuffle ``df``, split it 70/15/15, train an XGBoost classifier and score it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the features together with the target column. Every
        feature column must already be in a form ``xgb.DMatrix`` accepts.
    label_col : str, default "romantic"
        Name of the binary target column inside ``df``.
    seed : int or None, default None
        When given, seeds both the row shuffle and XGBoost so a run can be
        reproduced. ``None`` keeps the unseeded shuffle.

    Returns
    -------
    tuple
        ``(accuracy, booster)`` where ``accuracy`` is the fraction of test
        rows whose rounded predicted probability equals the label, and
        ``booster`` is the trained ``xgb.Booster``.

    Raises
    ------
    ValueError
        If ``df`` has too few rows for all three splits to be non-empty.
    """
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_rows = len(shuffled)
    train_end = round(0.7 * n_rows)
    val_end = round(0.85 * n_rows)

    # Start at row 0: slicing from 1 silently discarded one shuffled sample.
    train = shuffled.iloc[:train_end]
    validation = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]
    if train.empty or validation.empty or test.empty:
        raise ValueError(
            "train_fast needs enough rows for non-empty train/validation/test splits"
        )

    def to_dmatrix(part):
        # Pair each split's features with that same split's labels.
        return xgb.DMatrix(data=part.drop(columns=[label_col]), label=part[label_col])

    dtrain = to_dmatrix(train)
    dval = to_dmatrix(validation)
    # The test matrix must carry the test labels; attaching the validation
    # labels gives a label vector whose length does not match the data.
    dtest = to_dmatrix(test)

    # NOTE(review): 'ndcg' is a ranking metric; for a binary classifier
    # 'logloss', 'error' or 'auc' is the usual choice. Left unchanged so the
    # logged numbers stay comparable with earlier runs -- confirm intent.
    param = {'max_depth': 12, 'eta': 1, 'objective': 'binary:logistic', 'eval_metric': 'ndcg'}
    if seed is not None:
        param['seed'] = seed
    evallist = [(dval, 'eval'), (dtrain, 'train')]
    num_round = 200
    bst = xgb.train(param, dtrain, num_round, evallist)

    # No early stopping is configured, so every boosted round is used; the
    # ntree_limit / best_ntree_limit API is no longer available in XGBoost 2.x.
    ypred = bst.predict(dtest)
    label_test = test[label_col].to_numpy()
    ret = float(np.mean(np.round(ypred) == label_test))

    return ret, bst

In [11]:
# Train on the Helmert-encoded frame. The previous argument, `df_dummy`, is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
# NOTE(review): df_encoded is the only numeric frame with a `romantic` column
# available at this point -- confirm it is the intended input.
acc, bst = train_fast(df_encoded)

[0]	eval-ndcg:0.74373	train-ndcg:0.98381
[1]	eval-ndcg:0.78749	train-ndcg:0.99824
[2]	eval-ndcg:0.82278	train-ndcg:0.99986
[3]	eval-ndcg:0.83140	train-ndcg:0.99999
[4]	eval-ndcg:0.81082	train-ndcg:1.00000
[5]	eval-ndcg:0.87302	train-ndcg:1.00000
[6]	eval-ndcg:0.87764	train-ndcg:1.00000
[7]	eval-ndcg:0.87545	train-ndcg:1.00000
[8]	eval-ndcg:0.87371	train-ndcg:1.00000
[9]	eval-ndcg:0.86285	train-ndcg:1.00000
[10]	eval-ndcg:0.86408	train-ndcg:1.00000
[11]	eval-ndcg:0.87232	train-ndcg:1.00000
[12]	eval-ndcg:0.87412	train-ndcg:1.00000
[13]	eval-ndcg:0.87433	train-ndcg:1.00000
[14]	eval-ndcg:0.87592	train-ndcg:1.00000
[15]	eval-ndcg:0.88086	train-ndcg:1.00000
[16]	eval-ndcg:0.88491	train-ndcg:1.00000
[17]	eval-ndcg:0.88780	train-ndcg:1.00000
[18]	eval-ndcg:0.88866	train-ndcg:1.00000
[19]	eval-ndcg:0.89061	train-ndcg:1.00000
[20]	eval-ndcg:0.89009	train-ndcg:1.00000
[21]	eval-ndcg:0.89001	train-ndcg:1.00000
[22]	eval-ndcg:0.88634	train-ndcg:1.00000
[23]	eval-ndcg:0.88671	train-ndcg:1.00000
[2

[194]	eval-ndcg:0.88456	train-ndcg:1.00000
[195]	eval-ndcg:0.88441	train-ndcg:1.00000
[196]	eval-ndcg:0.88499	train-ndcg:1.00000
[197]	eval-ndcg:0.88512	train-ndcg:1.00000
[198]	eval-ndcg:0.88531	train-ndcg:1.00000
[199]	eval-ndcg:0.88509	train-ndcg:1.00000


In [12]:
# Accuracy returned by train_fast: share of rows whose rounded prediction equals the label.
acc

0.7579617834394905

In [13]:
# Indicator-encode the raw table, dropping the first level of each encoded column;
# the yes/no target ends up as the single column `romantic_yes`.
df = pd.get_dummies(data=df_0, drop_first=True)

In [14]:
def train_fast(df, label_col="romantic_yes", seed=None):
    """Shuffle ``df``, split it 70/15/15, train an XGBoost classifier and score it.

    This definition replaces any earlier ``train_fast`` in the kernel; it
    differs only in the default target column (``romantic_yes``, as produced
    by ``pd.get_dummies``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the features together with the target column. Every
        feature column must already be in a form ``xgb.DMatrix`` accepts.
    label_col : str, default "romantic_yes"
        Name of the binary target column inside ``df``.
    seed : int or None, default None
        When given, seeds both the row shuffle and XGBoost so a run can be
        reproduced. ``None`` keeps the unseeded shuffle.

    Returns
    -------
    tuple
        ``(accuracy, booster)`` where ``accuracy`` is the fraction of test
        rows whose rounded predicted probability equals the label, and
        ``booster`` is the trained ``xgb.Booster``.

    Raises
    ------
    ValueError
        If ``df`` has too few rows for all three splits to be non-empty.
    """
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n_rows = len(shuffled)
    train_end = round(0.7 * n_rows)
    val_end = round(0.85 * n_rows)

    # Start at row 0: slicing from 1 silently discarded one shuffled sample.
    train = shuffled.iloc[:train_end]
    validation = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]
    if train.empty or validation.empty or test.empty:
        raise ValueError(
            "train_fast needs enough rows for non-empty train/validation/test splits"
        )

    def to_dmatrix(part):
        # Pair each split's features with that same split's labels.
        return xgb.DMatrix(data=part.drop(columns=[label_col]), label=part[label_col])

    dtrain = to_dmatrix(train)
    dval = to_dmatrix(validation)
    # The test matrix must carry the test labels; attaching the validation
    # labels gives a label vector whose length does not match the data.
    dtest = to_dmatrix(test)

    # NOTE(review): 'ndcg' is a ranking metric; for a binary classifier
    # 'logloss', 'error' or 'auc' is the usual choice. Left unchanged so the
    # logged numbers stay comparable with earlier runs -- confirm intent.
    param = {'max_depth': 12, 'eta': 1, 'objective': 'binary:logistic', 'eval_metric': 'ndcg'}
    if seed is not None:
        param['seed'] = seed
    evallist = [(dval, 'eval'), (dtrain, 'train')]
    num_round = 200
    bst = xgb.train(param, dtrain, num_round, evallist)

    # No early stopping is configured, so every boosted round is used; the
    # ntree_limit / best_ntree_limit API is no longer available in XGBoost 2.x.
    ypred = bst.predict(dtest)
    label_test = test[label_col].to_numpy()
    ret = float(np.mean(np.round(ypred) == label_test))

    return ret, bst

# Train on the indicator-encoded frame, keep the booster, and display the accuracy.
acc, bst = train_fast(df)
acc

[0]	eval-ndcg:0.86535	train-ndcg:0.97971
[1]	eval-ndcg:0.89491	train-ndcg:0.99571
[2]	eval-ndcg:0.83925	train-ndcg:0.99916
[3]	eval-ndcg:0.88733	train-ndcg:0.99993
[4]	eval-ndcg:0.88908	train-ndcg:1.00000
[5]	eval-ndcg:0.88620	train-ndcg:1.00000
[6]	eval-ndcg:0.89407	train-ndcg:1.00000
[7]	eval-ndcg:0.89961	train-ndcg:1.00000
[8]	eval-ndcg:0.91051	train-ndcg:1.00000
[9]	eval-ndcg:0.90777	train-ndcg:1.00000
[10]	eval-ndcg:0.90282	train-ndcg:1.00000
[11]	eval-ndcg:0.90298	train-ndcg:1.00000
[12]	eval-ndcg:0.91389	train-ndcg:1.00000
[13]	eval-ndcg:0.90937	train-ndcg:1.00000
[14]	eval-ndcg:0.90721	train-ndcg:1.00000
[15]	eval-ndcg:0.91354	train-ndcg:1.00000
[16]	eval-ndcg:0.90805	train-ndcg:1.00000
[17]	eval-ndcg:0.90704	train-ndcg:1.00000
[18]	eval-ndcg:0.90388	train-ndcg:1.00000
[19]	eval-ndcg:0.90206	train-ndcg:1.00000
[20]	eval-ndcg:0.90673	train-ndcg:1.00000
[21]	eval-ndcg:0.89741	train-ndcg:1.00000
[22]	eval-ndcg:0.89532	train-ndcg:1.00000
[23]	eval-ndcg:0.90660	train-ndcg:1.00000
[2

[194]	eval-ndcg:0.91654	train-ndcg:1.00000
[195]	eval-ndcg:0.91638	train-ndcg:1.00000
[196]	eval-ndcg:0.91588	train-ndcg:1.00000
[197]	eval-ndcg:0.91582	train-ndcg:1.00000
[198]	eval-ndcg:0.91646	train-ndcg:1.00000
[199]	eval-ndcg:0.91649	train-ndcg:1.00000


0.6751592356687898