# Learning

In [1]:
import pandas as pd
import numpy as np

In [6]:
test_df = pd.read_csv('../input/ml-project/test_df.csv')
train_df = pd.read_csv('../input/ml-project/train_df.csv')

In [30]:
from sklearn.ensemble import RandomForestClassifier

embed_columns = [x for x in train_df.columns if x.startswith('embed')]
x_train, y_train = train_df[embed_columns], train_df['higher_education']
x_test, y_test = test_df[embed_columns], test_df['higher_education']



In [None]:
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [46]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_proba= clf.predict_proba(x_test)
y_pred = clf.predict(x_test)
print(f'''accuracy: {clf.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}''')

accuracy: 0.772591262716936 
      f1: 0.8579970104633783, 
      precision: 0.9258064516129032


In [49]:
#did not get good results with these params
# params = {'n_estimators':[40, 70, 100, 150],
#          'max_depth':[ 3, 5, 7],
#          'max_features':[20, 40, 60, 80]}
# clf = RandomForestClassifier()
# gs = GridSearchCV(clf, params, scoring = 'f1', cv = 3, verbose = 3, n_jobs =-1)
# gs.fit(x_train, y_train)

# With CatBoost embedding_features

In [16]:
from catboost import CatBoostClassifier, metrics

In [50]:
CatBoostModel_emb = CatBoostClassifier(
iterations= 500,
learning_rate = 0.05,
use_best_model = True,
eval_metric ='F1', 
#loss_function='Logloss',
random_seed = 42,
logging_level = 'Silent',
embedding_features = ['embeddings'],
depth = 5)

In [51]:
x_train, y_train = train_df[embed_columns], train_df['higher_education']
x_train_emb = pd.DataFrame(np.arange(len(x_train)))
x_train_emb['embeddings'] = x_train.values.tolist()
x_train_emb = x_train_emb.drop([0], axis = 1)
x_val, y_val = test_df[embed_columns], test_df['higher_education']
x_val_emb = pd.DataFrame(np.arange(len(x_val)))
x_val_emb['embeddings'] = x_val.values.tolist()
x_val_emb = x_val_emb.drop([0], axis = 1)

In [52]:
%%time

CatBoostModel_emb.fit(
    x_train_emb, y_train,
    eval_set=(x_val_emb, y_val),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

CPU times: user 10.6 s, sys: 400 ms, total: 11 s
Wall time: 9.86 s


<catboost.core.CatBoostClassifier at 0x7f1d4e26f3d0>

In [53]:
CatBoostModel_emb.get_best_score()

{'learn': {'Logloss': 0.5067433210222069, 'F1': 0.8451495920217589},
 'validation': {'Logloss': 0.5054655069813566, 'F1': 0.8585537918871252}}

# But without embedding_features seems to achieve better results 

In [57]:
x_train, y_train = train_df[embed_columns], train_df['higher_education']
x_test, y_test = test_df[embed_columns], test_df['higher_education']

In [58]:
CatBoostModel = CatBoostClassifier(
iterations= 500,
learning_rate = 0.05,
use_best_model = True,
eval_metric ='F1', 
loss_function='Logloss',
random_seed = 42,
logging_level = 'Silent',
depth = 5)

In [59]:
CatBoostModel.fit(
    x_train, y_train,
    eval_set=(x_test, y_test),
    plot=True
#     logging_level='Verbose',  # you can uncomment this for text output
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f1d4e266450>

In [36]:
CatBoostModel.get_best_score()

{'learn': {'Logloss': 0.3017351258348865, 'F1': 0.9396128286622365},
 'validation': {'Logloss': 0.4799109609095098, 'F1': 0.8614828209764918}}

In [60]:
y_pred = CatBoostModel.predict(x_test)
y_proba = CatBoostModel.predict_proba(x_test)

In [70]:
print(f'''accuracy: {CatBoostModel.score(x_test, y_test)} 
      f1: {f1_score(y_pred, y_test)}, 
      precision: {precision_score(y_pred, y_test)}
     roc auc : {roc_auc_score(y_test, y_proba[:,1])}''')

accuracy: 0.7707959305804907 
      f1: 0.8614828209764918, 
      precision: 0.9604838709677419
     roc auc : 0.7583208592171244
