In [7]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

import catboost as cb
import eli5

In [8]:
# Read the three CSV inputs from the current working directory.
# The generator is consumed left to right, so the files are read in the
# order in which their names are listed.
train, test, SampleSubmission = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [9]:
# Sort every column of `train` except the first into one of two buckets by
# its missing-value count:
#   * fewer than 6000 nulls -> `features` (object-dtype columns are also
#     recorded in `cat_features` and their number of distinct values printed)
#   * 6000 nulls or more    -> `not_features`
features, cat_features, not_features = [], [], []
for col in train.columns[1:]:
    if train[col].isnull().sum() >= 6000:
        not_features.append(col)
        continue
    features.append(col)
    if train[col].dtype == 'O':
        cat_features.append(col)
        # nunique() ignores NaN, exactly like len(value_counts()).
        print('There is ', train[col].nunique(), ' Class in: ', col, sep='')

# Summary of the three buckets.
print('----------------------------------')
print('We have ', len(features), ' features', sep='')
print('We have ', len(cat_features), ' categorical features', sep='')
print('We have ', len(not_features), ' features that have more than 6000 of missing values', sep='')

There is 254 Class in: child_date
There is 535 Class in: child_enrolment_date
There is 4 Class in: child_grant
There is 4 Class in: child_years_in_programme
There is 4 Class in: child_observe_attentive
There is 4 Class in: child_observe_concentrated
There is 4 Class in: child_observe_diligent
There is 4 Class in: child_observe_interested
There is 2 Class in: child_gender
There is 1018 Class in: child_dob
There is 3 Class in: child_stunted
There is 4 Class in: child_age_group
There is 153 Class in: id_mn_best
There is 10 Class in: prov_best
There is 50 Class in: id_dc_best
There is 50 Class in: dc_best
There is 153 Class in: mn_best
There is 5 Class in: pra_free_play
There is 4 Class in: pra_free_play_outdoor
There is 31 Class in: pra_groupings
There is 2 Class in: pra_groupings_1
There is 2 Class in: pra_groupings_2
There is 2 Class in: pra_groupings_3
There is 2 Class in: pra_groupings_4
There is 2 Class in: pra_groupings_5
There is 3 Class in: pra_engaged
There is 4 Class in: pra_age

In [10]:
# Restrict both frames to the retained columns. `test` is indexed with every
# entry of `features` except the last one.
# NOTE(review): this presumably relies on the label column being the last
# entry of `features` and being absent from `test` — confirm.
train = train.loc[:, features]
test = test.loc[:, features[:-1]]

In [11]:
# Range of the `target` column as a (max, min) pair.
# Series.max()/min() are vectorised and skip NaN, whereas the builtin
# max()/min() iterate in Python and return order-dependent results when a
# NaN is present (this cell runs before missing values are filled).
# float() keeps the displayed values plain Python floats.
float(train['target'].max()), float(train['target'].min())

(96.80999755859376, 6.369999885559082)

In [12]:
# Replace every missing value in both frames with an empty string.
# NOTE(review): the fill is applied to all columns, not only the ones listed
# in `cat_features`; numeric columns that held NaN presumably become
# object dtype as a result — confirm this is intended.
train, test = (frame.fillna('') for frame in (train, test))

In [13]:
# Hold out 15% of the rows for validation. The predictors are every column
# of `train` but the last; the label is the last column. The split is
# shuffled with a fixed seed so it is repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1],
    test_size=0.15,
    shuffle=True,
    random_state=42,
)

In [14]:
# Shapes of the four pieces produced by the split, in the order
# (xtrain, xvalid, ytrain, yvalid).
tuple(part.shape for part in (xtrain, xvalid, ytrain, yvalid))

((7297, 276), (1288, 276), (7297,), (1288,))

In [15]:
# Fit a CatBoost regressor on the training split, monitor it on the
# validation split, then predict for the validation and test rows.

# Test predictors: the columns of `test` named by all but the last entry of
# `features`.
xtest = test[features[:-1]]
# Wrap both splits in Pools, passing the columns in `cat_features` as
# categorical.
train_dataset = cb.Pool(data = xtrain, label = ytrain, cat_features=cat_features)
val_dataset   = cb.Pool(data = xvalid, label = yvalid, cat_features=cat_features)
# Fixed seed; verbose=300 prints a progress line every 300 iterations.
model         = cb.CatBoostRegressor(random_seed=123, verbose=300)
# use_best_model=True keeps the iterations up to the best score on
# `val_dataset`; early_stopping_rounds=300 stops training once that score has
# not improved for 300 iterations.
model.fit(train_dataset, eval_set=val_dataset, use_best_model=True, early_stopping_rounds=300)
# Predictions are made from the DataFrames directly, not from the Pools.
preds_valid = model.predict(xvalid)
preds_test  = model.predict(xtest)
# Root mean squared error on the validation split.
# NOTE(review): this is the same split that drove best-iteration selection
# above, so the figure is likely optimistic as a generalisation estimate.
print(np.sqrt(mean_squared_error(yvalid, preds_valid)))

Learning rate set to 0.069568
0:	learn: 14.7538043	test: 15.2395101	best: 15.2395101 (0)	total: 315ms	remaining: 5m 14s
300:	learn: 9.3296665	test: 10.2722845	best: 10.2722845 (300)	total: 1m 3s	remaining: 2m 26s
600:	learn: 8.6998574	test: 10.0931084	best: 10.0924920 (593)	total: 2m 8s	remaining: 1m 25s
900:	learn: 8.1883840	test: 9.9827398	best: 9.9814163 (898)	total: 3m 15s	remaining: 21.5s
999:	learn: 8.0353089	test: 9.9489217	best: 9.9487869 (996)	total: 3m 36s	remaining: 0us

bestTest = 9.948786882
bestIteration = 996

Shrink model to first 997 iterations.
9.94400797842765


In [16]:
# Compute SHAP values of the fitted model for the test predictors.
# NOTE(review): this import sits mid-notebook; the other imports live in the
# first cell.
import shap
# NOTE(review): `expaliner` is a misspelling of "explainer"; the name is left
# unchanged in case other cells refer to it.
expaliner = shap.TreeExplainer(model)
# Later cells iterate over `shap_values` row by row and index the columns of
# `xtest` by position, so this is presumably a 2-D array with one row per
# test row and one column per feature — TODO confirm.
shap_values = expaliner.shap_values(xtest)

In [17]:
# Rebind `features` to the column labels of `xtest` so that a column position
# can be turned back into a feature name.
# NOTE(review): this overwrites the list built during column selection. After
# this cell has run, re-running an earlier cell that evaluates
# `features[:-1]` would drop one more column — restart the kernel before
# re-running from the top.
features = xtest.columns

In [18]:
# One empty list per rank, keyed 'f1' .. 'f15' in that order. Each key gets
# its own list object.
f = {f'f{rank}': [] for rank in range(1, 16)}

In [19]:
# For every row of `shap_values`, record the names of the (up to) 15 features
# with the largest SHAP values: f['f1'] holds each row's top feature,
# f['f2'] the runner-up, and so on.
#
# The per-rank lists are assigned rather than appended to, so re-running
# this cell rebuilds them instead of growing them past the number of rows.
#
# NOTE(review): the ranking uses the signed SHAP value (largest positive
# contribution first), not its magnitude. If "most influential" is meant,
# rank on np.abs(...) instead — confirm against the submission spec.
top_k = 15
shap_matrix = np.asarray(shap_values)
feature_names = np.asarray(features)
# argsort is ascending, so reverse each row before keeping the first columns.
top_idx = np.argsort(shap_matrix, axis=1)[:, ::-1][:, :top_k]
# top_idx has fewer than `top_k` columns when there are fewer features.
for rank in range(top_idx.shape[1]):
    f[f'f{rank + 1}'] = feature_names[top_idx[:, rank]].tolist()

In [20]:
# Fill the submission frame: the test predictions go into 'target', then the
# per-rank feature-name lists f['f1'] .. f['f15'] go into the columns
# 'feature_1' .. 'feature_15', in that order.
SampleSubmission['target'] = preds_test
for rank in range(1, 16):
    SampleSubmission[f'feature_{rank}'] = f[f'f{rank}']

In [21]:
# Display the assembled submission frame as the cell's output.
SampleSubmission

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,58.008504,child_observe_diligent,child_observe_attentive,teacher_emotional_total,child_observe_total,ses_cat,child_observe_interested,teacher_emotional_independent,teacher_emotional_selfstarter,ses_proxy,child_gender,pri_fees_amount_pv,child_years_in_programme,id_enumerator,teacher_social_peers,child_age
1,ID_GQ6ONJ4FP,45.064949,teacher_emotional_total,child_observe_total,child_observe_attentive,child_observe_diligent,child_observe_concentrated,child_observe_interested,teacher_emotional_independent,pri_attendance,child_gender,teacher_emotional_selfstarter,pri_fees_amount_pv,id_facility,pri_fees_amount,count_children_present,teacher_emotional_confidence
2,ID_YZ76CVRW3,50.703794,id_enumerator,child_observe_diligent,id_prov,id_mn_best,child_observe_attentive,child_date,longitude,prov_best,mn_best,id_team,teacher_emotional_independent,teacher_emotional_confidence,pri_fees_amount_pv,latitude,child_observe_total
3,ID_BNINCRXH8,68.998314,child_observe_diligent,id_enumerator,prov_best,child_observe_concentrated,child_observe_total,child_age,child_observe_attentive,teacher_emotional_total,child_stunted,teacher_emotional_selfstarter,child_years_in_programme,child_observe_interested,pri_language_97,id_mn_best,teacher_social_assistance
4,ID_1U7GDTLRI,45.072941,child_observe_diligent,child_observe_interested,id_team,child_months_enrolment,pri_attendance,longitude,id_prov,child_observe_total,teacher_social_assistance,teacher_emotional_understand,child_observe_attentive,id_ward,count_register_year_2020,teacher_emotional_independent,pri_registered_programme
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,54.419678,child_age,child_age_group,child_observe_interested,child_observe_attentive,child_years_in_programme,teacher_emotional_selfstarter,prov_best,id_prov,id_mn_best,language_assessment,child_date,ses_proxy,teacher_emotional_independent,teacher_emotional_confidence,id_ward
3676,ID_H2RKKMMKK,50.984138,child_age_group,child_age,child_height,id_prov,ward_best,child_date,id_enumerator,id_ward,id_mn_best,child_gender,id_team,pri_meal_2,pri_fees_amount_pv,teacher_emotional_selfstarter,pri_meal
3677,ID_VY8KX7YTZ,44.136604,child_observe_diligent,child_observe_total,child_observe_attentive,child_age,child_observe_concentrated,obs_materials,child_observe_interested,pri_attendance,child_gender,teacher_emotional_confidence,pri_meal_2,pri_meals,teacher_emotional_selfstarter,obs_materials_11,obs_firstaid
3678,ID_EO2MYZ4M7,37.314030,id_enumerator,child_gender,id_mn_best,count_register_year_2020,longitude,language_child,child_zha,mn_best,teacher_emotional_appropriate,pri_meal_2,pri_language_7,id_facility,obs_handwashing_1,count_children_present,obs_water_running


In [22]:
# Write the submission frame to 'submission.csv' in the working directory;
# index=False leaves the row index out of the file.
SampleSubmission.to_csv('submission.csv', index=False)
