In [80]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

import catboost as cb
import eli5

In [81]:
# Load the three CSV inputs; the paths are relative to the notebook's working directory.
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
SampleSubmission = pd.read_csv('SampleSubmission.csv')

In [82]:
# Identify the target: every column that exists in `train` but not in `test`.
target_cols = [col for col in train.columns.values if col not in test.columns.values]

# Keep the labels in their own frame, then remove the 'target' column from
# the training features.
target = train[target_cols]
train.drop(columns=['target'], inplace=True)


### helper functions

In [83]:
# Helper functions

def group_by_name(df, name, starts_with = False):
    """Collect the column labels of ``df`` that match ``name``.

    Parameters
    ----------
    df : object exposing ``.columns``
        Frame whose column labels are scanned, in order.
    name : str
        Text to look for in each label.
    starts_with : bool, default False
        When True a label matches only if it begins with ``name``;
        otherwise ``name`` may occur anywhere in the label.

    Returns
    -------
    list
        Matching labels in their original column order.
    """
    if starts_with:
        return [label for label in df.columns if label.startswith(name)]
    return [label for label in df.columns if name in label]

def analyse_group(df):
    """Print a short summary of the columns of ``df``.

    For every object-dtype column the number of distinct values (as counted
    by ``value_counts``) and the distinct values themselves are printed.
    A footer then reports how many columns there are in total, how many are
    object-dtype ("categorical") and how many are anything else ("numerical").

    Nothing is returned; the function only writes to stdout.
    """
    all_columns = []
    object_columns = []
    other_columns = []

    for column in df.columns:
        all_columns.append(column)
        series = df[column]
        if series.dtype != 'O':
            other_columns.append(column)
            continue
        object_columns.append(column)
        n_classes = len(series.value_counts())
        print(f'There are {n_classes} Classes in: ' + column)
        print(f'They are {series.unique()}')

    print('----------------------------------')
    print(f'We have {len(all_columns)} features')
    print(f'We have {len(object_columns)} categorical features')
    print(f'We have {len(other_columns)} numerical features')

def count_class(data):
    """Print, for each object-dtype column of ``data``, its number of distinct values.

    The count is the length of ``value_counts()`` for the column. Columns of
    any other dtype are skipped. Nothing is returned.
    """
    object_columns = (name for name in data.columns if data[name].dtype == 'O')
    for name in object_columns:
        n_classes = len(data[name].value_counts())
        print(f'There are {n_classes} Classes in: ' + name)
            

## Preprocessing

In [84]:
# Columns of `train` whose name contains 'date' are removed from both frames.
date_cols = group_by_name(train, 'date')

for frame in (train, test):
    frame.drop(columns=date_cols, inplace=True)

# consolidate the languages columns into 1
def sort_lang(df):
    """Merge the two language columns into a single list of values.

    For each row the value of ``child_languages`` is used when it is present;
    otherwise the value of ``language_child`` is used. When both are missing
    the (missing) ``language_child`` value is kept, so the row stays missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``child_languages`` and ``language_child``.

    Returns
    -------
    list
        One entry per row of ``df``, in row order.

    Raises
    ------
    KeyError
        If either of the two columns is absent from ``df``.
    """
    primary = df['child_languages']
    fallback = df['language_child']
    # Vectorised coalesce. `notna()` recognises NaN, None and pd.NA alike,
    # instead of comparing the string form of every cell against 'nan'.
    return primary.where(primary.notna(), fallback).tolist()
        
# Replace the two source language columns with the merged 'languages' column
# in both frames.
for frame in (train, test):
    frame['languages'] = sort_lang(frame)
    frame.drop(columns=['child_languages', 'language_child'], inplace=True)

# count cols
# registered children statistics of the ECD programme the child is registered in
count_cols = group_by_name(train, 'count')

# Fill null values for the count columns with 0.
# The result is assigned back to the frame rather than calling
# `frame[col].fillna(0, inplace=True)`: that form acts on a selected column,
# and under pandas Copy-on-Write it leaves the frame itself unchanged.
for frame in (train, test):
    for col in count_cols:
        frame[col] = frame[col].fillna(0)

In [85]:
# Select features according to their correlation with the target.
train['target'] = target

# Absolute correlation of each numeric/boolean column with the target,
# strongest first. Non-numeric columns are excluded explicitly because
# DataFrame.corr() raises on them in pandas >= 2.0 instead of skipping them.
numeric_train = train.select_dtypes(include=['number', 'bool'])
train_corr = numeric_train.corr()['target'].abs().sort_values(ascending=False)

# Remove the target itself by label (not by position, which assumed it always
# sorts first), then keep the features whose correlation is above the
# threshold of 0. NaN correlations fail the comparison and are dropped too.
feature_corr = train_corr.drop('target')
features = feature_corr[feature_corr > 0].index.to_list()

print(len(features))

# Restrict both frames to the selected features; this also removes 'target'
# from `train` again.
train = train[features]
test  = test[features]

# train['target'] = target


159


In [86]:
# Replace every remaining missing value in both frames with an empty string.
# NOTE(review): a float column that contains NaN becomes object dtype after
# this; presumably the model treats '' as missing -- confirm, since leaving
# the NaN values in place would keep the columns numeric.
train, test = (frame.fillna('') for frame in (train, test))

In [87]:
train.head(2)

Unnamed: 0,child_observe_total,child_age,child_height,pri_fees_amount_2_3,teacher_emotional_total,id_dc_n,id_mn_n,pri_fees_amount,pri_fees_amount_0_1,pri_fees_amount_pv,...,pri_days,language_match,count_staff_gender_other,count_register_year_2013,pri_difficult_walk,count_register_year_school,count_register_year_2014,count_register_race_other,count_register_race_coloured,obs_cooking_4
0,4.0,59.0,,,,134.0,107.0,,,,...,,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,
1,4.0,60.163933,103.0,,7.0,367.0,,,,,...,,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,


In [88]:
test.head(2)

Unnamed: 0,child_observe_total,child_age,child_height,pri_fees_amount_2_3,teacher_emotional_total,id_dc_n,id_mn_n,pri_fees_amount,pri_fees_amount_0_1,pri_fees_amount_pv,...,pri_days,language_match,count_staff_gender_other,count_register_year_2013,pri_difficult_walk,count_register_year_school,count_register_year_2014,count_register_race_other,count_register_race_coloured,obs_cooking_4
0,11.0,57.0,108.0,,12.0,296.0,74.0,300.0,,320.27182,...,5.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,
1,11.0,54.0,105.0,,12.0,13.0,13.0,250.0,,266.8932,...,5.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,


In [89]:
# Hold out 15% of the training rows (shuffled, fixed seed) for validation.
# `train` holds the selected feature columns and `target` the labels.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size = 0.15,
    random_state = 42,
    shuffle = True
)

In [90]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7297, 159), (1288, 159), (7297, 1), (1288, 1))

In [91]:
# Wrap the two halves of the hold-out split in CatBoost pools.
train_dataset = cb.Pool(data=X_train, label=y_train)
test_dataset = cb.Pool(data=X_test, label=y_test)

# Up to 30000 boosting rounds, with progress logged every 300 iterations.
model = cb.CatBoostRegressor(
    iterations=30000,
    learning_rate=0.1,
    random_seed=123,
    verbose=300,
)

# The hold-out pool drives early stopping (300 rounds without improvement)
# and the best iteration on it is the one that is kept.
model.fit(
    train_dataset,
    eval_set=test_dataset,
    use_best_model=True,
    early_stopping_rounds=300,
)

# RMSE on the hold-out rows. These are the same rows that were used for
# early stopping, so the figure is not an independent estimate.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

0:	learn: 14.6243698	test: 15.1406031	best: 15.1406031 (0)	total: 24.2ms	remaining: 12m 6s
300:	learn: 8.5675488	test: 10.2117615	best: 10.2117615 (300)	total: 2.44s	remaining: 4m
600:	learn: 7.3462701	test: 9.9971037	best: 9.9878644 (585)	total: 4.85s	remaining: 3m 57s
900:	learn: 6.4787989	test: 9.9223070	best: 9.9182507 (881)	total: 6.88s	remaining: 3m 42s
1200:	learn: 5.8346944	test: 9.9433732	best: 9.9015918 (975)	total: 8.82s	remaining: 3m 31s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 9.901591763
bestIteration = 975

Shrink model to first 976 iterations.
9.901591766064161


In [92]:
# Compute SHAP values for the fitted model on the `test` frame.
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
import shap
expaliner = shap.TreeExplainer(model)
# Presumably one row of per-feature SHAP values per row of `test` -- the loop
# further down iterates over it row by row; confirm the returned shape.
shap_values = expaliner.shap_values(test)

In [93]:
# features = X_test.columns
# len(f['f1'])

In [94]:
# One empty list per rank ('f1' .. 'f15'); each list is filled with the name
# of the feature that holds that rank, one entry per explained row.
f = {f'f{rank}': [] for rank in range(1, 16)}

In [95]:
# For every explained row, record the names of the 15 features with the
# highest SHAP values: 'f1' holds the top feature of each row, 'f2' the
# second, and so on.
#
# The accumulator is rebuilt here so that this cell is idempotent. Appending
# to lists filled by an earlier run would double their length and break the
# later assignment of these lists as submission columns.
TOP_K = 15
f = {f'f{rank}': [] for rank in range(1, TOP_K + 1)}

for shap_value in shap_values:
    # argsort is ascending, so reverse it to get the largest values first.
    # NOTE(review): the ranking uses signed SHAP values, so strongly negative
    # contributions rank last -- confirm that absolute magnitude is not intended.
    top_indices = np.argsort(shap_value)[::-1][:TOP_K]
    for rank, feature_idx in enumerate(top_indices, start=1):
        f[f'f{rank}'].append(features[feature_idx])

In [96]:
# Make predictions on the test set
# (`test` was restricted to the selected `features` columns earlier;
# the trailing bare `pred` displays the resulting array as the cell output).
pred = model.predict(test)
pred

array([56.66659939, 45.4600241 , 53.2240385 , ..., 43.46418791,
       44.06912715, 45.9195719 ])

In [97]:
# Predicted target plus, for each rank 1..15, the per-row feature name
# collected in f['f<rank>'], stored as column 'feature_<rank>'.
SampleSubmission['target'] = pred
for rank in range(1, 16):
    SampleSubmission[f'feature_{rank}'] = f[f'f{rank}']

In [98]:
SampleSubmission

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,56.666599,child_observe_total,teacher_emotional_total,ses_proxy,pri_fees_amount,count_staff_qual_skills,count_children_present,teacher_social_total,count_register_gender_female,count_register_year_grader,id_facility_n,count_toilets_children,quintile,id_team,count_staff_time_full,count_register_year_2017
1,ID_GQ6ONJ4FP,45.460024,child_observe_total,teacher_emotional_total,pri_fees_amount,count_register_year_2015,count_register_year_2020,count_children_present,quintile,teacher_social_total,count_staff_qual_skills,count_children_precovid,id_dc_n,count_staff_gender_male,count_register_year_grader,teacher_duration,obs_classrooms
2,ID_YZ76CVRW3,53.224038,child_observe_total,longitude,id_team,teacher_selfcare_total,pri_fees_amount,id_enumerator,latitude,ses_proxy,count_register_year_2019,id_prov_n,obs_classrooms,quintile,count_register_year_2021,count_register_race_coloured,pri_fees_amount_4_6
3,ID_BNINCRXH8,69.668109,child_observe_total,id_enumerator,teacher_emotional_total,child_age,id_team,teacher_duration,data_year,count_register_gender_female,child_height,pri_funding_subsidy,obs_heating_4,pri_capacity,count_children_present,id_facility_n,child_attendance
4,ID_1U7GDTLRI,44.598468,child_observe_total,id_ward,id_team,child_months_enrolment,latitude,id_facility,longitude,count_register_race_african,count_register_year_2020,count_register_year_grader,count_register_year_2016,count_register_gender,count_register_year_2017,pri_time_close_hours,pri_year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3675,ID_LBPQ2VMQZ,52.244778,child_age,ses_proxy,child_observe_total,id_ward,id_team,teacher_selfcare_total,child_height,count_register_race_african,count_children_attendance,pri_funding_subsidy,longitude,count_register_year_2017,data_year,pri_time_open_hours,count_register_year_2020
3676,ID_H2RKKMMKK,50.058959,child_age,id_enumerator,ward_best,child_height,id_team,id_ward,count_register_year_2020,id_dc_n,count_register_gender_female,pri_fees_amount_pv,count_register_year_2019,count_children_present,teacher_selfcare_total,pri_fees_amount,child_zha
3677,ID_VY8KX7YTZ,43.464188,child_observe_total,child_age,count_register_year_2018,id_ward_n,count_register_race,pri_expense_materials,obs_cooking_6,count_children_precovid,count_register_year_2020,count_register_gender_female,pri_expense_maintenance,teacher_social_total,id_facility_n,pri_time_close_hours,child_zha
3678,ID_EO2MYZ4M7,44.069127,id_enumerator,id_facility,child_height,obs_heating_1,pri_capacity,id_dc_n,pqa_score_relationships,child_zha,count_register_year_2018,count_register_year_2020,longitude,count_register_year_grader,pra_experience,id_ward_n,count_children_present


In [99]:
# Write the completed submission frame to 'submission.csv'; index=False leaves out the row index.
SampleSubmission.to_csv('submission.csv', index=False)
