In [256]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [257]:
# Load the competition CSV files; paths are relative to the current working directory.
# train: the training table, which includes the `target` column.
train = pd.read_csv('Train.csv')
# test: same layout as train but without `target` (see the previews below).
test = pd.read_csv('Test.csv')
# ss: sample submission template (child_id, target, feature_1 ... feature_15).
ss = pd.read_csv('SampleSubmission.csv')

In [258]:
train.shape

(8585, 679)

In [259]:
# Preview train
train.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999
2,ID_R00SN7AUD,2022.0,2022-03-11,69.0,,,,,108.400002,Often,...,,,,,,,,,,47.52
3,ID_BSSK60PAZ,2021.0,2021-10-13,53.0,2020-01-15,20.0,No,1st year in the programme,98.099998,Almost always,...,,,,,,,,,,58.599998
4,ID_IZTY6TC4D,2021.0,2021-10-13,57.0,2021-10-13,0.0,,2nd year in programme,114.0,Almost always,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,76.599998


In [260]:
# Preview test
test.head()

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_4,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7
0,ID_0I0999N6S,2021.0,2021-09-20,57.0,,,Yes,2nd year in programme,108.0,Almost always,...,,,,,,,,,,
1,ID_GQ6ONJ4FP,2021.0,2021-10-21,54.0,2021-01-10,9.0,Yes,1st year in the programme,105.0,Almost always,...,,,,,,,,,,
2,ID_YZ76CVRW3,2021.0,2021-05-17,57.0,,,Yes,,101.5,Often,...,,,,,,,,,,
3,ID_BNINCRXH8,2022.0,2022-09-09,59.334702,,,,3rd year in programme,,Almost always,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,ID_1U7GDTLRI,2021.0,2021-10-12,54.0,2021-01-15,8.0,Yes,1st year in the programme,103.5,Often,...,,,,,,,,,,


In [261]:
# Preview submission file
ss.head()

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
2,ID_YZ76CVRW3,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
3,ID_BNINCRXH8,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
4,ID_1U7GDTLRI,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [262]:
# identify target: the columns that exist in train but are absent from test
test_column_names = test.columns.values
target_cols = [col for col in train.columns.values if col not in test_column_names]

# keep the target values aside and preview them
target = train[target_cols]
target.head()

Unnamed: 0,target
0,51.5
1,55.869999
2,47.52
3,58.599998
4,76.599998


## Because of the large number of features, we systematically group them into small clusters to make feature engineering manageable

In [263]:
# Helper functions

def group_by_name(df, name, starts_with = False):
    """Return the column names of ``df`` that match ``name``.

    Parameters
    ----------
    df : DataFrame whose ``columns`` are scanned.
    name : str, the text to look for in each column name.
    starts_with : bool, default False. When True a column matches only if its
        name begins with ``name``; otherwise ``name`` may appear anywhere in it.

    Returns
    -------
    list of matching column names, in the order they appear in ``df.columns``.
    """
    if starts_with:
        is_match = lambda column: column.startswith(name)
    else:
        is_match = lambda column: name in column

    return [column for column in df.columns if is_match(column)]

def analyse_group(df):
    """Print a dtype summary for the columns of ``df``.

    For each object-dtype column, prints the number of distinct non-null
    values and the array of unique values (which may include NaN). Then prints
    the total number of columns, the number of object-dtype columns (reported
    as "categorical") and the number of all remaining columns (reported as
    "numerical"). Returns None.
    """
    object_cols = []
    other_cols = []
    for column in df.columns:
        series = df[column]
        if series.dtype != 'O':
            other_cols.append(column)
            continue
        object_cols.append(column)
        # nunique() ignores NaN, unique() does not, so the two lines can differ by one
        n_classes = series.nunique()
        print('There are ' + str(n_classes) + ' Classes in: ' + column)
        print('They are ' + str(series.unique()))

    n_total = len(object_cols) + len(other_cols)
    print('----------------------------------')
    print('We have ' + str(n_total) + ' features')
    print('We have ' + str(len(object_cols)) + ' categorical features')
    print('We have ' + str(len(other_cols)) + ' numerical features')

def count_class(data):
    """Print, for every object-dtype column of ``data``, its number of distinct non-null values."""
    object_columns = [column for column in data.columns if data[column].dtype == 'O']
    for column in object_columns:
        n_classes = data[column].nunique()
        print('There are ' + str(n_classes) + ' Classes in: ' + column)
            

In [264]:
# Select every column whose name contains the substring 'date'
# (group_by_name does a substring match unless starts_with=True is passed).
# These columns are candidates for dropping later on.
date_cols = group_by_name(train, 'date')
train[date_cols].head()


Unnamed: 0,child_date,child_enrolment_date,pqa_date,pra_date,pri_date,obs_date
0,2022-02-03,,,,,
1,,,,,,
2,2022-03-11,,,,,
3,2021-10-13,2020-01-15,,,,
4,2021-10-13,2021-10-13,2021-10-29,2021-10-29,2021-10-29,2021-10-29


In [265]:
# train.drop(date_cols,inplace=True,axis=1)

In [241]:
# child cols: every column whose name contains 'child'
# NOTE(review): this is a substring match, so it also selects columns such as
# count_children_* and language_child; pass starts_with=True to group_by_name
# if only the child_-prefixed columns are wanted.
child_cols = group_by_name(train, 'child')
child_frame = train[child_cols]
analyse_group(child_frame)

child_frame.head()

There are 8585 Classes in: child_id
They are ['ID_SYSJ2FM0D' 'ID_J5BTFOZR3' 'ID_R00SN7AUD' ... 'ID_L52DMG5D1'
 'ID_QZQAO2GKX' 'ID_Y61LX4FV3']
There are 4 Classes in: child_grant
They are [nan 'No' 'Yes' "Don't know" 'Refuse']
There are 4 Classes in: child_years_in_programme
They are [nan '1st year in the programme' '2nd year in programme'
 '3rd year in programme' 'Do Not Know']
There are 4 Classes in: child_observe_attentive
They are ['Sometimes' 'Often' 'Almost always' 'Almost never']
There are 4 Classes in: child_observe_concentrated
They are ['Sometimes' 'Almost never' 'Often' 'Almost always']
There are 4 Classes in: child_observe_diligent
They are ['Sometimes' 'Almost always' 'Often' 'Almost never']
There are 4 Classes in: child_observe_interested
They are ['Sometimes' 'Often' 'Almost always' 'Almost never']
There are 2 Classes in: child_gender
They are ['Female' 'Male']
There are 1018 Classes in: child_dob
They are ['2017-02-06' nan '2016-05-24' ... '2016-02-11' '2015-05-16' '2018

Unnamed: 0,child_id,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,...,child_attendance,child_languages,child_age_group,pri_children_4_6_years,obs_toilets_children,count_children_present,count_children_attendance,count_children_precovid,count_toilets_children,language_child
0,ID_SYSJ2FM0D,59.0,,,,,Sometimes,Sometimes,Sometimes,Sometimes,...,,,50-59 months,,,,,,,Sesotho
1,ID_J5BTFOZR3,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,Often,...,,,60-69 months,,,,,,,isiZulu
2,ID_R00SN7AUD,69.0,,,,108.400002,Often,Often,Sometimes,Often,...,,,60-69 months,,,,,,8.0,Afrikaans
3,ID_BSSK60PAZ,53.0,20.0,No,1st year in the programme,98.099998,Almost always,Almost always,Sometimes,Often,...,,,50-59 months,,No,30.0,38.0,38.0,0.0,isiXhosa
4,ID_IZTY6TC4D,57.0,0.0,,2nd year in programme,114.0,Almost always,Almost always,Almost always,Almost always,...,,,50-59 months,12.0,No,17.0,20.0,30.0,,isiZulu


In [242]:
# Consolidate the two language columns into a single `languages` column:
# take `child_languages` where it is present, otherwise fall back to
# `language_child` (the result stays NaN only when both are missing).
language_source_cols = ['child_languages', 'language_child']

# Guard on the source columns so that re-running this cell after they have
# been dropped is a no-op instead of a KeyError.
if set(language_source_cols).issubset(train.columns):
    # Vectorised equivalent of the row-by-row "first non-missing value" loop.
    train['languages'] = train['child_languages'].fillna(train['language_child'])
    train.drop(columns=language_source_cols, inplace=True)

# NOTE(review): only `train` is consolidated here; `test` shows the same
# layout in its preview, so it presumably needs the same step before
# modelling. Confirm this is done elsewhere in the notebook.

In [243]:
train['languages']

0         Sesotho
1         isiZulu
2       Afrikaans
3        isiXhosa
4         isiZulu
          ...    
8580      Sesotho
8581     Setswana
8582      English
8583     isiXhosa
8584      isiZulu
Name: languages, Length: 8585, dtype: object

In [244]:
# count cols: every column whose name contains 'count'
# (count statistics of the ECD programme the child is registered in)
count_cols = group_by_name(train, 'count')
count_frame = train[count_cols]
analyse_group(count_frame)

count_frame.head()

----------------------------------
We have 63 features
We have 0 categorical features
We have 63 numerical features


Unnamed: 0,count_register_all,count_staff_all,count_children_present,count_children_attendance,count_children_precovid,count_register_gender_female,count_register_gender_male,count_register_gender_other,count_register_gender,count_register_year_2021,...,count_staff_paid_support,count_staff_paid,count_practitioners_all,count_practitioners_age_0,count_practitioners_age_1,count_practitioners_age_2,count_practitioners_age_3,count_practitioners_age_4,count_practitioners_age_5,count_practitioners_age_6
0,38.0,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,38.0,5.0,30.0,38.0,38.0,21.0,17.0,0.0,38.0,,...,,,,,,,,,,
4,27.0,3.0,17.0,20.0,30.0,16.0,11.0,0.0,27.0,0.0,...,,,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [245]:
# Breakdown of count columns
# Each sub-group is selected by a substring of the column name and then
# summarised with analyse_group, in the order the groups are defined below.

# count_register cols
# registered children statistics of the ECD programme the child is registered in
count_register_cols = group_by_name(train, 'count_register')

# count_staff cols
# staff statistics of the ECD programme the child is registered in
count_staff_cols = group_by_name(train, 'count_staff')

# count_children cols
# attendance statistics of the ECD programme the child is registered in
count_children_cols = group_by_name(train, 'count_children')

# count_practitioners cols
# statistics on managerial staff that also work as ECD practitioners
count_practitioners_cols = group_by_name(train, 'count_practitioners')

# count_toilets cols
# toilet statistics of the ECD programme the child is registered in
count_toilets_cols = group_by_name(train, 'count_toilets')

# count_present cols
# children present on day of assessment
count_present_cols = group_by_name(train, 'count_present')

# One summary per group, in the same order as the definitions above.
for group_cols in (
    count_register_cols,
    count_staff_cols,
    count_children_cols,
    count_practitioners_cols,
    count_toilets_cols,
    count_present_cols,
):
    analyse_group(train[group_cols])

----------------------------------
We have 23 features
We have 0 categorical features
We have 23 numerical features
----------------------------------
We have 25 features
We have 0 categorical features
We have 25 numerical features
----------------------------------
We have 3 features
We have 0 categorical features
We have 3 numerical features
----------------------------------
We have 8 features
We have 0 categorical features
We have 8 numerical features
----------------------------------
We have 2 features
We have 0 categorical features
We have 2 numerical features
----------------------------------
We have 2 features
We have 0 categorical features
We have 2 numerical features


In [269]:
# for i in train[count_cols]:
#     train[i].fillna((train[i].mean()), inplace=True)

count_register_all
count_staff_all
count_children_present
count_children_attendance
count_children_precovid
count_register_gender_female
count_register_gender_male
count_register_gender_other
count_register_gender
count_register_year_2021
count_register_year_2020
count_register_year_2019
count_register_year_2018
count_register_year_2017
count_register_year_2016
count_register_year_2015
count_register_year_2014
count_register_year_2013
count_register_year_school
count_register_year_grader
count_register_race_african
count_register_race_coloured
count_register_race_indian
count_register_race_white
count_register_race_other
count_register_race
count_staff_salary_paid
count_staff_salary_unpaid
count_staff_salary
count_staff_gender_female
count_staff_gender_male
count_staff_gender_other
count_staff_gender
count_staff_contract
count_staff_time_full
count_staff_time
count_staff_qual_skills
count_staff_qual_nqf4_5
count_staff_qual_nqf6_9
count_toilets_children
count_toilets_adults
count_regist

In [247]:
train.shape

(8585, 672)

In [248]:
# Screen columns by missingness: a column is kept as a candidate feature only
# if it has fewer than `missing_threshold` missing values. The threshold is
# defined once so the summary message cannot drift away from the code.
missing_threshold = 3000

features = []; cat_features = []; not_features = []
# The first column is skipped (it appears to be the child_id identifier - confirm).
# NOTE(review): `target` is also a column of train, so it will land in
# `features` if it passes the missingness check; remove it before modelling.
for k in train.columns[1:]:
    if train[k].isna().sum() >= missing_threshold:
        not_features.append(k)
        continue
    features.append(k)
    if train[k].dtype == 'O':
        cat_features.append(k)
        print('There are '+ str(len(train[k].value_counts()))+' Classes in: ' +k)

print('----------------------------------')
print('We have '+str(len(features)) + ' features')
print('We have '+str(len(cat_features)) + ' categorical features')
print('We have '+str(len(not_features)) + ' features that have at least '+str(missing_threshold)+' missing values')

There are 4 Classes in: child_years_in_programme
There are 4 Classes in: child_observe_attentive
There are 4 Classes in: child_observe_concentrated
There are 4 Classes in: child_observe_diligent
There are 4 Classes in: child_observe_interested
There are 2 Classes in: child_gender
There are 1018 Classes in: child_dob
There are 3 Classes in: child_stunted
There are 4 Classes in: child_age_group
There are 153 Classes in: id_mn_best
There are 10 Classes in: prov_best
There are 50 Classes in: id_dc_best
There are 50 Classes in: dc_best
There are 153 Classes in: mn_best
There are 2 Classes in: pra_ind
There are 2 Classes in: teacher_social_met
There are 2 Classes in: teacher_emotional_met
There are 2 Classes in: teacher_selfcare_met
There are 2 Classes in: hle_ind
There are 9 Classes in: id_prov
There are 2 Classes in: census
There are 13 Classes in: language_assessment
There are 2 Classes in: sef_ind
There are 2 Classes in: elp_ind
There are 2 Classes in: gps_ind
There are 2 Classes in: pre

In [249]:
len(features)

115

In [250]:
train.shape

(8585, 672)

In [251]:
# Absolute correlation of every numeric column with the target, strongest first.
# Non-numeric columns are excluded explicitly: older pandas dropped them
# silently inside DataFrame.corr(), while pandas >= 2.0 raises on string
# columns unless they are removed first.
numeric_train = train.select_dtypes(include=['number', 'bool'])
train_corr = numeric_train.corr()['target'].abs().sort_values(ascending=False)
train_corr

target                          1.000000
child_observe_total             0.444812
child_age                       0.424989
child_height                    0.299639
pri_fees_amount_2_3             0.293291
                                  ...   
count_register_year_school      0.001454
count_register_year_2014        0.001344
count_register_race_other       0.000767
count_register_race_coloured    0.000395
obs_cooking_4                   0.000066
Name: target, Length: 160, dtype: float64

In [252]:
max(train_corr), min(train_corr)

(1.0, 6.552845715764906e-05)

In [253]:
len(train_corr)

160

In [254]:
print(len(train_corr[train_corr > 0].index.to_list()[1:]))

159
