In [1]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load files
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
ss = pd.read_csv('SampleSubmission.csv')

print(train.shape)
print(test.shape)

(8585, 679)
(3680, 678)


In [3]:
# Preview train
train.head(2)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,2022-02-03,59.0,,,,,,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,,60.163933,,,,1st year in the programme,103.0,Sometimes,...,,,,,,,,,,55.869999


In [4]:
# Preview test
test.head(2)

Unnamed: 0,child_id,data_year,child_date,child_age,child_enrolment_date,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,...,obs_cooking_4,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7
0,ID_0I0999N6S,2021.0,2021-09-20,57.0,,,Yes,2nd year in programme,108.0,Almost always,...,,,,,,,,,,
1,ID_GQ6ONJ4FP,2021.0,2021-10-21,54.0,2021-01-10,9.0,Yes,1st year in the programme,105.0,Almost always,...,,,,,,,,,,


In [5]:
# Preview submission file
ss.head(2)

Unnamed: 0,child_id,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15
0,ID_0I0999N6S,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature
1,ID_GQ6ONJ4FP,0,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature,feature


In [6]:
# identify target
target_cols = []
for i in train.columns.values:
    if i not in test.columns.values:
        target_cols.append(i)
        
target = train[target_cols]
train[target_cols].head(2)

Unnamed: 0,target
0,51.5
1,55.869999


## Due to the large number of features we are going to systematically group them into small clusters to allow for feature engineering

In [7]:
# Helper functions

def group_by_name(df, name, starts_with = False):
    arr = []
    if starts_with:
        for col in df.columns:
            if col.startswith(name):
                arr.append(col)   
    else:
        for col in df.columns:
            if name in col:
                arr.append(col)
        
            
    return arr

def analyse_group(df):
    features = []; cat_features = []; not_features = []
    for k in df.columns:
        features.append(k)
        if df[k].dtype == 'O':
            cat_features.append(k)
            print('There are '+ str(len(df[k].value_counts()))+' Classes in: ' +k)
            print('They are '+ str(df[k].unique()))
        else:
            not_features.append(k)

    print('----------------------------------')
    print('We have '+str(len(features)) + ' features')
    print('We have '+str(len(cat_features)) + ' categorical features')
    print('We have '+str(len(not_features)) + ' numerical features')
    

def count_class(data):
    for col in data.columns:
        if data[col].dtype == 'O':
            print('There are '+ str(len(data[col].value_counts()))+' Classes in: ' +col)
            
            
def strip_html(df, cols):
    for col in df[cols]:
        for i in range(0, len(df[col])):
            if 'Inadequate' in str(df[col].iloc[[i]]):
                df[col].iloc[[i]] = 'Inadequate'
            elif 'Basic' in str(df[col].iloc[[i]]):
                df[col].iloc[[i]] = 'Basic'
            elif 'Good' in str(df[col].iloc[[i]]):
                df[col].iloc[[i]] = 'Good'

In [8]:
new_train = pd.DataFrame()


### Date Cols

In [9]:
# select date columns
# we might wanna drop these.
date_cols = group_by_name(train, 'date')
train[date_cols].head()


Unnamed: 0,child_date,child_enrolment_date,pqa_date,pra_date,pri_date,obs_date
0,2022-02-03,,,,,
1,,,,,,
2,2022-03-11,,,,,
3,2021-10-13,2020-01-15,,,,
4,2021-10-13,2021-10-13,2021-10-29,2021-10-29,2021-10-29,2021-10-29


In [10]:
# train.drop(date_cols, inplace=True)
train.drop(date_cols, inplace=True,axis=1)
train.head(2)

Unnamed: 0,child_id,data_year,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,...,obs_cooking_5,obs_cooking_6,obs_heating_1,obs_heating_2,obs_heating_3,obs_heating_4,obs_heating_5,obs_heating_6,obs_heating_7,target
0,ID_SYSJ2FM0D,2022.0,59.0,,,,,Sometimes,Sometimes,Sometimes,...,,,,,,,,,,51.5
1,ID_J5BTFOZR3,2019.0,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,...,,,,,,,,,,55.869999


### Child Cols

In [11]:
# child cols
child_cols = []
child_cols = group_by_name(train, 'child')

train[child_cols].head(2)

Unnamed: 0,child_id,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,...,child_attendance,child_languages,child_age_group,pri_children_4_6_years,obs_toilets_children,count_children_present,count_children_attendance,count_children_precovid,count_toilets_children,language_child
0,ID_SYSJ2FM0D,59.0,,,,,Sometimes,Sometimes,Sometimes,Sometimes,...,,,50-59 months,,,,,,,Sesotho
1,ID_J5BTFOZR3,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,Often,...,,,60-69 months,,,,,,,isiZulu


In [12]:
analyse_group(train[child_cols])

There are 8585 Classes in: child_id
They are ['ID_SYSJ2FM0D' 'ID_J5BTFOZR3' 'ID_R00SN7AUD' ... 'ID_L52DMG5D1'
 'ID_QZQAO2GKX' 'ID_Y61LX4FV3']
There are 4 Classes in: child_grant
They are [nan 'No' 'Yes' "Don't know" 'Refuse']
There are 4 Classes in: child_years_in_programme
They are [nan '1st year in the programme' '2nd year in programme'
 '3rd year in programme' 'Do Not Know']
There are 4 Classes in: child_observe_attentive
They are ['Sometimes' 'Often' 'Almost always' 'Almost never']
There are 4 Classes in: child_observe_concentrated
They are ['Sometimes' 'Almost never' 'Often' 'Almost always']
There are 4 Classes in: child_observe_diligent
They are ['Sometimes' 'Almost always' 'Often' 'Almost never']
There are 4 Classes in: child_observe_interested
They are ['Sometimes' 'Often' 'Almost always' 'Almost never']
There are 2 Classes in: child_gender
They are ['Female' 'Male']
There are 1018 Classes in: child_dob
They are ['2017-02-06' nan '2016-05-24' ... '2016-02-11' '2015-05-16' '2018

In [13]:
# consolidate the languages columns into 1
langs = train[['child_languages', 'language_child']]
exp = []
for i in range(langs.shape[0]):
#     print(langs['child_languages'].iloc[i])
    if str(langs['child_languages'].iloc[i]) != 'nan':
        exp.append(langs['child_languages'].iloc[i])
    elif str(langs['language_child'].iloc[i]) != 'nan':
        exp.append(langs['language_child'].iloc[i])  
    else:
        exp.append(langs['language_child'].iloc[i])
#         print(f" {i} both null 1:{langs['child_languages'].iloc[i]}   2: {langs['language_child'].iloc[i]}" )
        
train['languages'] = exp
train.drop(['child_languages', 'language_child'],inplace=True,axis=1)

child_cols.remove('child_languages')
child_cols.remove('language_child')
child_cols.remove('child_id')

child_cols.insert(-1, 'languages')

In [14]:
# consolidate the languages columns into 1 in test dataset
langs = test[['child_languages', 'language_child']]
exp = []
for i in range(langs.shape[0]):
#     print(langs['child_languages'].iloc[i])
    if str(langs['child_languages'].iloc[i]) != 'nan':
        exp.append(langs['child_languages'].iloc[i])
    elif str(langs['language_child'].iloc[i]) != 'nan':
        exp.append(langs['language_child'].iloc[i])  
    else:
        exp.append(langs['language_child'].iloc[i])
#         print(f" {i} both null 1:{langs['child_languages'].iloc[i]}   2: {langs['language_child'].iloc[i]}" )
        
test['languages'] = exp
test.drop(['child_languages', 'language_child'],inplace=True,axis=1)


In [15]:
train['languages']

0         Sesotho
1         isiZulu
2       Afrikaans
3        isiXhosa
4         isiZulu
          ...    
8580      Sesotho
8581     Setswana
8582      English
8583     isiXhosa
8584      isiZulu
Name: languages, Length: 8585, dtype: object

In [16]:
new_train[child_cols] = train[child_cols]
new_train

Unnamed: 0,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,child_observe_total,...,child_attends,child_attendance,child_age_group,pri_children_4_6_years,obs_toilets_children,count_children_present,count_children_attendance,count_children_precovid,languages,count_toilets_children
0,59.000000,,,,,Sometimes,Sometimes,Sometimes,Sometimes,4.0,...,,,50-59 months,,,,,,Sesotho,
1,60.163933,,,1st year in the programme,103.000000,Sometimes,Almost never,Sometimes,Often,4.0,...,,,60-69 months,,,,,,isiZulu,
2,69.000000,,,,108.400002,Often,Often,Sometimes,Often,7.0,...,,,60-69 months,,,,,,Afrikaans,8.0
3,53.000000,20.0,No,1st year in the programme,98.099998,Almost always,Almost always,Sometimes,Often,9.0,...,,,50-59 months,,No,30.0,38.0,38.0,isiXhosa,0.0
4,57.000000,0.0,,2nd year in programme,114.000000,Almost always,Almost always,Almost always,Almost always,12.0,...,,,50-59 months,12.0,No,17.0,20.0,30.0,isiZulu,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,55.000000,9.0,Yes,1st year in the programme,102.300003,Often,Often,Sometimes,Sometimes,6.0,...,,,50-59 months,46.0,Yes,55.0,67.0,101.0,Sesotho,4.0
8581,55.000000,32.0,,2nd year in programme,102.599998,Often,Sometimes,Sometimes,Almost never,4.0,...,,,50-59 months,23.0,Yes,30.0,33.0,60.0,Setswana,2.0
8582,56.000000,45.0,,3rd year in programme,103.800003,Almost always,Almost always,Almost always,Almost always,12.0,...,,,50-59 months,23.0,Yes,13.0,25.0,40.0,English,2.0
8583,57.000000,9.0,Yes,1st year in the programme,102.400002,Almost always,Almost always,Almost always,Almost always,12.0,...,,,50-59 months,,,32.0,38.0,51.0,isiXhosa,


### Count Cols

In [17]:
# count cols
# registered children statistics of the ECD programe the child is registered in
count_cols = []
count_cols = group_by_name(train, 'count')
analyse_group(train[count_cols])

train[count_cols].head()

----------------------------------
We have 63 features
We have 0 categorical features
We have 63 numerical features


Unnamed: 0,count_register_all,count_staff_all,count_children_present,count_children_attendance,count_children_precovid,count_register_gender_female,count_register_gender_male,count_register_gender_other,count_register_gender,count_register_year_2021,...,count_staff_paid_support,count_staff_paid,count_practitioners_all,count_practitioners_age_0,count_practitioners_age_1,count_practitioners_age_2,count_practitioners_age_3,count_practitioners_age_4,count_practitioners_age_5,count_practitioners_age_6
0,38.0,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,38.0,5.0,30.0,38.0,38.0,21.0,17.0,0.0,38.0,,...,,,,,,,,,,
4,27.0,3.0,17.0,20.0,30.0,16.0,11.0,0.0,27.0,0.0,...,,,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [18]:
# Breakdown of count columns

# count_register cols
# registered children statistics of the ECD programe the child is registered in
count_register_cols = []
count_register_cols = group_by_name(train, 'count_register')
analyse_group(train[count_register_cols])

# count_staff cols
# staff statistics of the ECD programe the child is registered in
count_staff_cols = []
count_staff_cols = group_by_name(train, 'count_staff')
analyse_group(train[count_staff_cols])

# count_children cols
# attendance statistics of the ECD programe the child is registered in
count_children_cols = []
count_children_cols = group_by_name(train, 'count_children')
analyse_group(train[count_children_cols])

# count_practitioners cols
# managerial staff taht also work as ECD practitioners statistics of the ECD programe the child is registered in
count_practitioners_cols = []
count_practitioners_cols = group_by_name(train, 'count_practitioners')
analyse_group(train[count_practitioners_cols])

# count_toilets cols
# toilet statistics of the ECD programe the child is registered in
count_toilets_cols = []
count_toilets_cols = group_by_name(train, 'count_toilets')
analyse_group(train[count_toilets_cols])

# count_present cols
# children present on day of assesment
count_present_cols = []
count_present_cols = group_by_name(train, 'count_present')
analyse_group(train[count_present_cols])

----------------------------------
We have 23 features
We have 0 categorical features
We have 23 numerical features
----------------------------------
We have 25 features
We have 0 categorical features
We have 25 numerical features
----------------------------------
We have 3 features
We have 0 categorical features
We have 3 numerical features
----------------------------------
We have 8 features
We have 0 categorical features
We have 8 numerical features
----------------------------------
We have 2 features
We have 0 categorical features
We have 2 numerical features
----------------------------------
We have 2 features
We have 0 categorical features
We have 2 numerical features


In [19]:
new_train[count_cols] = train[count_cols]

In [20]:
# obs cols
obs_cols = []
obs_cols = group_by_name(train, 'obs', starts_with=True)
analyse_group(train[obs_cols])

There are 2 Classes in: obs_firstaid
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_space
They are [nan 'No' 'Yes']
There are 167 Classes in: obs_area
They are [nan '1 4 6' '1 2 3 4' '1 2 4 6 7' '3 4' '1 2 3 4 5 6' '1 4 5 6' '1 5'
 '1 2' '1 5 6' '2' '1 2 4' '4 5 6' '1 2 3 6' '1 2 3 4 5 6 7 8' '4'
 '1 2 3 4 5 6 7' '1 4' '1 2 3 6 8' '1 2 3 4 5 7' '1 2 3 4 7' '0' '1 3 4 5'
 '1 2 3 4 6 8' '2 4 6' '6' '1 2 4 6 8' '2 4 5 6' '1 3 4 5 6 7 8' '1' '4 6'
 '1 3 6' '2 3 4 5 6 7' '1 3 4 6 7' '1 2 3 4 6 7' '1 2 6' '1 2 4 5 6 7'
 '1 2 4 6' '1 2 4 5 6' '2 3 5 6 8' '1 4 5 6 7' '2 3 4 5 6 7 8' '1 5 6 7'
 '1 3 4 5 6' '2 6' '1 2 4 7' '1 2 3 4 6' '1 2 3 4 5 6 8' '4 5 6 8' '1 6'
 '1 2 5 6' '1 4 6 8' '1 2 3 4 5' '1 2 4 5 7 8' '1 2 7' '1 2 3 5' '1 4 7'
 '1 2 4 8' '1 2 5 6 7' '1 2 3 7' '4 5 6 7' '2 3 4 6' '1 2 3' '1 4 5' '5 6'
 '1 2 3 4 7 8' '1 3 5' '5 6 7 8' '1 2 3 4 6 7 8' '1 2 4 6 7 8' '5 6 7'
 '1 3 4 5 6 8' '2 4 6 7' '2 3 5 6' '1 2 3 5 6 8' '2 3 4 6 8' '1 3 5 7 8'
 '1 2 5 6 7 8' '1 3 4 6 8' '1 3 4 6 

In [21]:
''

''

In [22]:
# obs_area cols
obs_area_cols = []
obs_area_cols = group_by_name(train, 'obs_area',)
analyse_group(train[obs_area_cols])

train[obs_area_cols].head()

There are 167 Classes in: obs_area
They are [nan '1 4 6' '1 2 3 4' '1 2 4 6 7' '3 4' '1 2 3 4 5 6' '1 4 5 6' '1 5'
 '1 2' '1 5 6' '2' '1 2 4' '4 5 6' '1 2 3 6' '1 2 3 4 5 6 7 8' '4'
 '1 2 3 4 5 6 7' '1 4' '1 2 3 6 8' '1 2 3 4 5 7' '1 2 3 4 7' '0' '1 3 4 5'
 '1 2 3 4 6 8' '2 4 6' '6' '1 2 4 6 8' '2 4 5 6' '1 3 4 5 6 7 8' '1' '4 6'
 '1 3 6' '2 3 4 5 6 7' '1 3 4 6 7' '1 2 3 4 6 7' '1 2 6' '1 2 4 5 6 7'
 '1 2 4 6' '1 2 4 5 6' '2 3 5 6 8' '1 4 5 6 7' '2 3 4 5 6 7 8' '1 5 6 7'
 '1 3 4 5 6' '2 6' '1 2 4 7' '1 2 3 4 6' '1 2 3 4 5 6 8' '4 5 6 8' '1 6'
 '1 2 5 6' '1 4 6 8' '1 2 3 4 5' '1 2 4 5 7 8' '1 2 7' '1 2 3 5' '1 4 7'
 '1 2 4 8' '1 2 5 6 7' '1 2 3 7' '4 5 6 7' '2 3 4 6' '1 2 3' '1 4 5' '5 6'
 '1 2 3 4 7 8' '1 3 5' '5 6 7 8' '1 2 3 4 6 7 8' '1 2 4 6 7 8' '5 6 7'
 '1 3 4 5 6 8' '2 4 6 7' '2 3 5 6' '1 2 3 5 6 8' '2 3 4 6 8' '1 3 5 7 8'
 '1 2 5 6 7 8' '1 3 4 6 8' '1 3 4 6 7 8' '1 3 5 6 7 8' '1 2 4 5 6 8'
 '1 2 5 7' '2 3 4' '1 7' '1 3 4 5 6 7' '2 3 4 6 7' '3 4 6 8' '1 2 3 4 5 8'
 '1 3 7' '1 3 5

Unnamed: 0,obs_area,obs_area_1,obs_area_2,obs_area_3,obs_area_4,obs_area_5,obs_area_6,obs_area_7,obs_area_8,obs_area_0
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,1 4 6,Yes,No,No,Yes,No,Yes,No,No,No
4,1 2 3 4,Yes,Yes,Yes,Yes,No,No,No,No,No


In [23]:
train.drop(['obs_area_1', 'obs_area_1', 'obs_area_3', 'obs_area_4', 
                            'obs_area_5', 'obs_area_6', 'obs_area_7', 'obs_area_8', 'obs_area_0'],inplace=True,axis=1)

new_train['obs_area'] = train['obs_area']

In [24]:
# obs_materials cols
obs_materials_cols = []
obs_materials_cols = group_by_name(train, 'obs_materials', starts_with=True)
analyse_group(train[obs_materials_cols])

train[obs_materials_cols].head()

There are 1107 Classes in: obs_materials
They are [nan '1 2 4 5 6 7 8 9 10 11 13 17 18' '1 2 3 4 5 6 10 12 13 16' ...
 '1 3 4 10 12' '1 2 3 5 6 7 8 10 11 16' '1 2 3 4 5 7 8 15 16']
There are 2 Classes in: obs_materials_1
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_2
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_3
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_materials_4
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_5
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_6
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_7
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_8
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_9
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_10
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_11
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_materials_12
They are [nan 'No' 'Yes']
There are 2 Classes in: 

Unnamed: 0,obs_materials,obs_materials_1,obs_materials_2,obs_materials_3,obs_materials_4,obs_materials_5,obs_materials_6,obs_materials_7,obs_materials_8,obs_materials_9,...,obs_materials_14,obs_materials_15,obs_materials_16,obs_materials_17,obs_materials_18,obs_materials_19,obs_materials_20,obs_materials_97,obs_materials_0,obs_materialsother
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1 2 4 5 6 7 8 9 10 11 13 17 18,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,...,No,No,No,Yes,Yes,No,No,No,No,
4,1 2 3 4 5 6 10 12 13 16,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,...,No,No,Yes,No,No,No,,No,No,


In [25]:
train.drop(['obs_materials_1', 'obs_materials_2', 'obs_materials_3',
            'obs_materials_4', 'obs_materials_5', 'obs_materials_6', 
            'obs_materials_7', 'obs_materials_8', 'obs_materials_9',
            'obs_materials_10', 'obs_materials_11', 'obs_materials_12',
            'obs_materials_13', 'obs_materials_14', 'obs_materials_15',
            'obs_materials_16', 'obs_materials_17','obs_materials_18',
            'obs_materials_19', 'obs_materials_20', 'obs_materials_97',
            'obs_materials_0', 'obs_materialsother'
],inplace=True,axis=1)

new_train['obs_materials'] = train['obs_materials']

In [26]:
# obs_handwashing cols
obs_handwashing_cols = []
obs_handwashing_cols = group_by_name(train, 'obs_handwashing', starts_with=True)
analyse_group(train[obs_handwashing_cols])

train[obs_handwashing_cols].head()

There are 15 Classes in: obs_handwashing
They are [nan '1' '1 3' '1 3 97' '3' '1 2' '2' '1 2 3' '2 3' '1 97' '97' '1 2 97'
 '3 97' '0' '2 3 97' '2 97']
There are 2 Classes in: obs_handwashing_1
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_handwashing_2
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_handwashing_3
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_handwashing_0
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_handwashing_97
They are [nan 'No' 'Yes']
There are 4 Classes in: obs_handwashingother
They are [nan 'SANITIZER' 'HAND SANITIZER' 'BOTTLE SPRAY' 'KETTLE AND TUB']
There are 2 Classes in: obs_handwashing_friendly
They are [nan 'Yes' 'No']
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


Unnamed: 0,obs_handwashing,obs_handwashing_1,obs_handwashing_2,obs_handwashing_3,obs_handwashing_0,obs_handwashing_97,obs_handwashingother,obs_handwashing_friendly
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,1.0,Yes,No,No,No,No,,
4,1.0,Yes,No,No,No,No,,Yes


In [27]:
train.drop(['obs_handwashing_1', 'obs_handwashing_2', 'obs_handwashing_3', 'obs_handwashing_0',
            'obs_handwashing_97'
],inplace=True,axis=1)

new_train['obs_handwashing'] = train['obs_handwashing']

In [28]:
# obs_toilet cols
obs_toilet_cols = []
obs_toilet_cols = group_by_name(train, 'obs_toilet', starts_with=True)
analyse_group(train[obs_toilet_cols])

train[obs_toilet_cols].head()

There are 46 Classes in: obs_toilet
They are [nan '1' '5' '1 7' '5 7' '4 7' '3' '2 4' '4' '4 5' '1 4' '2' '3 5' '1 3'
 '7' '1 6' '3 7' '0' '6' '1 8' '4 6' '2 3' '5 6' '2 4 7' '1 5' '2 7'
 '1 5 6 7' '2 5' '4 5 7' '4 6 7' '1 4 7' '1 2 7' '3 4' '1 5 7' '1 6 7'
 '1 2 6 7' '6 7' '6 97' '3 5 7' '2 8 97' '5 6 7' '3 6' '3 4 6 7' '1 3 7'
 '1 2 6' '1 2 5' '2 6 8']
There are 2 Classes in: obs_toilet_1
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_toilet_2
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_3
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_4
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_5
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_6
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_7
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_8
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_0
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_toilet_97
They are [nan 'No' 'Yes']
There are 2 

Unnamed: 0,obs_toilet,obs_toilet_1,obs_toilet_2,obs_toilet_3,obs_toilet_4,obs_toilet_5,obs_toilet_6,obs_toilet_7,obs_toilet_8,obs_toilet_0,obs_toilet_97,obs_toilets_children,obs_toilets_gender,obs_toilet_clean,obs_toilet_paper
0,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,
2,,Yes,,,No,,,,,No,,,Yes,,
3,1.0,Yes,No,No,No,No,No,No,No,No,No,No,No,,
4,5.0,No,No,No,No,Yes,No,No,No,No,No,No,No,Yes,Yes


In [29]:
train.drop(['obs_toilet_1', 'obs_toilet_2', 'obs_toilet_3', 'obs_toilet_4', 'obs_toilet_5', 'obs_toilet_6',
            'obs_toilet_7', 'obs_toilet_8','obs_toilet_0','obs_toilet_97'],inplace=True,axis=1)

new_train['obs_toilet'] = train['obs_toilet']
new_train[['obs_toilets_children',
           'obs_toilets_gender',
           'obs_toilet_clean',
           'obs_toilet_paper']] = train[['obs_toilets_children',
                                         'obs_toilets_gender',
                                         'obs_toilet_clean',
                                         'obs_toilet_paper']]


In [30]:
# obs_equipment cols
obs_equipment_cols = []
obs_equipment_cols = group_by_name(train, 'obs_equipment', starts_with=True)
analyse_group(train[obs_equipment_cols])

train[obs_equipment_cols].head()

There are 28 Classes in: obs_equipment
They are [nan '0' '1' '2 3' '1 2 3 4' '1 3' '2 3 4' '1 2 3' '2 4' '1 2' '1 2 3 5'
 '2' '2 3 4 5' '3' '2 3 5' '1 2 3 4 5' '1 2 4' '1 3 4' '1 4' '2 5' '3 4'
 '4' '1 2 5' '-1' '1 2 4 5' '1 3 5' '1 3 4 5' '1 5' '4 5']
There are 2 Classes in: obs_equipment_0
They are [nan 'Yes' 'No']
There are 2 Classes in: obs_equipment_1
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_equipment_2
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_equipment_3
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_equipment_4
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_equipment_5
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_equipment__1
They are [nan 'No' 'Yes']
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


Unnamed: 0,obs_equipment,obs_equipment_0,obs_equipment_1,obs_equipment_2,obs_equipment_3,obs_equipment_4,obs_equipment_5,obs_equipment__1
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,0.0,Yes,No,No,No,No,No,No
4,1.0,No,Yes,No,No,No,,No


In [31]:
train.drop(['obs_equipment_0', 'obs_equipment_1', 'obs_equipment_2', 'obs_equipment_3', 'obs_equipment_4', 'obs_equipment_5', 
            'obs_equipment__1'],inplace=True,axis=1)

new_train['obs_equipment'] = train['obs_equipment']

In [32]:
# obs_access_disability cols
obs_access_disability_cols = []
obs_access_disability_cols = group_by_name(train, 'obs_access_disability', starts_with=True)
analyse_group(train[obs_access_disability_cols])


There are 2 Classes in: obs_access_disability_1
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_2
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_3
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_4
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_5
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_6
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_access_disability_0
They are [nan 'Yes' 'No']
There are 37 Classes in: obs_access_disability
They are [nan '0' '4 5 6' '4 5' '6' '3 4 5 6' '1 2 3 4' '3 5 6' '2 3 4 5 6' '5'
 '1 4 5 6' '1 4 5' '4 6' '5 6' '1 3 4 5' '3 6' '1' '1 3 4 5 6'
 '1 2 3 4 5 6' '1 4 6' '2 4' '1 2 4 5 6' '4' '1 5' '1 5 6' '3 4 6' '1 6'
 '1 4' '2 6' '2' '1 3 5 6' '1 2 4' '3 5' '1 3 4 6' '1 3 5' '3 4 5' '3 4'
 '1 2 6']
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


In [33]:
train.drop(['obs_access_disability_1', 'obs_access_disability_2', 'obs_access_disability_3',
            'obs_access_disability_4', 'obs_access_disability_5', 'obs_access_disability_6',
            'obs_access_disability_0'],inplace=True,axis=1)

new_train['obs_access_disability'] = train['obs_access_disability']

In [34]:
# obs_safety cols
obs_safety_cols = []
obs_safety_cols = group_by_name(train, 'obs_safety', starts_with=True)
analyse_group(train[obs_safety_cols])

train[obs_safety_cols].head()

There are 249 Classes in: obs_safety
They are [nan '3 4 5 7 8' '1 2 4 5 7 8 10' '4 5 8 9 10' '1 2 3 5 7 8' '8'
 '1 2 3 8 10' '2 4 5 7 8' '1 2 3 4 5 6 7 9 10' '1 3 5 6 7 8'
 '1 2 3 5 6 7 8' '2 3 7 8 9 10' '1 2 3 5 6 7 8 9' '2 3 4 6 7 8 9 10'
 '1 2 3 5 6 7 8 10' '2 6 7 8' '1 2 7 8 10' '2 4 5 6 8 9 10' '2 3 7 8'
 '1 2 5 7 8' '2 4 6 7 8 9 10' '1 2 4 7 8 9' '2 4 5 7 8 9 10'
 '1 2 3 4 5 6 7 8 9 10' '8 9 10' '0' '1 2 3 4 5 7 8' '2 4 8' '4 5 6 9 10'
 '2 8 9 10' '2 3 5 7 8' '2 4 6 8 9 10' '7 8' '2 5 8 10' '1 2 5 6 8 9 10'
 '4' '2 3 5 6 7 8' '2 5' '3 6 9' '1 2 5 6 7 8 9' '1 3 4 8 9 10' '2'
 '2 3 4 5 6 8 9 10' '1 2 3 4 5 6 7 8' '1 2 5 6 7 8 9 10' '2 4 5 8 9 10'
 '1 2 5 6 8' '5 6 7 8 9 10' '3 5 8 9 10' '2 3 5 7 8 10' '1 2 3 4 8 9'
 '1 2 3 4 5 6 7 8 10' '5 7 10' '5 7 8' '1 2 3 6 7 8 10' '8 9' '2 3 5'
 '2 3 6 7 8' '1 2 4 5 6 7 8 9 10' '2 3 4 8 9 10' '5 8' '3' '3 7 8 9'
 '1 2 7 8 9 10' '3 7 8' '4 5' '2 3 7 8 9' '4 8' '2 5 6 8 9 10' '3 6 7'
 '2 4 8 9 10' '2 4 5 8 10' '2 7 8 10' '2 5 6 7 8 9 10' '2 5 6

Unnamed: 0,obs_safety,obs_safety_1,obs_safety_2,obs_safety_3,obs_safety_4,obs_safety_5,obs_safety_6,obs_safety_7,obs_safety_8,obs_safety_9,obs_safety_10,obs_safety_0
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,,,,,,,,,,,
3,,,,,,,,,,,,
4,3 4 5 7 8,No,No,Yes,Yes,Yes,No,Yes,Yes,No,No,No


In [35]:
train.drop(['obs_safety_1', 'obs_safety_2', 'obs_safety_3', 'obs_safety_4', 'obs_safety_5',
            'obs_safety_6', 'obs_safety_7', 'obs_safety_8', 'obs_safety_9', 'obs_safety_10', 
            'obs_safety_0'],inplace=True,axis=1)

new_train['obs_safety'] = train['obs_safety']

In [36]:
# obs_hazard cols
obs_hazard_cols = []
obs_hazard_cols = group_by_name(train, 'obs_hazard', starts_with=True)
analyse_group(train[obs_hazard_cols])


There are 93 Classes in: obs_hazard
They are [nan '5 7 97' '0' '1 4 5' '1' '1 2 3 6 8' '3 97' '6' '3 5' '5 7' '4' '6 7'
 '2 3 4' '2 5' '1 3 4 7 97' '1 4' '1 2 3 4 5 6 7 97' '1 3 4 5 7' '5'
 '1 7 97' '1 2 3 5 97' '1 5 6' '7' '7 97' '5 6 97' '3 5 6' '4 6 97' '5 8'
 '6 97' '1 2 3 5 6 8 97' '4 5 6 7 97' '2 4 5 7' '3 7' '4 5 8' '1 2 4 8 97'
 '4 7' '97' '2' '1 7 8 97' '1 2 6' '1 2 5 6 7' '1 3 4 5 6' '1 5 6 8' '4 5'
 '8 97' '3' '2 7' '1 2 4 6' '2 4 8' '8' '4 7 97' '4 8' '6 8 97' '2 3 4 6'
 '1 2 3' '1 4 5 6' '1 5' '1 2 4 5 6 97' '1 4 6 8' '2 7 97' '1 2 97'
 '1 5 7' '4 5 7' '1 4 5 6 8' '6 7 97' '1 2 3 6 97' '1 6 8 97' '1 2 4'
 '1 4 6 7 97' '1 2 4 5 7' '1 3 4' '1 2 3 4' '5 97' '3 4 5 8 97' '1 2 5 7'
 '4 6 7' '5 6' '1 4 6' '1 2 5 6 97' '2 5 97' '2 5 6 97' '1 4 5 7' '4 6'
 '1 3 5 7 97' '1 7' '1 2 3 5 8' '1 4 97' '2 4 5' '1 3 4 5 97' '4 97' '1 6'
 '1 2 5 8' '4 8 97' '1 5 8']
There are 2 Classes in: obs_hazard_1
They are [nan 'No' 'Yes']
There are 2 Classes in: obs_hazard_2
They are [nan 'No' 'Yes']

In [37]:
train.drop(['obs_hazard_1', 'obs_hazard_2', 'obs_hazard_3', 'obs_hazard_4',
            'obs_hazard_5', 'obs_hazard_6', 'obs_hazard_7', 'obs_hazard_8',
            'obs_hazard_97', 'obs_hazard_0'],inplace=True,axis=1)

new_train['obs_hazard'] = train['obs_hazard']

In [38]:
# obs_lighting cols
obs_lighting_cols = []
obs_lighting_cols = group_by_name(train, 'obs_lighting', starts_with=True)
analyse_group(train[obs_lighting_cols])


There are 7 Classes in: obs_lighting_census
They are [nan 'Electricity' 'Solar' 'Gas' 'Candles' 'None' 'Other' 'Paraffin']
There are 7 Classes in: obs_lighting
They are [nan 'None' 'Electricity from mains' 'Candles' 'Gas' 'Paraffin'
 'Electricity from generator' 'Solar']
----------------------------------
We have 9 features
We have 2 categorical features
We have 7 numerical features


In [39]:
new_train['obs_lighting'] = train['obs_lighting']

In [40]:
# obs_cooking cols
obs_cooking_cols = []
obs_cooking_cols = group_by_name(train, 'obs_cooking', starts_with=True)
analyse_group(train[obs_cooking_cols])


There are 6 Classes in: obs_cooking_census
They are [nan 'Gas' 'Electricity' 'None' 'Coal or wood' 'Paraffin' 'Other']
There are 6 Classes in: obs_cooking
They are [nan 'Gas' 'Electricity from mains' 'Paraffin' 'None' 'Other' 'Solar']
----------------------------------
We have 8 features
We have 2 categorical features
We have 6 numerical features


In [41]:
new_train['obs_cooking'] = train['obs_cooking']

In [42]:
# obs_heating cols
obs_heating_cols = []
obs_heating_cols = group_by_name(train, 'obs_heating', starts_with=True)
analyse_group(train[obs_heating_cols])


There are 7 Classes in: obs_heating_census
They are [nan 'Electricity' 'Gas' 'Solar' 'None' 'Paraffin' 'Coal or wood' 'Other']
There are 7 Classes in: obs_heating
They are [nan 'None' 'Electricity from mains' 'Solar' 'Gas' 'Paraffin' 'Other'
 'Electricity from generator']
----------------------------------
We have 9 features
We have 2 categorical features
We have 7 numerical features


In [43]:
new_train['obs_heating'] = train['obs_heating']

In [44]:
obs_cols = []
new_obs_cols = []
obs_cols = group_by_name(train, 'obs', starts_with=True)
new_obs_cols = group_by_name(new_train, 'obs', starts_with=True)

In [45]:
obs_keep = ['obs_firstaid', 'obs_classrooms', 'obs_space', 'obs_area_2', 'obs_accessible', 'obs_books',
 'obs_books_age', 'obs_heating_census', 'obs_water', 'obs_waterother', 'obs_potable',
 'obs_building', 'obs_shared', 'obs_outdoor', 'obs_condition_equipment', 'obs_fence',
 'obs_gate', 'obs_access', 'obs_water_running', 'obs_electricity_working', 'obs_menu_display',
 'obs_menu_compliance', 'obs_menu_same', 'obs_material_display']

new_train[obs_keep] = train[obs_keep]

In [46]:
new_train.head(2)

Unnamed: 0,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,child_observe_total,...,obs_condition_equipment,obs_fence,obs_gate,obs_access,obs_water_running,obs_electricity_working,obs_menu_display,obs_menu_compliance,obs_menu_same,obs_material_display
0,59.0,,,,,Sometimes,Sometimes,Sometimes,Sometimes,4.0,...,,,,,,,,,,
1,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,Often,4.0,...,,,,,,,,,,


In [47]:
# pqa cols
pqa_cols = []
pqa_cols = group_by_name(train, 'pqa', starts_with=True)
analyse_group(train[pqa_cols])


There are 16 Classes in: pqa_class_age
They are [nan '4 5' '3 4' '3 4 5' '4 5 6' '4' '2 3 4 5' '1 2 3 4' '3 5' '1 2 3 4 5'
 '1 2 3 4 5 6' '2 3 4' '2 3' '1 5' '5' '2 3 4 5 6' '3 4 5 6']
There are 2 Classes in: pqa_class_age_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pqa_class_age_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pqa_class_age_6
They are [nan 'No' 'Yes']
There are 3 Classes in: pqa_environment_areas
They are [nan
 '<b>Good:</b> There are four or more learning areas arranged so children can use them, quiet and active areas separate.'
 '<b>Basic:</b> Three learning areas set out for children to play in (alone or with other children).'
 '<b>Inadequate:</b> There are no or fewer than three organised learning areas.']
There are 3 Classes in: pqa_environment_variety
They 

In [48]:
pqa_cols

['pqa_class_age',
 'pqa_class_age_1',
 'pqa_class_age_2',
 'pqa_class_age_3',
 'pqa_class_age_4',
 'pqa_class_age_5',
 'pqa_class_age_6',
 'pqa_class_assistants',
 'pqa_environment_areas',
 'pqa_environment_variety',
 'pqa_environment_appropriate',
 'pqa_environment_accessible',
 'pqa_environment_open',
 'pqa_environment_outdoor',
 'pqa_score_environment',
 'pqa_assessment_observation',
 'pqa_assessment_systematic',
 'pqa_score_assessment',
 'pqa_relationships_peers',
 'pqa_relationships_staff',
 'pqa_relationships_acknowledge',
 'pqa_relationships_discipline',
 'pqa_score_relationships',
 'pqa_curriculum_ncf',
 'pqa_curriculum_plan',
 'pqa_curriculum_balance',
 'pqa_curriculum_numeracy',
 'pqa_curriculum_literacy',
 'pqa_score_curriculum',
 'pqa_teaching_choice',
 'pqa_teaching_engagement',
 'pqa_teaching_participation',
 'pqa_teaching_questions',
 'pqa_teaching_support',
 'pqa_score_teaching',
 'pqa_class']

In [49]:
# pqa_class cols
pqa_class_cols = []
pqa_class_cols = group_by_name(train, 'pqa_class', starts_with=True)
analyse_group(train[pqa_class_cols])

There are 16 Classes in: pqa_class_age
They are [nan '4 5' '3 4' '3 4 5' '4 5 6' '4' '2 3 4 5' '1 2 3 4' '3 5' '1 2 3 4 5'
 '1 2 3 4 5 6' '2 3 4' '2 3' '1 5' '5' '2 3 4 5 6' '3 4 5 6']
There are 2 Classes in: pqa_class_age_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pqa_class_age_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pqa_class_age_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pqa_class_age_6
They are [nan 'No' 'Yes']
There are 8 Classes in: pqa_class
They are [nan 'B' 'A' 'C' '2' 'D' 'A4' '1' 'R']
----------------------------------
We have 9 features
We have 8 categorical features
We have 1 numerical features


In [50]:
new_train['pqa_class_age'] = train['pqa_class_age']

for these columns the data has some html elements. so we'll remove these for redability

In [51]:
# pqa_environment cols
pqa_environment_cols = []
pqa_environment_cols = group_by_name(train, 'pqa_environment', starts_with=True)
analyse_group(train[pqa_environment_cols])

There are 3 Classes in: pqa_environment_areas
They are [nan
 '<b>Good:</b> There are four or more learning areas arranged so children can use them, quiet and active areas separate.'
 '<b>Basic:</b> Three learning areas set out for children to play in (alone or with other children).'
 '<b>Inadequate:</b> There are no or fewer than three organised learning areas.']
There are 3 Classes in: pqa_environment_variety
They are [nan
 '<b>Inadequate:</b> Insufficient materials in any or all areas. E.g. children have to wait for a toy or resource and have nothing to work with while waiting'
 '<b>Good:</b> Enough and variety of materials in all learning areas - more than two activities in each area and enough for all children to be occupied.'
 '<b>Basic:</b> Some materials for  all  learning areas. E.g. children don’t have to wait but there is limited choice – only one activity or only one choice.']
There are 3 Classes in: pqa_environment_appropriate
They are [nan
 '<b>Basic</b>: Most materials ma

In [52]:
train[pqa_environment_cols].head()

Unnamed: 0,pqa_environment_areas,pqa_environment_variety,pqa_environment_appropriate,pqa_environment_accessible,pqa_environment_open,pqa_environment_outdoor
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,<b>Good:</b> There are four or more learning a...,<b>Inadequate:</b> Insufficient materials in a...,<b>Basic</b>: Most materials match the develop...,<b>Good</b>: All materials are laid out so tha...,<b>Basic</b>: At least one example of open end...,<b>Inadequate</b>: Few or no outdoor equipment...


In [53]:
strip_html(train, pqa_environment_cols)

In [54]:
new_train[pqa_environment_cols] = train[pqa_environment_cols]

In [55]:
# pqa cols
pqa_score_cols = []
pqa_score_cols = group_by_name(train, 'pqa_score', starts_with=True)
analyse_group(train[pqa_score_cols])

----------------------------------
We have 5 features
We have 0 categorical features
We have 5 numerical features


In [56]:
train[pqa_score_cols].head()

Unnamed: 0,pqa_score_environment,pqa_score_assessment,pqa_score_relationships,pqa_score_curriculum,pqa_score_teaching
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,6.0,2.0,4.0,5.0,5.0


In [57]:
new_train[pqa_score_cols] = train[pqa_score_cols]

In [58]:
# pqa_assessment cols
pqa_assessment_cols = []
pqa_assessment_cols = group_by_name(train, 'pqa_assessment', starts_with=True)
analyse_group(train[pqa_assessment_cols])

There are 3 Classes in: pqa_assessment_observation
They are [nan
 '<b>Basic</b>: There is a regular  observation record/book  with some written indication of remedial activities.'
 '<b>Inadequate</b>: There is no observation record or the observation book or this is not in regular use.'
 '<b>Good</b>: There are many and varied observations of children’s progress and evidence of a range of activities to remediate difficulties/facilitate holistic development.']
There are 3 Classes in: pqa_assessment_systematic
They are [nan
 '<b>Basic</b>: Assessments are done at least twice a year, using a standard format, and filed for reference.'
 '<b>Inadequate</b>: There is no systematic child assessment.'
 '<b>Good</b>: Assessments using a standard format are updated regularly, children with difficulties assessed more often.']
----------------------------------
We have 2 features
We have 2 categorical features
We have 0 numerical features


In [59]:
strip_html(train, pqa_assessment_cols)

In [60]:
new_train[pqa_assessment_cols] = train[pqa_assessment_cols]

In [61]:
# pqa_relationships cols
pqa_relationships_cols = []
pqa_relationships_cols = group_by_name(train, 'pqa_relationships', starts_with=True)
analyse_group(train[pqa_relationships_cols])

There are 3 Classes in: pqa_relationships_peers
They are [nan
 '<b>Basic</b>: Peer interaction encouraged (e.g. free choice of who to play with), negative interactions stopped.'
 '<b>Good</b>: Staff help children develop good social behaviours, provide activities that encourage children to work together, support children who find it hard to join in.'
 '<b>Inadequate</b>: Children are discouraged from interacting (e.g. not allowed to choose who to play with, focus on individual work) and there is little or no guidance for positive peer interaction.']
There are 3 Classes in: pqa_relationships_staff
They are [nan
 '<b>Basic</b>: Friendly atmosphere, some positive interactions and response to individual children, consistent response to children’s needs (observed at least twice).'
 '<b>Inadequate</b>: Staff are not responsive to or involved with children(ignore or just give instructions) little individual attention.'
 '<b>Good</b>: Frequent positive interaction, warm contact, relaxed and pl

In [62]:
strip_html(train, pqa_relationships_cols)

In [63]:
new_train[pqa_relationships_cols] = train[pqa_relationships_cols]

In [64]:
# pqa_curriculum cols
pqa_curriculum_cols = []
pqa_curriculum_cols = group_by_name(train, 'pqa_curriculum', starts_with=True)
analyse_group(train[pqa_curriculum_cols])

There are 3 Classes in: pqa_curriculum_ncf
They are [nan
 '<b>Basic</b>: Learning programme focuses on some of the ELDAs and is mostly developmentally appropriate (4 – 5 years).'
 '<b>Inadequate</b>: No evidence that programme supports NCF aims.'
 '<b>Good</b>: Learning programme used covers all the ELDAs and activities are developmentally appropriate catering for different children’s individual needs.']
There are 3 Classes in: pqa_curriculum_plan
They are [nan
 '<b>Basic</b>: Planning books and the playroom reflect a planned and integrated approach across different learning areas and parts of the daily programme.'
 '<b>Good</b>: Plans are applied and there is evidence of taking into account children’s interests and developmental appropriateness in planned activities that are implemented.'
 '<b>Inadequate</b>: There is no evidence of planning used to organise learning activities  (that a specific plan is being followed for the day though there may be a regular schedule).']
There are 3 

In [65]:
strip_html(train, pqa_curriculum_cols)

In [66]:
new_train[pqa_curriculum_cols] = train[pqa_curriculum_cols]

In [67]:
# pqa_teaching cols
pqa_teaching_cols = []
pqa_teaching_cols = group_by_name(train, 'pqa_teaching', starts_with=True)
analyse_group(train[pqa_teaching_cols])

There are 3 Classes in: pqa_teaching_choice
They are [nan
 '<b>Basic</b>: Children make at least two choices independent of practitioner direction about where and how to carry out activities, but some materials choices and activities are practitioner directed.'
 '<b>Good</b>: Children make three or more  choices during playtime (independent of practitioner direction) about where and how to use materials and carry out activities.'
 '<b>Inadequate</b>: Staff  direct how children use materials and carry out activities (e.g. all make same things, respond with same words and actions).']
There are 3 Classes in: pqa_teaching_engagement
They are [nan
 '<b>Basic</b>: Staff  engage with one or two  children to extend their  learning using one or two different techniques (three instances observed).'
 '<b>Good</b>: Staff regularly engage to extend children’s learning using a variety of techniques (more than 3 instances observed).'
 '<b>Inadequate</b>: Staff do not engage to support and extend chil

In [68]:
strip_html(train, pqa_teaching_cols)

In [69]:
new_train[pqa_teaching_cols] = train[pqa_teaching_cols]

In [70]:
new_train.head()

Unnamed: 0,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,child_observe_total,...,pqa_curriculum_ncf,pqa_curriculum_plan,pqa_curriculum_balance,pqa_curriculum_numeracy,pqa_curriculum_literacy,pqa_teaching_choice,pqa_teaching_engagement,pqa_teaching_participation,pqa_teaching_questions,pqa_teaching_support
0,59.0,,,,,Sometimes,Sometimes,Sometimes,Sometimes,4.0,...,,,,,,,,,,
1,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,Often,4.0,...,,,,,,,,,,
2,69.0,,,,108.400002,Often,Often,Sometimes,Often,7.0,...,,,,,,,,,,
3,53.0,20.0,No,1st year in the programme,98.099998,Almost always,Almost always,Sometimes,Often,9.0,...,,,,,,,,,,
4,57.0,0.0,,2nd year in programme,114.0,Almost always,Almost always,Almost always,Almost always,12.0,...,Basic,Basic,Basic,Basic,Basic,Basic,Basic,Basic,Basic,Basic


In [71]:
# pra cols
pra_cols = []
pra_cols = group_by_name(train, 'pra', starts_with=True)
analyse_group(train[pra_cols])


There are 5 Classes in: pra_free_play
They are [nan 'Up to 1 hour' '30 minutes or less' 'Up to 2 hours' 'None'
 'More than 3 hours']
There are 4 Classes in: pra_free_play_outdoor
They are [nan 'Up to 1 hour' '30 minutes or less' 'Up to 2 hours' 'None']
There are 31 Classes in: pra_groupings
They are [nan '4 5' '1 2 3 4 5' '3 4' '2 5' '1 5' '1 2 4' '4' '3' '1 3' '1 2 4 5'
 '1 4 5' '1 3 4 5' '3 4 5' '1 2 5' '2 4' '3 5' '1 3 5' '2 3 4 5' '5'
 '2 4 5' '2 3 4' '1' '1 2 3 4' '1 4' '2' '1 2' '1 3 4' '1 2 3 5' '2 3 5'
 '2 3' '1 2 3']
There are 2 Classes in: pra_groupings_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_groupings_5
They are [nan 'Yes' 'No']
There are 3 Classes in: pra_engaged
They are [nan 'Sometime' 'Often' 'Seldom']
There are 4 Classes in: pra_agency_choice
They are [nan 'Pr

In [72]:
# pra_groupings cols
pra_groupings_cols = []
pra_groupings_cols = group_by_name(train, 'pra_groupings', starts_with=True)
analyse_group(train[pra_groupings_cols])


There are 31 Classes in: pra_groupings
They are [nan '4 5' '1 2 3 4 5' '3 4' '2 5' '1 5' '1 2 4' '4' '3' '1 3' '1 2 4 5'
 '1 4 5' '1 3 4 5' '3 4 5' '1 2 5' '2 4' '3 5' '1 3 5' '2 3 4 5' '5'
 '2 4 5' '2 3 4' '1' '1 2 3 4' '1 4' '2' '1 2' '1 3 4' '1 2 3 5' '2 3 5'
 '2 3' '1 2 3']
There are 2 Classes in: pra_groupings_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_groupings_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_groupings_5
They are [nan 'Yes' 'No']
----------------------------------
We have 6 features
We have 6 categorical features
We have 0 numerical features


In [73]:
new_train['pra_groupings'] = train['pra_groupings']

In [74]:
# pra_agency cols
pra_agency_cols = []
pra_agency_cols = group_by_name(train, 'pra_agency', starts_with=True)
analyse_group(train[pra_agency_cols])


There are 4 Classes in: pra_agency_choice
They are [nan 'Practitioner' 'Both' 'Child' 'None']
There are 4 Classes in: pra_agency_explore
They are [nan 'Both' 'Practitioner' 'Child' 'None']
There are 4 Classes in: pra_agency_questions
They are [nan 'Both' 'Child' 'Practitioner' 'None']
There are 4 Classes in: pra_agency_understand
They are [nan 'Both' 'Practitioner' 'Child' 'None']
There are 4 Classes in: pra_agency_play
They are [nan 'Both' 'Practitioner' 'Child' 'None']
There are 4 Classes in: pra_agency_learn
They are [nan 'Both' 'Practitioner' 'Child' 'None']
There are 4 Classes in: pra_agency_order
They are [nan 'Both' 'Practitioner' 'Child' 'None']
----------------------------------
We have 7 features
We have 7 categorical features
We have 0 numerical features


In [75]:
new_train[pra_agency_cols] = train[pra_agency_cols]

In [76]:
# pra_plans cols
pra_plans_cols = []
pra_plans_cols = group_by_name(train, 'pra_plans', starts_with=True)
analyse_group(train[pra_plans_cols])


There are 8 Classes in: pra_plans
They are [nan '1 3' '2' '1 2 3' '1 2' '0' '1' '2 3' '3']
There are 2 Classes in: pra_plans_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_plans_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plans_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_plans_0
They are [nan 'No' 'Yes']
----------------------------------
We have 5 features
We have 5 categorical features
We have 0 numerical features


In [77]:
new_train['pra_plans'] = train['pra_plans']

In [78]:
# pra_class cols
pra_class_cols = []
pra_class_cols = group_by_name(train, 'pra_class', starts_with=True)
analyse_group(train[pra_class_cols])


There are 8 Classes in: pra_class_size_large
They are [nan "IT'S A MIXED CLASS, AGES 2-5."
 'THERE ARE 2 GROUPS, ONE GROUP OF 27 COMES CERTAIN DAYS, THE OTHER GROUP OF 26 ON OTHER DAYS. SO TODAY IS 100% ATTENDANCE FOR THE ONE GROUP'
 'IT IS CORRECT' 'THEY TOOK IN A HIGH NUMBER OF CHILDREN'
 'SHE SAYS THEY HAVE A LARGE NUMBER OF KIDS AND THE PRINCIPAL IS ONLY DEALING WITH 0_2YEARS ONLY PLUS ADMINISTRATION. THEY ARE ARE CATERING FOR 4 VILLAGES HENCE THE BIG NUMBER'
 'THIS PRE SCHOOL HAS ONLY ONE CLASS ROOM  BUT DUE TO COVID REGULATIONS THEY HAVE DIVIDED INTO TWO GROUPS OF ATTENDING CHILDREN'
 "EVERY ONE IS IN ONE CLASS DON'T HAVE OTHER CLASSROOMS EVERYONE IS IN ONE ROOM NOT DIVIDED."
 "THE CENTRE'S ENROLLED 60 CHILDREN IN 2021"]
There are 11 Classes in: pra_class_language
They are [nan 'isiZulu' 'English' 'Setswana' 'Sepedi' 'Tshivenda' 'Sesotho'
 'isiXhosa' 'Afrikaans' 'isiNdebele' 'Siswati' 'Xitsonga']
There are 4 Classes in: pra_class_space_small
They are [nan
 'THE  CLASSROOM IS VERY

In [79]:
pra_class_cols.remove('pra_class_space_small')
pra_class_cols.remove('pra_class_space_large')
pra_class_cols.remove('pra_class_size_large')
new_train[pqa_teaching_cols] = train[pqa_teaching_cols]

In [80]:
# pra_cohort cols
pra_cohort_cols = []
pra_cohort_cols = group_by_name(train, 'pra_cohort', starts_with=True)
analyse_group(train[pra_cohort_cols])


There are 16 Classes in: pra_cohort
They are [nan '3 4 5' '3 4' '2 3 4 5' '4 5' '4 5 6' '4' '1 2 3 4 5' '0 1 2 3 4 5'
 '2 3 4' '1 2 3 4' '0 1 2 3 4' '1 2 3 4 5 6' '0 1 2 3 4 5 6' '5 6' '2 3'
 '3 4 5 6']
There are 2 Classes in: pra_cohort_0
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_cohort_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_cohort_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_cohort_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_cohort_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_cohort_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_cohort_6
They are [nan 'No' 'Yes']
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


In [81]:
new_train['pra_cohort'] = train['pra_cohort']

In [82]:
# pra_plan cols
pra_plan_cols = []
pra_plan_cols = group_by_name(train, 'pra_plan', starts_with=True)

# this also groups pra_plans , so well just exclude them
for i in pra_plans_cols:
    pra_plan_cols.remove(i)
    
analyse_group(train[pra_plan_cols])


There are 21 Classes in: pra_plan_4yrs
They are [nan '1' '3' '3 4' '4' '2 3' '97' '2' '2 3 4' '1 2 3 4' '4 97' '3 97'
 '1 2 4' '1 3' '1 4' '1 97' '2 4' '1 2 97' '1 3 4' '1 2' '2 97' '2 3 97']
There are 2 Classes in: pra_plan_4yrs_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_plan_4yrs_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plan_4yrs_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plan_4yrs_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plan_4yrs_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plan_ncf
They are [nan 'Yes' 'No']
There are 26 Classes in: pra_plan_5yrs
They are [nan '1' '3' '4' '2' '3 4' '97' '5' '2 4' '2 3 4 5' '2 3 4' '1 2 3' '1 2'
 '2 5' '1 3' '3 97' '1 5' '2 3' '4 5' '1 2 4' '1 3 4 5' '3 5' '2 97' '1 4'
 '3 4 5' '1 97' '1 4 5']
There are 2 Classes in: pra_plan_5yrs_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_plan_5yrs_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_plan_5yrs_3
They are [nan 'No' '

In [83]:
new_train[['pra_plan_4yrs','pra_plan_5yrs','pra_plan_ncf',
          'pra_plan_approved']] = train[['pra_plan_4yrs','pra_plan_5yrs','pra_plan_ncf','pra_plan_approved']]

In [84]:
# pra cols
pra_qualification_cols = []
pra_qualification_cols = group_by_name(train, 'pra_qualification', starts_with=True)
analyse_group(train[pra_qualification_cols])


There are 26 Classes in: pra_qualification
They are [nan '0' '3 4' '3' '1' '2' '4' '97' '1 2' '1 3' '6' '5' '2 4' '3 97' '1 5'
 '2 3 4' '2 3' '1 2 3' '1 3 4' '3 4 5' '2 97' '1 6' '1 4' '3 5' '4 5' '7'
 '1 2 3 4']
There are 2 Classes in: pra_qualification_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_6
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_7
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_qualification_0
They are [nan 'Yes' 'No']
There are 15 Classes in: pra_qualificationother
They are [nan 'NQF LEVEL 3' 'STILL IN THE PROCESS OF COMPLETING NQF LEVEL 4: ECD'
 'N6 EDUCARE' 'BASIC COMPU

In [85]:
new_train['pra_qualification'] = train['pra_qualification']

In [86]:
# pra_ncf_trainer cols
pra_ncf_trainer_cols = []
pra_ncf_trainer_cols = group_by_name(train, 'pra_ncf_trainer', starts_with=True)
analyse_group(train[pra_ncf_trainer_cols])


There are 18 Classes in: pra_ncf_trainer
They are [nan '1 4' '1' '5' '2' '97' '4' '1 5' '4 5' '3' '2 5' '1 2 3' '3 4 5'
 '3 5 97' '2 3' '2 4' '2 97' '1 3' '1 2']
There are 2 Classes in: pra_ncf_trainer_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_ncf_trainer_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_ncf_trainer_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_ncf_trainer_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_ncf_trainer_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_ncf_trainer_97
They are [nan 'No' 'Yes']
There are 13 Classes in: pra_ncf_trainerother
They are [nan 'PRIVATELY FUNDED' 'ONLINE PLAY SA' 'PAID FOR PRIVATELY'
 'OXBRIDGE DISTANCE LEARNING' 'SAPPI FORESTRY'
 'ONLINE STUDYING AT MATHEW GONIWE' 'PERSONALLY PAID'
 "DON'T REMEMBER THE ORGANISATION NAME"
 'AT COLLEGE WHEN I TOOK PRACTICALS' 'PLAY SA'
 'STARTED WITH ECD FUNDING AND THEN FUNDED HERSELF' 'DSD'
 'DEPARTMENT OF HEALTH']
----------------------------------
We hav

In [87]:
new_train['pra_ncf_trainer'] = train['pra_ncf_trainer']

In [88]:
# pra_training cols
pra_training_cols = []
pra_training_cols = group_by_name(train, 'pra_training', starts_with=True)
analyse_group(train[pra_training_cols])


There are 51 Classes in: pra_training
They are [nan '1 2 3 4 5' '1 2 3 4 6' '6' '0' '5 6' '2 4 5 6' '4 6' '1 2 3 4 5 6'
 '4' '3 4 5 6' '2 3 4 5 6' '1 6' '1 4 6' '3 4 5' '3 6' '3 4' '1 3 5' '2 4'
 '4 5' '2' '1 2 4 6' '4 5 6' '2 3 4 5' '1 3' '2 4 6' '2 3 4' '2 3'
 '2 3 4 6' '3 4 6' '1 3 4 6' '1 2 3 4' '1 3 4' '2 3 6' '2 3 5 6' '1' '1 2'
 '1 3 4 5' '1 2 3 5' '1 3 4 5 6' '1 2 3 5 6' '3' '2 5 6' '3 5 6' '1 2 6'
 '5' '2 6' '1 2 4' '1 2 4 5 6' '1 2 4 5' '1 4' '1 2 3']
There are 2 Classes in: pra_training_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_training_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_training_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_training_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_training_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pra_training_6
They are [nan 'No' 'Yes']
There are 2 Classes in: pra_training_0
They are [nan 'No' 'Yes']
----------------------------------
We have 8 features
We have 8 categorical fea

In [89]:
train[pra_training_cols].head()

Unnamed: 0,pra_training,pra_training_1,pra_training_2,pra_training_3,pra_training_4,pra_training_5,pra_training_6,pra_training_0
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,1 2 3 4 5,Yes,Yes,Yes,Yes,Yes,No,No


In [90]:
new_train['pra_training'] = train['pra_training']

In [91]:
pra_keep = ['pra_free_play', 'pra_free_play_outdoor', 'pra_engaged', 'pra_gender', 'pra_experience', 'pra_class_present',
 'pra_class_size', 'pra_class_attendance', 'pra_class_attendance_precovid', 'pra_class_language',
 'pra_job', 'pra_clearance_police', 'pra_clearance_ncp', 'pra_salary', 'pra_paid', 'pra_breadwinner',
 'pra_hhsize', 'pra_educationother',
 'pra_previous',
 'pra_learnership',
 'pra_special_training',
 'pra_special_referrals',
 'pra_online_training','pra_online_training_details',
 'pra_motivate_support',
 'pra_motivate_recognition',
 'pra_motivate_mentoring',
 'pra_shape',
 'pra_measure_rectangle_length',
 'pra_measure_rectangle_width',
 'pra_class_space',
 'pra_class_space_small',
 'pra_class_space_large',
 'pra_ind',
 'pra_language',
 'practitioner']

In [92]:
len(pra_keep)

36

In [93]:
new_train[pra_keep] = train[pra_keep]

In [94]:
new_train.head()

Unnamed: 0,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,child_observe_total,...,pra_motivate_mentoring,pra_shape,pra_measure_rectangle_length,pra_measure_rectangle_width,pra_class_space,pra_class_space_small,pra_class_space_large,pra_ind,pra_language,practitioner
0,59.0,,,,,Sometimes,Sometimes,Sometimes,Sometimes,4.0,...,,,,,,,,No,,
1,60.163933,,,1st year in the programme,103.0,Sometimes,Almost never,Sometimes,Often,4.0,...,,,,,,,,No,,
2,69.0,,,,108.400002,Often,Often,Sometimes,Often,7.0,...,,,,,,,,No,,
3,53.0,20.0,No,1st year in the programme,98.099998,Almost always,Almost always,Sometimes,Often,9.0,...,,,,,,,,Yes,,
4,57.0,0.0,,2nd year in programme,114.0,Almost always,Almost always,Almost always,Almost always,12.0,...,Agree strongly,Rectangle,4.0,5.0,20.0,,,Yes,isiZulu,Yes


In [95]:
# pri cols
pri_cols = []
pri_cols = group_by_name(train, 'pri', starts_with=True)
analyse_group(train[pri_cols])

There are 2 Classes in: pri_mobile
They are [nan 'Based at a specific location' 'Moves from place to place']
There are 2 Classes in: pri_school
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_holidays
They are [nan 'Yes' 'No']
There are 15 Classes in: pri_calc_time_open
They are [nan '07:00' '06:30' '08:00' '07:30' '07:15' '06:00' '07:45' '06:45'
 '09:00' '08:30' '06:15' '05:00' '05:45' '05:30' '08:45']
There are 27 Classes in: pri_calc_time_close
They are [nan '17:00' '15:00' '17:30' '14:00' '13:00' '13:30' '16:00' '18:00'
 '12:30' '14:30' '16:30' '18:30' '15:30' '12:00' '16:15' '17:45' '11:00'
 '13:45' '13:15' '15:15' '16:45' '12:15' '14:45' '17:15' '19:00' '22:00'
 '15:45']
There are 3 Classes in: pri_separate
They are [nan 'Yes, children are grouped by age but are using the same space'
 'Yes, children are grouped by age and divided into different rooms'
 'No, children of all ages are learning and playing together']
There are 64 Classes in: pri_language
They are [nan '2 4' '5'

There are 2 Classes in: pri_health_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_health_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_health_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_health_0
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_clinic_travelother
They are [nan 'WE HIRE SOMEONE TO TAKE US'
 "WE SOMETIMES CALL THE NURSES TO COME TO OUR ECD IF THERE'S AN EMERGENCY"]
There are 18 Classes in: pri_covid_awareness
They are [nan '1 2 4' '1 2 3 4' '3 4' '1' '2' '1 3 4' '1 2 3' '2 3 4' '1 4' '2 4'
 '1 2' '3' '1 3' '1 2 4 97' '1 4 97' '1 2 3 4 97' '2 97' '2 3']
There are 2 Classes in: pri_covid_awareness_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_covid_awareness_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_97
They are [nan 'No' 'Yes']
There are 33 Classes in: pri_

In [96]:
new_train[pra_keep] = train[pra_keep]

In [97]:
# pri_language cols
pri_language_cols = []
pri_language_cols = group_by_name(train, 'pri_language', starts_with=True)
analyse_group(train[pri_language_cols])

There are 64 Classes in: pri_language
They are [nan '2 4' '5' '2' '1 2 4' '2 6' '2 7' '7' '1 2' '2 8' '9' '10' '6' '2 5'
 '1 2 7' '1' '4' '11' '8' '1 2 5 8' '1 4' '5 8' '2 5 8' '2 11' '2 9'
 '2 8 11' '2 10' '2 5 6 7 10 11' '2 4 8' '2 6 10' '3' '4 5' '2 7 8'
 '2 3 6' '2 4 97' '2 4 5 6 7 8' '4 8' '1 2 8' '1 2 3' '2 6 7 8' '1 2 5'
 '2 4 6' '2 6 11' '2 4 7' '3 6' '2 5 6' '1 7' '1 2 4 7' '2 4 5' '4 5 6 11'
 '2 3' '6 11' '1 2 4 8' '2 4 5 8' '2 5 7' '7 11' '2 6 10 11' '2 10 11'
 '2 3 4 5 6 7 8 9 10' '2 4 5 7 8' '10 11' '6 7' '2 5 7 8' '2 5 6 11'
 '2 3 5 8']
There are 2 Classes in: pri_language_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_language_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_language_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_language_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_language_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_language_6
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_language_7
They are [nan 

In [98]:
new_train['pri_language'] = train['pri_language']

In [99]:
# pri_meal cols
pri_meal_cols = []
pri_meal_cols = group_by_name(train, 'pri_meal', starts_with=True)
analyse_group(train[pri_meal_cols])

There are 26 Classes in: pri_meal
They are [nan '1 2' '1 3 4' '1 2 4' '1 3' '1 2 4 5' '3' '1 2 3' '1 2 3 4' '2 4'
 '2 5' '1' '3 4' '2' '1 4 5' '2 3' '1 2 3 4 5' '0' '1 2 5' '4' '2 3 4'
 '1 4' '97' '2 4 5' '5' '1 3 97' '1 2 3 4 97']
There are 2 Classes in: pri_meal_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_meal_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_meal_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_meal_2
They are [nan 'No' 'Yes']
There are 5 Classes in: pri_meal_prep
They are [nan 'A cook employed by the Programme' 'Other' 'Practitioners'
 'Volunteers / Someone from the community' 'EPWP worker']
There are 2 Classes in: pri_meals
They are [nan 'Yes' 'No']
----------------------------------
We have 7 features
We have 7 categorical features
We have 0 numerical features


In [100]:
train[pri_meal_cols].head()

Unnamed: 0,pri_meal,pri_meal_1,pri_meal_3,pri_meal_4,pri_meal_2,pri_meal_prep,pri_meals
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,1 2,Yes,Yes,No,No,,Yes
4,1 3 4,Yes,Yes,,No,A cook employed by the Programme,Yes


In [101]:
new_train[['pri_meal', 'pri_meal_prep']] = train[['pri_meal', 'pri_meal_prep']]

In [102]:
# pri_funding cols
pri_funding_cols = []
pri_funding_cols = group_by_name(train, 'pri_funding', starts_with=True)
analyse_group(train[pri_funding_cols])

There are 16 Classes in: pri_fundingother
They are [nan 'DSD' 'NAG' 'THE CRADLE OF HOPE' 'LONGYUAN MULILO MIND PROJECT' '-1'
 'EXPERIAN' 'STIMULUS PACKAGE' 'NUTRITION' 'NO FUNDING'
 'SOCIAL DEVELOPMENT' 'NOT FUNDED' 'UKUHAMBA NABANTWANA AT KWI ECD TRUST'
 "THE ECD DOESN'T RECEIVE ANY FUNDING" 'FEES AND GOVERNMENT SUBSIDY'
 'NON-PROFIT ORGANIZATION (LUNCHBOX FUND)'
 'CWP GIVES US R800 ON AVERAGE PER MONTH']
There are 2 Classes in: pri_funding_6
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_funding_7
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_donations
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_5
They are [nan 'No' 'Yes']
There

In [103]:
new_train['pri_funding_salary'] = train['pri_funding_salary']

In [104]:
# pri_qualification cols
pri_qualification_cols = []
pri_qualification_cols = group_by_name(train, 'pri_qualification', starts_with=True)
analyse_group(train[pri_qualification_cols])

There are 36 Classes in: pri_qualification
They are [nan '3' '4' '1' '5' '0' '7' '2 3 4' '1 4' '97' '3 4' '1 4 6' '1 2' '2 3'
 '4 5' '1 5' '1 3' '6' '2 4' '3 97' '6 7' '1 5 97' '2' '4 7' '3 7' '3 5'
 '1 2 3' '1 2 3 4' '1 3 4' '2 3 4 97' '1 6' '2 3 4 5' '1 4 97' '3 4 5'
 '1 2 3 4 5' '3 6' '1 2 5']
There are 2 Classes in: pri_qualification_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_qualification_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_6
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_7
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_qualification_0
They are [nan 'No' 'Yes']
There are 10 Classes in: pri_qualificationother
They are [nan 'NQF LEVE

In [105]:
train[pri_qualification_cols]

Unnamed: 0,pri_qualification,pri_qualification_1,pri_qualification_2,pri_qualification_3,pri_qualification_4,pri_qualification_5,pri_qualification_6,pri_qualification_7,pri_qualification_97,pri_qualification_0,pri_qualificationother
0,,,,,,,,,,,
1,,,,,,,,,,,
2,,,,,,,,,,,
3,,,,,,,,,,,
4,3,No,No,Yes,No,No,No,No,No,No,
...,...,...,...,...,...,...,...,...,...,...,...
8580,3 4,No,No,Yes,Yes,No,No,No,No,No,
8581,3 4,No,No,Yes,Yes,No,No,No,No,No,
8582,4,No,No,No,Yes,No,No,No,No,No,
8583,,,,,,,,,,,


In [106]:
new_train['pri_qualification'] = train['pri_qualification']

In [107]:
# pri_network_type cols
pri_network_type_cols = []
pri_network_type_cols = group_by_name(train, 'pri_network_type', starts_with=True)
analyse_group(train[pri_network_type_cols])

There are 6 Classes in: pri_network_type
They are [nan '1' '97' '1 3' '3' '2' '1 2']
There are 2 Classes in: pri_network_type_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_network_type_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_network_type_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_network_type_97
They are [nan 'No' 'Yes']
----------------------------------
We have 5 features
We have 5 categorical features
We have 0 numerical features


In [108]:
new_train['pri_network_type'] = train['pri_network_type']

In [109]:
# pri_support cols
pri_support_provider_cols = []
pri_support_provider_cols = group_by_name(train, 'pri_support_provider', starts_with=True)
analyse_group(train[pri_support_provider_cols])

There are 22 Classes in: pri_support_provider
They are [nan '1 4' '97' '1 2' '2' '5' '1' '3' '1 3' '4' '4 5' '1 5' '1 97' '5 97'
 '3 4' '3 4 5' '1 2 97' '2 3' '3 5' '2 5' '1 3 5' '3 97' '1 3 4']
There are 2 Classes in: pri_support_provider_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_support_provider_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_support_provider_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_97
They are [nan 'No' 'Yes']
There are 19 Classes in: pri_support_providerother
They are [nan 'KNYSNA EDUCATION TRUST' 'DEPARTMENT OF SOCIAL DEVELOPMENT'
 'SCHOOL GOVERNING BODY' 'SOCIAL WORKERS WHO WERE PART OF TRAINING'
 'PRE SCHOOL BOARD COMMITTEE OF MZOMTSHA PRE SCHOOL'
 'GROW ACCADEMY ON-LINE' 'PRINCIPALS FROM OTHER ECDS IN THE NEIGHBORHOOD.'
 'PARENTS' 'NOT SURE OF THE NAME(CORLYN)' 'CO

In [110]:
new_train['pri_support_provider'] = train['pri_support_provider']

In [111]:
# pri_money cols
pri_money_cols = []
pri_money_cols = group_by_name(train, 'pri_money', starts_with=True)
analyse_group(train[pri_money_cols])

There are 13 Classes in: pri_money
They are [nan '3' '1' '97' '1 97' '1 2' '2' '-1' '2 3' '1 3' '1 2 3' '2 3 97'
 '2 97' '3 97']
There are 2 Classes in: pri_money_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_money_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_money_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_money_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_money__1
They are [nan 'No' 'Yes']
There are 40 Classes in: pri_moneyother
They are [nan 'CARD TRANSACTIONS' 'EFT' 'EFT, DEBIT CARD' 'SWIPE'
 'EFT, SWIPE DEBIT CARD' 'EFT, STOP ORDERS' 'THROUGH THE BANK'
 'ONLINE BANKING' 'EFT AND SWIPING'
 "I'M NOT THE ONE MANAGING PROGRAMS MONEY" 'SWIPING' 'BY NOT OVERSPENDING'
 'DEBIT CARD'
 "MOTHER COMPANY PAYS 'AN ADVANCE' INTO PRINCIPALS ACCOUNT, WHO DRAWS THE MONEY AND BUYS EG STATIONERY"
 'CHECK' 'SWIPE CARD' 'THERE IS NO OTHER OPTION'
 'THE USE OF A TREASURE AND CHAIPERSON AND SECRETARY'
 'WE HAVE AN ACCOUNTANT'
 'ALL MONIES MUST FIRST BE APPROV

In [112]:
train[pri_money_cols]

Unnamed: 0,pri_money,pri_money_1,pri_money_2,pri_money_3,pri_money_97,pri_money__1,pri_moneyother
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,3,No,No,Yes,No,No,
...,...,...,...,...,...,...,...
8580,1,Yes,No,No,No,No,
8581,1 2,Yes,Yes,No,No,No,
8582,1 97,Yes,No,No,Yes,No,EFT
8583,,,,,,,


In [113]:
new_train['pri_money'] = train['pri_money']

In [114]:
# pri_funding_salary cols
pri_funding_salary_cols = []
pri_funding_salary_cols = group_by_name(train, 'pri_funding_salary', starts_with=True)
analyse_group(train[pri_funding_salary_cols])

There are 12 Classes in: pri_funding_salary
They are [nan '0' '97' '2 3' '3' '2' '1' 0.0 3.0 2.0 97.0 1.0 4.0]
There are 2 Classes in: pri_funding_salary_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_salary_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_salary_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_salary_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_salary_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_funding_salary_0
They are [nan 'Yes' 'No']
There are 8 Classes in: pri_funding_salaryother
They are [nan 'FOOD FORWARD AND LUNCH BOX' 'METHODIST CHURCH' 'DBE' 'DSD'
 'CHURCH OF SKHUKHUZA'
 'THE TRUST WILL GIVE MONEY BEFORE WE RECEIVE FUNDS TO USE IN THE MEANTIME'
 'DEPARTMENT OF EDUCATION' 'ACVV']
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


In [115]:
new_train['pri_funding_salary'] = train['pri_funding_salary']

In [116]:
# pri_clinic_travel cols
pri_clinic_travel_cols = []
pri_clinic_travel_cols = group_by_name(train, 'pri_clinic_travel', starts_with=True)
analyse_group(train[pri_clinic_travel_cols])

There are 17 Classes in: pri_clinic_travel
They are [nan '3' '4' '1' '4 5' '3 4' '2 3' '1 3' '2' '0' '1 4' '2 3 4' '97'
 '1 2 3' '1 4 5' '1 2' '2 97' '2 4']
There are 2 Classes in: pri_clinic_travel_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_clinic_travel_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_clinic_travel_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_clinic_travel_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_clinic_travel_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_clinic_travel_97
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_clinic_travelother
They are [nan 'WE HIRE SOMEONE TO TAKE US'
 "WE SOMETIMES CALL THE NURSES TO COME TO OUR ECD IF THERE'S AN EMERGENCY"]
----------------------------------
We have 8 features
We have 8 categorical features
We have 0 numerical features


In [117]:
new_train['pri_clinic_travel'] = train['pri_clinic_travel']

In [118]:
# pri_covid_awareness cols
pri_covid_awareness_cols = []
pri_covid_awareness_cols = group_by_name(train, 'pri_covid_awareness', starts_with=True)
analyse_group(train[pri_covid_awareness_cols])

There are 18 Classes in: pri_covid_awareness
They are [nan '1 2 4' '1 2 3 4' '3 4' '1' '2' '1 3 4' '1 2 3' '2 3 4' '1 4' '2 4'
 '1 2' '3' '1 3' '1 2 4 97' '1 4 97' '1 2 3 4 97' '2 97' '2 3']
There are 2 Classes in: pri_covid_awareness_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_covid_awareness_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_awareness_97
They are [nan 'No' 'Yes']
There are 5 Classes in: pri_covid_awareness_other
They are [nan 'CLASSROOMS GET SPRAYED BY SANITIZING COMPANY WHEN NEW TERM STARTS'
 'SENT LETTERS TO PARENTS ASKING FOR KIDS TO BE KEPT HOME IF THEY HAVE ANY SYMPTOMS. IF ANY CHILD IS SICK AT SCHOOL, EVERYONE GETS SENT HOME'
 'I ALSO SAW TEACHERS SANITIZING CHILDRENS HANDS IN CERTAIN SITUATIONS'
 'NO MASK NO ENTRY' 'SCREENING']
----------------------------------
We have 7 features
We have 7 categ

In [119]:
new_train['pri_covid_awareness'] = train['pri_covid_awareness']

In [120]:
# pri_covid_precautions cols
pri_covid_precautions_cols = []
pri_covid_precautions_cols = group_by_name(train, 'pri_covid_precautions', starts_with=True)
analyse_group(train[pri_covid_precautions_cols])

There are 33 Classes in: pri_covid_precautions
They are [nan '1 2 3 5' '1 2 3' '2' '1 2 3 4 5 6' '1 2 3 4 5' '2 3' '1 2 3 5 6'
 '1 2 3 4' '1 3 5' '1 2 3 6 97' '1 2' '2 3 5' '1 2 5 6' '5' '1 3 4 5 6'
 '2 3 4 5 6' '1 2 5' '1 2 3 4 6' '1 3 5 6' '2 5' '1 2 5 97' '3 97'
 '1 2 3 6' '3 4' '1 3 4' '1 3' '1 2 3 97' '1' '3 5' '2 3 97' '3' '3 4 5'
 '1 2 3 5 6 97']
There are 2 Classes in: pri_covid_precautions_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_precautions_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_precautions_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_precautions_4
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_covid_precautions_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_covid_precautions_6
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_covid_precautions_97
They are [nan 'No' 'Yes']
There are 8 Classes in: pri_covid_precautions_other
They are [nan 'SCREENS BETWEEN DESKS'
 'THERE IS A BIG SIGN UP AT THE ENT

In [121]:
new_train['pri_covid_precautions'] = train['pri_covid_precautions']

In [122]:
# pri_food_type cols
pri_food_type_cols = []
pri_food_type_cols = group_by_name(train, 'pri_food_type', starts_with=True)
analyse_group(train[pri_food_type_cols])

There are 21 Classes in: pri_food_type
They are [nan '1 2 3 4 5' '1 2 4' '1 2 3 4' '1 2' '1' '2' '2 3 4' '1 2 3' '0'
 '1 3 4 5' '1 4' '1 2 4 5' '3' '1 4 5' '1 3 4' '1 2 5' '1 3 5' '2 3' '1 5'
 '2 3 4 5' '1 3']
There are 2 Classes in: pri_food_type_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_food_type_2
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_food_type_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_food_type_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_food_type_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_food_type_0
They are [nan 'No' 'Yes']
----------------------------------
We have 7 features
We have 7 categorical features
We have 0 numerical features


In [123]:
new_train['pri_food_type'] = train['pri_food_type']

In [124]:
# pri_records cols
pri_records_cols = []
pri_records_cols = group_by_name(train, 'pri_records', starts_with=True)
analyse_group(train[pri_records_cols])

There are 22 Classes in: pri_records
They are [nan '3 4 5' '1 3 4 5' '1 3 4' '1 2 3 4 5' '2 4 5' '3 4' '2 3 4 5'
 '1 2 4 5' '4 5' '1 2 3 4' '4' '1 4 5' '1 2 4' '1 4' '2 3 4' '1' '1 2'
 '1 2 3 5' '1 2 3' '0' '2' '2 4']
There are 2 Classes in: pri_records_1
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_records_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_records_3
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_records_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_records_5
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_records_0
They are [nan 'No' 'Yes']
----------------------------------
We have 7 features
We have 7 categorical features
We have 0 numerical features


In [125]:
new_train['pri_records'] = train['pri_records']

In [126]:
# pri_support_provider cols
pri_support_provider_cols = []
pri_support_provider_cols = group_by_name(train, 'pri_support_provider', starts_with=True)
analyse_group(train[pri_support_provider_cols])

There are 22 Classes in: pri_support_provider
They are [nan '1 4' '97' '1 2' '2' '5' '1' '3' '1 3' '4' '4 5' '1 5' '1 97' '5 97'
 '3 4' '3 4 5' '1 2 97' '2 3' '3 5' '2 5' '1 3 5' '3 97' '1 3 4']
There are 2 Classes in: pri_support_provider_1
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_support_provider_2
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_3
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_4
They are [nan 'Yes' 'No']
There are 2 Classes in: pri_support_provider_5
They are [nan 'No' 'Yes']
There are 2 Classes in: pri_support_provider_97
They are [nan 'No' 'Yes']
There are 19 Classes in: pri_support_providerother
They are [nan 'KNYSNA EDUCATION TRUST' 'DEPARTMENT OF SOCIAL DEVELOPMENT'
 'SCHOOL GOVERNING BODY' 'SOCIAL WORKERS WHO WERE PART OF TRAINING'
 'PRE SCHOOL BOARD COMMITTEE OF MZOMTSHA PRE SCHOOL'
 'GROW ACCADEMY ON-LINE' 'PRINCIPALS FROM OTHER ECDS IN THE NEIGHBORHOOD.'
 'PARENTS' 'NOT SURE OF THE NAME(CORLYN)' 'CO

In [127]:
new_train['pri_support_provider'] = train['pri_support_provider']

In [128]:
 pri_keep = ['pri_mobile',
 'pri_school',
 'pri_holidays',
 'pri_days',
 'pri_time_open_hours',
 'pri_time_open_minutes',
 'pri_time_close_hours',
 'pri_time_close_minutes',
 'pri_calc_time_open',
 'pri_calc_time_close',
 'pri_year',
 'pri_separate',
 'pri_toys',
 'pri_aftercare',
 'pri_fees',
 'pri_fees_amount',
 'pri_fees_free',
 'pri_facilities',
 'pri_land',
 'pri_facilitiesother',
 'pri_landother',
 'pri_fundingother',
 'pri_bank',
 'pri_transport',
 'pri_registered_partial',
 'pri_registered_programme',
 'pri_registered_npo',
 'pri_registered_dsd',
 'pri_subsidy',
 'pri_network',
 'pri_dsd_year',
 'pri_capacity',
 'pri_reason_register_year',
 'pri_attendance_usual',
 'pri_precovid_attendance',
 'pri_kitchen',
 'pri_attendance',
 'pri_fees_exceptions',
 'pri_fees_paid_proportion',
 'pri_amount_funding_dsd',
 'pri_amount_funding_fees',
 'pri_education',
 'pri_qualificationother',
 'pri_founder',
 'pri_founderother',
 'pri_same_language',
 'pri_location',
 'pri_locationother',
 'pri_library',
 'pri_dsd_conditional',
 'pri_dsd_conditional_other',
 'pri_dsd_unregistered',
 'pri_dsd_unregistered_other',
 'pri_registered_health',
 'pri_zoning',
 'pri_registered_cipc',
 'pri_name_network_forum',
 'pri_email_network_forum',
 'pri_name_network_ngo',
 'pri_name_network_alliance',
 'pri_name_network_other',
 'pri_children_4_6_years',
 'pri_difficult_see',
 'pri_difficult_hear',
 'pri_difficult_walk',
 'pri_difficult_hold',
 'pri_difficult_communicate',
 'pri_difficult_learn',
 'pri_support_dsd',
 'pri_support_dbe',
 'pri_support_municipality',
 'pri_support_ngo',
 'pri_firstaid',
 'pri_staff_employed',
 'pri_staff_changes_reasons',
 'pri_staff_changes_reasonsother',
 'pri_covid_staff_retrench',
 'pri_covid_staff_salaries',
 'pri_funding_food',
 'pri_fees_amount_0_1',
 'pri_fees_amount_2_3',
 'pri_fees_exceptions_other',
 'pri_expense_food',
 'pri_expense_staff',
 'pri_expense_rent',
 'pri_expense_materials',
 'pri_expense_maintenance',
 'pri_expense_admin',
 'pri_expense_other',
 'pri_expenseother',
 'pri_covid_fund_applied',
 'pri_covid_fund_received',
 'pri_clinic_time',
 'pri_food_parents_breakfast',
 'pri_food_parents_morning',
 'pri_food_parents_lunch',
 'pri_food_parents_afternoon',
 'pri_food_guidance',
 'pri_food_donation',
 'pri_food_donor',
 'pri_food_donorother',
 'pri_refrigerator',
 'pri_garden',
 'pri_meal_prep',
 'pri_parents_frequency',
 'pri_parents_contact',
 'pri_parents_activities',
 'pri_support',
 'pri_support_providerother',
 'pri_support_frequency',
 'pri_fees_amount_4_6',
 'pri_internet_user',
 'pri_fees_amount_pv',
 'pri_funding_subsidy',
 'pri_meals',
 'pri_languages']

In [129]:
new_train[pri_keep] = train[pri_keep]

In [130]:
new_features = new_train.columns
new_test = test[new_features]

In [131]:
new_train['target'] = target

In [132]:
new_test.shape

(3680, 329)

In [133]:
new_train

Unnamed: 0,child_age,child_months_enrolment,child_grant,child_years_in_programme,child_height,child_observe_attentive,child_observe_concentrated,child_observe_diligent,child_observe_interested,child_observe_total,...,pri_support,pri_support_providerother,pri_support_frequency,pri_fees_amount_4_6,pri_internet_user,pri_fees_amount_pv,pri_funding_subsidy,pri_meals,pri_languages,target
0,59.000000,,,,,Sometimes,Sometimes,Sometimes,Sometimes,4.0,...,,,,,,,,,,51.500000
1,60.163933,,,1st year in the programme,103.000000,Sometimes,Almost never,Sometimes,Often,4.0,...,,,,,,,,,,55.869999
2,69.000000,,,,108.400002,Often,Often,Sometimes,Often,7.0,...,,,,,1 2 3 4,0.00000,,,,47.520000
3,53.000000,20.0,No,1st year in the programme,98.099998,Almost always,Almost always,Sometimes,Often,9.0,...,,,,,,373.65045,0.0,Yes,,58.599998
4,57.000000,0.0,,2nd year in programme,114.000000,Almost always,Almost always,Almost always,Almost always,12.0,...,Yes,,Once a month,,,106.75727,0.0,Yes,,76.599998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,55.000000,9.0,Yes,1st year in the programme,102.300003,Often,Often,Sometimes,Sometimes,6.0,...,Yes,,Once a month,,,181.48737,1.0,Yes,,23.469999
8581,55.000000,32.0,,2nd year in programme,102.599998,Often,Sometimes,Sometimes,Almost never,4.0,...,No,,,,,213.51454,1.0,Yes,,45.639999
8582,56.000000,45.0,,3rd year in programme,103.800003,Almost always,Almost always,Almost always,Almost always,12.0,...,Yes,,Once a year,,,960.81540,0.0,Yes,,53.290001
8583,57.000000,9.0,Yes,1st year in the programme,102.400002,Almost always,Almost always,Almost always,Almost always,12.0,...,,,,,,0.00000,1.0,Yes,,56.330002


In [134]:
# new datasets as csv
new_train.to_csv('New_Train.csv', index=False)
new_test.to_csv('New_Test.csv', index=False)