In [6]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [7]:
def one_hot_to_label_encoding(data, columns, new_column_name = 'new_col', drop_columns = False):
    data[new_column_name] = sum([(idx + 1) * data[col_name] for idx, col_name in enumerate(columns)])
    if drop_columns:
        data = data.drop(columns, axis=1)
    return data

In [8]:
def filter_by_name_start(data, name_start):
    filtered_columns = data.columns[data.columns.str.startswith(name_start)]
    return filtered_columns

## 1. Data preprocessing
### Read data

In [9]:
data = pd.read_csv('data/train.csv')

### Fill missing data

In [10]:
data['meaneduc'] = data['meaneduc'].fillna(data['meaneduc'].mean())

In [11]:
assert data['meaneduc'].isna().any() == False

In [12]:
data = data.drop(['v2a1', 'v18q1', 'rez_esc'], axis = 1)

### Feature engineering

In [13]:
data['edjefe'] = data['edjefe'].replace({'no': 0, 'yes': 1}).astype(float)
data['edjefa'] = data['edjefa'].replace({'no': 0, 'yes': 1}).astype(float)
data['dependency'] = data['dependency'].replace({'no': 0, 'yes': 1}).astype(float)

In [14]:
data['mobilephone_per_person'] = np.where(data['qmobilephone'] > 0, data['qmobilephone'] / data['r4t2'], 0)
data['people_per_room'] = data['rooms'] / data['tamviv']
data['people_per_bedroom'] = data['bedrooms'] / data['tamviv']
data['paying_rent'] = np.where(
    (data['tipovivi2'] == 1) | (data['tipovivi3'] == 1) | (data['tipovivi4'] == 1),
    1, 0)

In [15]:
data = data.set_index('Id')

In [16]:
data = data.drop(['idhogar', 'rooms', 'bedrooms', 'qmobilephone', 'mobilephone', 'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5',
                 'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
                 'agesq'], axis = 1)

### Label encoder
#### Categorical attributes

In [17]:
outside_material_cols = filter_by_name_start(data, 'pared')
floor_material_cols = filter_by_name_start(data, 'piso')
roof_material_cols = filter_by_name_start(data, 'techo')
water_provision_cols = filter_by_name_start(data, 'abastagua')
electricity_cols = data[['public', 'planpri', 'noelec', 'coopele']]
waste_water_connection_cols = filter_by_name_start(data, 'sanitario')
energy_source_for_cooking_cols = filter_by_name_start(data, 'energcocinar')
rubbish_disposal_cols = filter_by_name_start(data, 'elimbasu')
marital_status_cols = filter_by_name_start(data, 'estadocivil')
family_relation_cols = filter_by_name_start(data, 'parentesco')
level_of_education_cols = filter_by_name_start(data, 'instlevel')
region_cols = filter_by_name_start(data, 'lugar')
walls_condition_cols = filter_by_name_start(data, 'epared')
roof_condition_cols = filter_by_name_start(data, 'etecho')
floor_condition_cols = filter_by_name_start(data, 'eviv')

In [18]:
data = one_hot_to_label_encoding(data, outside_material_cols, new_column_name = 'outside_material', drop_columns = True)
data = one_hot_to_label_encoding(data, floor_material_cols, new_column_name = 'floor_material', drop_columns = True)
data = one_hot_to_label_encoding(data, roof_material_cols, new_column_name = 'roof_material', drop_columns = True)
data = one_hot_to_label_encoding(data, water_provision_cols, new_column_name = 'water_provisio', drop_columns = True)
data = one_hot_to_label_encoding(data, electricity_cols, new_column_name = 'electricity', drop_columns = True)
data = one_hot_to_label_encoding(data, waste_water_connection_cols, new_column_name = 'waste_water_connection', drop_columns = True)
data = one_hot_to_label_encoding(data, energy_source_for_cooking_cols, new_column_name = 'energy_for_cooking', drop_columns = True)
data = one_hot_to_label_encoding(data, rubbish_disposal_cols, new_column_name = 'rubbish_disposal', drop_columns = True)
data = one_hot_to_label_encoding(data, marital_status_cols, new_column_name = 'marital_status', drop_columns = True)
data = one_hot_to_label_encoding(data, family_relation_cols, new_column_name = 'family_relation', drop_columns = True)
data = one_hot_to_label_encoding(data, level_of_education_cols, new_column_name = 'level_of_education', drop_columns = True)
data = one_hot_to_label_encoding(data, region_cols, new_column_name = 'region', drop_columns = True)
data = one_hot_to_label_encoding(data, walls_condition_cols, new_column_name = 'walls_condition', drop_columns = True)
data = one_hot_to_label_encoding(data, roof_condition_cols, new_column_name = 'roof_condition', drop_columns = True)
data = one_hot_to_label_encoding(data, floor_condition_cols, new_column_name = 'floor_condition', drop_columns = True)

In [19]:
data = data.drop(['r4h1', 'r4h2', 'r4m1', 'r4m2', 'r4t1', 'r4t2', 'r4t3',
                  'male', 'female', 'hogar_total'], axis = 1)

In [20]:
numerical_columns = ['r4h3', 'r4m3', 'tamhog', 'tamhog', 'escolari', 'hhsize', 'hogar_nin', 'hogar_adul', 'hogar_mayor', 'dependency', 
                    'edjefe', 'edjefa', 'meaneduc', 'overcrowding', 'age', 'mobilephone_per_person', 'people_per_room', 'people_per_bedroom']
categorical_columns = data.columns[~data.columns.isin(numerical_columns)]

In [21]:
for cat_col in categorical_columns:
    data[cat_col] = data[cat_col].astype('category')

#### Target

In [22]:
labels_original = ['extreme poverty', 'moderate poverty', 'vulnerable households', 'non vulnerable households']

In [23]:
target = data['Target']
attributes = data.drop(['Target'], axis = 1)

In [24]:
label_encoder = LabelEncoder().fit(target.unique())
target_encoded = label_encoder.transform(target)

### Train test split

The data is split 80/20 for train and test. Because of the imbalance of the classes stratification is applied.

In [25]:
train_attributes, test_attributes, train_target, test_target = train_test_split(attributes, target_encoded, train_size = 0.8, stratify = target_encoded, random_state=13)
np.bincount(test_target), np.bincount(train_target)

(array([ 151,  319,  242, 1200]), array([ 604, 1278,  967, 4796]))

### Scaler

In [26]:
scaler = MinMaxScaler().fit(train_attributes)

In [27]:
train_attributes_scaled = scaler.transform(train_attributes)
test_attributes_scaled = scaler.transform(test_attributes)

In [28]:
(train_attributes_scaled.max(), train_attributes_scaled.min())

(1.0000000000000002, 0.0)

### Feature selection
#### Mutual Information

After a test data is set aside, a feature selection using mutual information classification will be applied. As seen from the result below, the highest value is only 0.158925, which means that the features left have low dependencies. Because of that features won't be dropped.

In [34]:
mutual_info = mutual_info_classif(train_attributes_scaled, train_target)
mutual_info = pd.Series(mutual_info)

In [33]:
mutual_info.sort_values(ascending = False)

19    0.158925
16    0.102031
27    0.093842
28    0.080420
20    0.076426
13    0.066263
9     0.065355
26    0.061504
42    0.058973
44    0.058612
17    0.053141
40    0.053069
30    0.047517
18    0.046600
11    0.046234
31    0.040051
4     0.033632
43    0.031303
8     0.026902
7     0.025700
14    0.025397
37    0.023296
21    0.021160
10    0.020018
6     0.019501
36    0.019229
33    0.016460
5     0.015535
25    0.015124
15    0.014533
38    0.014469
23    0.014149
41    0.014080
22    0.013770
24    0.011015
39    0.009420
0     0.009280
2     0.007402
35    0.005621
1     0.004972
3     0.003917
32    0.002247
34    0.000000
29    0.000000
12    0.000000
dtype: float64