In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
def one_hot_to_label_encoding(data, columns, new_column_name = 'new_col', drop_columns = False):
    data[new_column_name] = sum([(idx + 1) * data[col_name] for idx, col_name in enumerate(columns)])
    if drop_columns:
        data = data.drop(columns, axis=1)
    return data

In [3]:
def filter_by_name_start(data, name_start):
    filtered_columns = data.columns[data.columns.str.startswith(name_start)]
    return filtered_columns

## 1. Data preprocessing
### Read data

In [4]:
data = pd.read_csv('data/train.csv')

### Fill missing data

In [5]:
data['meaneduc'] = data['meaneduc'].fillna(data['meaneduc'].mean())

In [6]:
assert data['meaneduc'].isna().any() == False

In [7]:
data = data.drop(['v2a1', 'v18q1', 'rez_esc'], axis = 1)

### Feature engineering

In [8]:
data['edjefe'] = data['edjefe'].replace({'no': 0, 'yes': 1}).astype(float)
data['edjefa'] = data['edjefa'].replace({'no': 0, 'yes': 1}).astype(float)
data['dependency'] = data['dependency'].replace({'no': 0, 'yes': 1}).astype(float)

In [9]:
data['mobilephone_per_person'] = np.where(data['qmobilephone'] > 0, data['qmobilephone'] / data['r4t2'], 0)
data['people_per_room'] = data['rooms'] / data['tamviv']
data['people_per_bedroom'] = data['bedrooms'] / data['tamviv']
data['paying_rent'] = np.where(
    (data['tipovivi2'] == 1) | (data['tipovivi3'] == 1) | (data['tipovivi4'] == 1),
    1, 0)

In [10]:
data = data.set_index('Id')

In [11]:
data = data.drop(['idhogar', 'rooms', 'bedrooms', 'qmobilephone', 'mobilephone', 'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5',
                 'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned',
                 'agesq'], axis = 1)

### Label encoder
#### Categorical attributes

In [12]:
outside_material_cols = filter_by_name_start(data, 'pared')
floor_material_cols = filter_by_name_start(data, 'piso')
roof_material_cols = filter_by_name_start(data, 'techo')
water_provision_cols = filter_by_name_start(data, 'abastagua')
electricity_cols = data[['public', 'planpri', 'noelec', 'coopele']]
waste_water_connection_cols = filter_by_name_start(data, 'sanitario')
energy_source_for_cooking_cols = filter_by_name_start(data, 'energcocinar')
rubbish_disposal_cols = filter_by_name_start(data, 'elimbasu')
marital_status_cols = filter_by_name_start(data, 'estadocivil')
family_relation_cols = filter_by_name_start(data, 'parentesco')
level_of_education_cols = filter_by_name_start(data, 'instlevel')
region_cols = filter_by_name_start(data, 'lugar')
walls_condition_cols = filter_by_name_start(data, 'epared')
roof_condition_cols = filter_by_name_start(data, 'etecho')
floor_condition_cols = filter_by_name_start(data, 'eviv')

In [13]:
data = one_hot_to_label_encoding(data, outside_material_cols, new_column_name = 'outside_material', drop_columns = True)
data = one_hot_to_label_encoding(data, floor_material_cols, new_column_name = 'floor_material', drop_columns = True)
data = one_hot_to_label_encoding(data, roof_material_cols, new_column_name = 'roof_material', drop_columns = True)
data = one_hot_to_label_encoding(data, water_provision_cols, new_column_name = 'water_provisio', drop_columns = True)
data = one_hot_to_label_encoding(data, electricity_cols, new_column_name = 'electricity', drop_columns = True)
data = one_hot_to_label_encoding(data, waste_water_connection_cols, new_column_name = 'waste_water_connection', drop_columns = True)
data = one_hot_to_label_encoding(data, energy_source_for_cooking_cols, new_column_name = 'energy_for_cooking', drop_columns = True)
data = one_hot_to_label_encoding(data, rubbish_disposal_cols, new_column_name = 'rubbish_disposal', drop_columns = True)
data = one_hot_to_label_encoding(data, marital_status_cols, new_column_name = 'marital_status', drop_columns = True)
data = one_hot_to_label_encoding(data, family_relation_cols, new_column_name = 'family_relation', drop_columns = True)
data = one_hot_to_label_encoding(data, level_of_education_cols, new_column_name = 'level_of_education', drop_columns = True)
data = one_hot_to_label_encoding(data, region_cols, new_column_name = 'region', drop_columns = True)
data = one_hot_to_label_encoding(data, walls_condition_cols, new_column_name = 'walls_condition', drop_columns = True)
data = one_hot_to_label_encoding(data, roof_condition_cols, new_column_name = 'roof_condition', drop_columns = True)
data = one_hot_to_label_encoding(data, floor_condition_cols, new_column_name = 'floor_condition', drop_columns = True)

In [14]:
data = data.drop(['r4h1', 'r4h2', 'r4m1', 'r4m2', 'r4t1', 'r4t2', 'r4t3',
                  'male', 'female', 'hogar_total'], axis = 1)

In [15]:
numerical_columns = ['r4h3', 'r4m3', 'tamhog', 'tamhog', 'escolari', 'hhsize', 'hogar_nin', 'hogar_adul', 'hogar_mayor', 'dependency', 
                    'edjefe', 'edjefa', 'meaneduc', 'overcrowding', 'age', 'mobilephone_per_person', 'people_per_room', 'people_per_bedroom']
categorical_columns = data.columns[~data.columns.isin(numerical_columns)]

In [16]:
for cat_col in categorical_columns:
    data[cat_col] = data[cat_col].astype('category')

#### Target

In [17]:
labels_original = ['extreme poverty', 'moderate poverty', 'vulnerable households', 'non vulnerable households']

In [18]:
target = data['Target']
attributes = data.drop(['Target'], axis = 1)

In [19]:
label_encoder = LabelEncoder().fit(target.unique())
target_encoded = label_encoder.transform(target)

### Train test split

The data is split 80/20 for train and test. Because of the imbalance of the classes stratification is applied.

In [20]:
train_attributes, test_attributes, train_target, test_target = train_test_split(attributes, target_encoded, train_size = 0.8, stratify = target_encoded, random_state=13)
np.bincount(test_target), np.bincount(train_target)

(array([ 151,  319,  242, 1200]), array([ 604, 1278,  967, 4796]))

### Scaler

In [21]:
scaler = MinMaxScaler().fit(train_attributes)

In [22]:
train_attributes_scaled = scaler.transform(train_attributes)
test_attributes_scaled = scaler.transform(test_attributes)

In [23]:
(train_attributes_scaled.max(), train_attributes_scaled.min())

(1.0000000000000002, 0.0)

### Feature selection
#### Mutual Information

After a test data is set aside, a feature selection using mutual information classification will be applied. As seen from the result below, the highest value is only 0.158925, which means that the features left have low dependencies. Because of that features won't be dropped.

In [24]:
mutual_info = mutual_info_classif(train_attributes_scaled, train_target)
mutual_info = pd.Series(mutual_info)

In [25]:
mutual_info.sort_values(ascending = False)

19    0.152779
16    0.098945
27    0.096100
28    0.073452
13    0.065944
20    0.062964
26    0.062818
17    0.058873
40    0.057215
9     0.055676
11    0.054685
30    0.046296
44    0.043863
42    0.043748
31    0.040662
10    0.038187
4     0.036995
43    0.033803
18    0.033435
14    0.031943
38    0.028726
8     0.027758
5     0.024909
37    0.024599
21    0.024546
35    0.023256
41    0.022888
36    0.019760
22    0.019425
25    0.018718
0     0.017326
7     0.017251
24    0.016604
6     0.016209
29    0.014572
23    0.010186
39    0.008241
3     0.006733
33    0.005652
2     0.003159
32    0.000000
34    0.000000
1     0.000000
15    0.000000
12    0.000000
dtype: float64

## 2. Model

A XGB classifier is used for the task. The tuned parameters are as follows:
- `gamma` (Minimum loss reduction required to make a further partition on a leaf node of the tree.) - 1
- `min_child_weight` (Minimum sum of instance weight (hessian) needed in a child.) - 5
- `learning_rate` - 0.08
- `max_depth` - 7
- `n_estimators` - 300
- `max_delta_step` (Maximum delta step we allow each leaf output to be.) - 4

In [26]:
tuned_model = xgb.XGBClassifier(objective = 'multi:softmax',
                                num_class=4,
                                gamma = 1,
                                min_child_weight = 5,
                                learning_rate = 0.08,
                                max_depth = 7,
                                n_estimators = 300,
                                max_delta_step = 4,
                                eval_metric = 'merror',
                                use_label_encoder = False,
                                seed = 13)

In [27]:
tuned_model.fit(train_attributes_scaled, train_target)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='merror', gamma=1, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.08, max_delta_step=4,
              max_depth=7, min_child_weight=5, missing=nan,
              monotone_constraints='()', n_estimators=300, n_jobs=8,
              num_class=4, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=13, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=None, seed=13, subsample=1, tree_method='exact', ...)

### Evaluation

In [28]:
print(classification_report(train_target, tuned_model.predict(train_attributes_scaled)))

              precision    recall  f1-score   support

           0       0.99      0.94      0.97       604
           1       0.98      0.94      0.96      1278
           2       0.97      0.92      0.95       967
           3       0.97      1.00      0.99      4796

    accuracy                           0.97      7645
   macro avg       0.98      0.95      0.96      7645
weighted avg       0.97      0.97      0.97      7645



In [29]:
print(classification_report(test_target, tuned_model.predict(test_attributes_scaled)))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82       151
           1       0.89      0.80      0.84       319
           2       0.92      0.71      0.80       242
           3       0.91      0.99      0.95      1200

    accuracy                           0.90      1912
   macro avg       0.90      0.82      0.85      1912
weighted avg       0.90      0.90      0.90      1912



As seen from the results below, the recall is lower that the precision for more of the classes. This means that the model gives some false negative. 

Lowest f-1 score is present for class 2 (vulnerable households), despite the fact that it's not the class with least samples, which leads to the conclusion that it's the hardest to distinguish. 

The highest f-1 score of 95% is for class 4 (non-vulnerable households), which also has most of the records in the dataset.