In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


# Base XGBoost model

## 1. Data preprocessing

### Read data

In [2]:
# Load the training set from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='data/train.csv')

### Fill missing data

In [3]:
# Impute missing 'meaneduc' entries with the mean of the observed values of that column.
data = data.assign(meaneduc=data['meaneduc'].fillna(data['meaneduc'].mean()))

In [4]:
# Sanity check: after the mean imputation, 'meaneduc' must contain no missing values.
assert not data['meaneduc'].isna().any(), "'meaneduc' still contains missing values"

In [5]:
# Remove three columns from the frame instead of imputing them.
# NOTE(review): presumably dropped because they are mostly missing — confirm against the data.
data = data.drop(columns=['v2a1', 'v18q1', 'rez_esc'])

### Feature selection

In [6]:
# Map the textual 'no'/'yes' markers in these columns to 0/1, then cast each column to float.
data = data.assign(**{
    column: data[column].replace({'no': 0, 'yes': 1}).astype(float)
    for column in ('edjefe', 'edjefa', 'dependency')
})

In [7]:
# Discard the columns that are not used as model features.
# NOTE(review): these look like identifiers and redundant/derived columns — confirm
# against the dataset's data dictionary.
data = data.drop(
    columns=[
        'Id', 'idhogar', 'hacapo', 'hacdor',
        'r4h1', 'r4h2', 'r4h3', 'r4m1', 'r4m2', 'r4m3', 'r4t1', 'r4t2', 'r4t3',
        'tamhog', 'hhsize', 'male', 'female',
        'parentesco2', 'parentesco3', 'parentesco4', 'parentesco5', 'parentesco6',
        'parentesco7', 'parentesco8', 'parentesco9', 'parentesco10', 'parentesco11',
        'parentesco12',
        'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total', 'mobilephone',
        'SQBescolari', 'SQBage', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin',
        'SQBovercrowding', 'SQBdependency', 'SQBmeaned', 'agesq',
    ],
)

### Label encoder

In [8]:
# Separate the frame into the feature matrix and the label column.
attributes = data.drop(columns='Target')
target = data['Target']

In [9]:
# Fit the encoder on the distinct target labels so it can map them to integer codes.
label_encoder = LabelEncoder().fit(y=target.unique())

In [10]:
# Encode every target label with the fitted encoder.
target_encoded = label_encoder.transform(y=target)

### Train test split

In [11]:
# Stratified 80/20 split with a fixed seed; stratifying on the encoded target keeps
# the class proportions the same in both parts.
(
    train_attributes,
    test_attributes,
    train_target,
    test_target,
) = train_test_split(
    attributes,
    target_encoded,
    train_size=0.8,
    random_state=13,
    stratify=target_encoded,
)

In [12]:
# Per-class sample counts of the test and train splits, in that order.
tuple(np.bincount(split_target) for split_target in (test_target, train_target))

(array([ 151,  319,  242, 1200]), array([ 604, 1278,  967, 4796]))

### Scaler

In [13]:
# Fit the min-max scaler on the training attributes only.
scaler = MinMaxScaler().fit(X=train_attributes)

In [14]:
# Apply the fitted scaler to both splits.
train_attributes_scaled, test_attributes_scaled = map(
    scaler.transform, (train_attributes, test_attributes)
)

In [15]:
# Sanity check: the scaled training data must span [0, 1].
# The comparison uses a tolerance because the scaler's floating-point arithmetic is
# not guaranteed to land on exactly 0.0 / 1.0 (e.g. 1.0000000000000002).
assert np.isclose(train_attributes_scaled.min(), 0.0), "scaled train minimum is not 0"
assert np.isclose(train_attributes_scaled.max(), 1.0), "scaled train maximum is not 1"

## 2. Base model

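For the base model, only the objective is set, using a logistic-type objective so that the task is defined as classification. Since the target has four classes, the fitted model reports `multi:softprob` as its effective objective (see the output below).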

In [16]:
# Baseline classifier: only the objective and the random seed are set.
# The target has four classes, so the multiclass objective is stated explicitly.
# With 'reg:logistic' the fitted model reports 'multi:softprob' anyway, so naming it
# here makes the declared configuration match the one that is actually trained.
xgb_model = xgb.XGBClassifier(objective='multi:softprob', use_label_encoder=False, seed=13)

In [17]:
# Train the baseline model on the scaled training split; the fitted estimator is displayed.
xgb_model.fit(X=train_attributes_scaled, y=train_target)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=13, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              seed=13, subsample=1, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)

In [18]:
# Predict the classes of the same samples the model was trained on (train-set predictions).
predictions = xgb_model.predict(X=train_attributes_scaled)

In [19]:
# Per-class precision/recall/F1 of the train-set predictions against the train labels.
print(classification_report(y_true=train_target, y_pred=predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       604
           1       0.99      0.99      0.99      1278
           2       1.00      0.95      0.97       967
           3       0.99      1.00      0.99      4796

    accuracy                           0.99      7645
   macro avg       0.99      0.98      0.99      7645
weighted avg       0.99      0.99      0.99      7645



The base model shows very low bias on the train set, although poor generalization (high variance) is to be expected. The results are relatively close across the classes, despite the fact that they have very different support. From this it can be concluded that the main goal of the hyperparameter-tuning step is to decrease the possible variance while keeping the bias as low as possible.