In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost

In [58]:
df = pd.read_csv('EDA_hogares_Buenos_Aires_2019.csv')
pd.set_option('display.max_columns', None)
df.columns

Index(['id', 'nhogar', 'miembro', 'comuna', 'dominio', 'edad', 'sexo',
       'parentesco_jefe', 'situacion_conyugal', 'num_miembro_padre',
       'num_miembro_madre', 'estado_ocupacional', 'cat_ocupacional',
       'calidad_ingresos_lab', 'ingreso_total_lab', 'calidad_ingresos_no_lab',
       'ingreso_total_no_lab', 'calidad_ingresos_totales', 'ingresos_totales',
       'calidad_ingresos_familiares', 'ingresos_familiares',
       'ingreso_per_capita_familiar', 'estado_educativo', 'sector_educativo',
       'nivel_actual', 'nivel_max_educativo', 'años_escolaridad',
       'lugar_nacimiento', 'afiliacion_salud', 'hijos_nacidos_vivos',
       'cantidad_hijos_nac_vivos', 'age_group'],
      dtype='object')

In [59]:
df = df.loc[:, ['ingresos_totales', 'situacion_conyugal', 'comuna', 'cat_ocupacional', 
                'sexo', 'edad', 'cantidad_hijos_nac_vivos', 'años_escolaridad']]

df.head()

Unnamed: 0,ingresos_totales,situacion_conyugal,comuna,cat_ocupacional,sexo,edad,cantidad_hijos_nac_vivos,años_escolaridad
0,6000,Soltero/a,5,No corresponde,Mujer,18,0,12
1,12000,Soltero/a,5,No corresponde,Mujer,18,0,12
2,0,Soltero/a,2,No corresponde,Varon,18,0,12
3,100000,Viudo/a,2,Asalariado,Mujer,50,2,17
4,0,Soltero/a,2,No corresponde,Varon,17,0,10


In [60]:
df2 = df[(df['edad'] >= 16) & 
         ((df['sexo'] == 'Varon') & (df['edad'] <= 65) | (df['sexo'] == 'Mujer') & (df['edad'] <= 60)) & 
         (df['ingresos_totales'] > 0)]


In [61]:
q1 = df2['ingresos_totales'].quantile(0.3)
q3 = df2['ingresos_totales'].quantile(0.7)
iqr = q3 - q1

# Define the income group cutoffs based on the IQR
low_cutoff = abs(q1 - 1.5 * iqr)
high_cutoff = q3 + 1.5 * iqr

# Create a new column for income group based on the 'ingresos_totales' column
df2['grupo_ingresos'] = pd.cut(df2['ingresos_totales'], bins=[-float('inf'), low_cutoff, high_cutoff, float('inf')],
                              labels=['ingreso bajo', 'ingreso medio', 'ingreso alto'])

cols = df2.columns.tolist()
cols = ['grupo_ingresos'] + cols[:-1]
df2 = df2[cols]

df2 = df2.drop(['ingresos_totales'], axis=1)

# Print the first 10 rows of the DataFrame with the new 'income_group' column
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['grupo_ingresos'] = pd.cut(df2['ingresos_totales'], bins=[-float('inf'), low_cutoff, high_cutoff, float('inf')],


Unnamed: 0,grupo_ingresos,situacion_conyugal,comuna,cat_ocupacional,sexo,edad,cantidad_hijos_nac_vivos,años_escolaridad
0,ingreso bajo,Soltero/a,5,No corresponde,Mujer,18,0,12
1,ingreso bajo,Soltero/a,5,No corresponde,Mujer,18,0,12
3,ingreso alto,Viudo/a,2,Asalariado,Mujer,50,2,17
5,ingreso bajo,Unido/a,10,No corresponde,Mujer,18,1,8
6,ingreso medio,Unido/a,10,Asalariado,Varon,21,0,12


In [62]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7336 entries, 0 to 14316
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   grupo_ingresos            7336 non-null   category
 1   situacion_conyugal        7336 non-null   object  
 2   comuna                    7336 non-null   int64   
 3   cat_ocupacional           7336 non-null   object  
 4   sexo                      7336 non-null   object  
 5   edad                      7336 non-null   int64   
 6   cantidad_hijos_nac_vivos  7336 non-null   int64   
 7   años_escolaridad          7308 non-null   object  
dtypes: category(1), int64(3), object(4)
memory usage: 465.8+ KB


In [63]:
df2['situacion_conyugal'].value_counts()
df2.loc[df2['situacion_conyugal'].isin(['Separado/a de unión o matrimonio', 'Divorciado/a', 'Viudo/a']), 'situacion_conyugal'] = 'Soltero/a'
df2.loc[df2['años_escolaridad'] == 'Ningun año de escolaridad aprobado', 'años_escolaridad'] = 0
df2['años_escolaridad'] = df2['años_escolaridad'].fillna(0)
df2['años_escolaridad'] = df2['años_escolaridad'].astype(int)

In [64]:
df2['situacion_conyugal'].value_counts()

situacion_conyugal
Soltero/a    3280
Unido/a      2073
Casado/a     1983
Name: count, dtype: int64

In [65]:
comuna_dummies = pd.get_dummies(df2['comuna'], prefix='comuna', prefix_sep='_').astype(int)
df2 = pd.concat([df2, comuna_dummies], axis=1)
df2.drop('comuna', axis=1, inplace=True)


In [66]:
situacion_dummies = pd.get_dummies(df2['situacion_conyugal'], prefix='situacion_conyugal', prefix_sep='_').astype(int)
df2 = pd.concat([df2, situacion_dummies], axis=1)
df2.drop('situacion_conyugal', axis=1, inplace=True)


In [67]:
df2['sexo'] = df2['sexo'].replace({'Mujer': 0, 'Varon': 1})
df2['sexo'] = df2['sexo'].astype(int)

In [68]:
ocupacion_dummies = pd.get_dummies(df2['cat_ocupacional'], prefix='cat_ocupacional', prefix_sep='_').astype(int)
df2 = pd.concat([df2, ocupacion_dummies], axis=1)
df2.drop('cat_ocupacional', axis=1, inplace=True)
df2 = df2.reset_index(drop=True)
df2.head()

Unnamed: 0,grupo_ingresos,sexo,edad,cantidad_hijos_nac_vivos,años_escolaridad,comuna_1,comuna_2,comuna_3,comuna_4,comuna_5,comuna_6,comuna_7,comuna_8,comuna_9,comuna_10,comuna_11,comuna_12,comuna_13,comuna_14,comuna_15,situacion_conyugal_Casado/a,situacion_conyugal_Soltero/a,situacion_conyugal_Unido/a,cat_ocupacional_Asalariado,cat_ocupacional_No corresponde,cat_ocupacional_Patron/empleador,cat_ocupacional_Trabajador familiar,cat_ocupacional_Trabajador por cuenta propia
0,ingreso bajo,0,18,0,12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
1,ingreso bajo,0,18,0,12,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2,ingreso alto,0,50,2,17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
3,ingreso bajo,0,18,1,8,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
4,ingreso medio,1,21,0,12,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0


In [69]:
df2.columns

Index(['grupo_ingresos', 'sexo', 'edad', 'cantidad_hijos_nac_vivos',
       'años_escolaridad', 'comuna_1', 'comuna_2', 'comuna_3', 'comuna_4',
       'comuna_5', 'comuna_6', 'comuna_7', 'comuna_8', 'comuna_9', 'comuna_10',
       'comuna_11', 'comuna_12', 'comuna_13', 'comuna_14', 'comuna_15',
       'situacion_conyugal_Casado/a', 'situacion_conyugal_Soltero/a',
       'situacion_conyugal_Unido/a', 'cat_ocupacional_Asalariado',
       'cat_ocupacional_No corresponde', 'cat_ocupacional_Patron/empleador',
       'cat_ocupacional_Trabajador familiar',
       'cat_ocupacional_Trabajador por cuenta propia'],
      dtype='object')

In [70]:
#x = df2.iloc[:, 1:] 
x = df2[['edad', 'años_escolaridad', 'cantidad_hijos_nac_vivos', 'sexo', 'cat_ocupacional_Patron/empleador']]
y = df2.iloc[:, 0] 

In [71]:
df2['grupo_ingresos'].value_counts()

grupo_ingresos
ingreso medio    4729
ingreso bajo     2005
ingreso alto      602
Name: count, dtype: int64

In [72]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder object
label_encoder = LabelEncoder()

# Encode the categorical target variable as integers
y = label_encoder.fit_transform(y)

In [73]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Define the resampling strategy
over = SMOTE(sampling_strategy='auto')
under = RandomUnderSampler(sampling_strategy='majority')

# Resample the data
x_resampled, y_resampled = over.fit_resample(x, y)
x_resampled, y_resampled = under.fit_resample(x_resampled, y_resampled)

# Check the class distribution after resampling
print(pd.Series(y_resampled).value_counts())

0    4729
1    4729
2    4729
Name: count, dtype: int64


In [74]:
from sklearn.model_selection import train_test_split
# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)

# Split train set into train and validation
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.125, random_state=42)

print("Training set shape:", x_train.shape, y_train.shape)
print("Validation set shape:", x_val.shape, y_val.shape)
print("Test set shape:", x_test.shape, y_test.shape)


Training set shape: (9930, 5) (9930,)
Validation set shape: (1419, 5) (1419,)
Test set shape: (2838, 5) (2838,)


In [75]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
rf_Model = RandomForestClassifier()
f1_scores = cross_val_score(rf_Model, x_train, y_train, cv=5, scoring='f1_macro')

# Print the mean F1 score rounded to 5 decimal places
print("Random Forest F1 score: ", np.round(np.mean(f1_scores), 5))

Random Forest F1 score:  0.66592


In [76]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize a Gradient Boosting classification model
gbc = GradientBoostingClassifier()

# Calculate the F1 score using cross-validation
f1_scores = cross_val_score(gbc, x_train, y_train, cv=5, scoring='f1_macro')

# Print the mean F1 score rounded to 5 decimal places
print("Gradient Boosting F1 score: ", np.round(np.mean(f1_scores), 5))

Gradient Boosting F1 score:  0.63758


In [77]:
xgb = xgboost.XGBClassifier()

# Calculate the F1 score using cross-validation
f1_scores = cross_val_score(xgb, x_train, y_train, cv=5, scoring='f1_macro')

# Print the mean F1 score rounded to 5 decimal places
print("XGBoost F1 score: ", np.round(np.mean(f1_scores), 5))

XGBoost F1 score:  0.65989


In [78]:
rf_Model.fit(x_train, y_train)
gbc.fit(x_train, y_train)
xgb.fit(x_train, y_train)

rf_yhat = rf_Model.predict(x_test)
gbc_yhat = gbc.predict(x_test)
xgb_yhat = xgb.predict(x_test)

In [79]:
print("Random forest classifier F1 score: %.2f" % f1_score(y_test, rf_yhat, average='macro'))
print("Gradient booster classifier F1 score: %.2f" % f1_score(y_test, gbc_yhat, average='macro'))
print("XGBoost classifier F1 score: %.2f" % f1_score(y_test, xgb_yhat, average='macro'))

Random forest classifier F1 score: 0.68
Gradient booster classifier F1 score: 0.65
XGBoost classifier F1 score: 0.68


In [80]:
import neptune
run = neptune.init_run(
    project="ivanv21/modelo-buenos-aires",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlN2NhNmY0YS00MjY3LTQ5MGUtOWY3OC1iM2VmNzZhMmQ2MjUifQ==",
) 


https://app.neptune.ai/ivanv21/modelo-buenos-aires/e/BA-20


In [81]:
def objective(trial):
    # Set hyperparameters to search
    params = {
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'multi:softmax',
        'num_class': len(np.unique(y_train)),
        'n_jobs': -1,
        'verbosity': 0,
        'random_state': 42
    }
    
    # Train XGB classifier with hyperparameters
    xgb_clf = xgboost.XGBClassifier(**params)
    
    # Use cross-validation to estimate performance on validation set
    f1_scores = cross_val_score(xgb_clf, x_train, y_train, cv=5, scoring='f1_weighted')
    f1 = f1_scores.mean()
    
    # Log F1 score and hyperparameters to Neptune
    run[f'trial_{trial.number}/params'] = params
    run[f'trial_{trial.number}/f1'] = f1
    
    return f1  

In [82]:
import optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print best hyperparameters and F1 score
print('Best hyperparameters: ', study.best_params)
print('Best F1 score: ', study.best_value)

[32m[I 2023-04-20 10:30:04,221][0m A new study created in memory with name: no-name-2714ef52-035b-4d72-bb3d-1bff82e38394[0m
[32m[I 2023-04-20 10:30:05,489][0m Trial 0 finished with value: 0.6371775438742701 and parameters: {'max_depth': 6, 'learning_rate': 0.05260792174706722, 'subsample': 0.5711834120215897, 'colsample_bytree': 0.5547560890832799, 'gamma': 0.03599943011444362, 'min_child_weight': 4}. Best is trial 0 with value: 0.6371775438742701.[0m
[32m[I 2023-04-20 10:30:06,815][0m Trial 1 finished with value: 0.6447903602125966 and parameters: {'max_depth': 7, 'learning_rate': 0.07583581884301202, 'subsample': 0.9966855625288242, 'colsample_bytree': 0.7212191351353949, 'gamma': 0.12629375879720162, 'min_child_weight': 7}. Best is trial 1 with value: 0.6447903602125966.[0m
[32m[I 2023-04-20 10:30:08,113][0m Trial 2 finished with value: 0.6356928293720603 and parameters: {'max_depth': 6, 'learning_rate': 0.029296285202684746, 'subsample': 0.8391362491514214, 'colsample_by

Best hyperparameters:  {'max_depth': 9, 'learning_rate': 0.05652032524118292, 'subsample': 0.9403063280281854, 'colsample_bytree': 0.8291547587295084, 'gamma': 0.14646520293524967, 'min_child_weight': 1}
Best F1 score:  0.6678584366424664


In [83]:
best_params = study.best_params
best_f1 = study.best_value

run['best_params'] = best_params
run['best_f1'] = best_f1

run.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 43 operations to synchronize with Neptune. Do not kill this process.
All 43 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/ivanv21/modelo-buenos-aires/e/BA-20/metadata


In [84]:
x_train_val = np.concatenate((x_train, x_val))
y_train_val = np.concatenate((y_train, y_val))

xgb_opt = xgboost.XGBClassifier(**best_params)
xgb_opt.fit(x_train_val, y_train_val)

In [85]:
x_val_test = np.concatenate((x_val, x_test))
y_val_test = np.concatenate((y_val, y_test))
xgb_opt_yhat = xgb_opt.predict(x_val_test)

print("Optimized XGBoost classifier F1 score: %.2f" % f1_score(y_val_test, xgb_opt_yhat, average='macro'))

Optimized XGBoost classifier F1 score: 0.70


In [86]:
import pickle
with open('xgb_opt_model.pkl', 'wb') as f:
    pickle.dump(xgb_opt, f)

In [87]:
x_train.dtypes

edad                                int64
años_escolaridad                    int32
cantidad_hijos_nac_vivos            int64
sexo                                int32
cat_ocupacional_Patron/empleador    int32
dtype: object