Feature engineering

In [20]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from scipy.stats import randint

from src.utils.split_data import train_val_split, split_features

In [23]:
# Load the preprocessed dataset from the processed-data directory.
# NOTE(review): the preview of df shows an "Unnamed: 0" column, which usually
# means the CSV was written together with its index — confirm whether that
# column is meant to reach the model as a feature.
DATA_PATH = "../../data/processed/"

df = pd.read_csv(f"{DATA_PATH}processed_data.csv")

In [24]:
# Preview the loaded frame (pandas truncates the rendering of large frames).
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195,target
0,0.661364,1.0,0.0,0.000120,0.310606,0.009740,0.180457,0.752510,0.001050,0.500000,...,2,1,0,0,3,3,5,11,4,0
1,0.672847,0.5,0.0,0.000131,0.327720,0.035065,0.685714,0.777475,0.003627,0.000000,...,0,0,0,0,1,1,6,6,2,0
2,0.728351,0.5,0.0,0.000068,0.245778,0.001299,0.571429,0.712884,0.000604,0.000000,...,1,1,1,4,3,4,6,11,4,0
3,0.575738,0.5,0.0,0.000000,0.259794,0.011688,0.228571,0.624854,0.000986,0.000000,...,1,1,1,1,2,5,0,5,5,0
4,0.505637,0.0,0.0,0.000049,0.332217,0.001948,0.380914,0.847067,0.001845,0.666667,...,1,1,5,7,3,2,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132927,0.532182,1.0,0.0,0.000031,0.203233,0.001299,0.000000,0.757268,0.000000,0.500000,...,0,0,2,1,1,5,3,6,2,1
132928,0.607332,1.0,0.0,0.000106,0.404031,0.023377,0.285714,0.873888,0.002322,0.000000,...,2,0,5,0,1,1,5,6,4,1
132929,0.677294,0.0,0.0,0.000000,0.200944,0.001948,0.190514,0.740053,0.001559,0.500000,...,0,0,3,7,3,4,0,13,2,1
132930,0.607917,1.0,0.0,0.001009,0.278486,0.012987,0.380914,0.754901,0.001145,1.000000,...,0,2,4,7,3,0,1,7,1,1


In [25]:
# Split the data into a training set and a validation set.
# NOTE(review): the split strategy (ratio, shuffling, stratification) is defined
# in src.utils.split_data.train_val_split and is not visible here — confirm.
train_set, val_set = train_val_split(df)

In [26]:
# Separate the predictors from the "target" column for each split,
# training split first, validation split second.
(X_train, y_train), (X_val, y_val) = (
    split_features(subset, "target") for subset in (train_set, val_set)
)

In [27]:
# Fit a baseline random forest (50 trees, fixed seed, all CPU cores) on the
# training split; a forest is used because it exposes feature importances
# for feature selection.
clf_rnd = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    n_estimators=50,
)
clf_rnd.fit(X_train, y_train)

In [28]:
# Evaluate the baseline model on the training and validation splits.
y_train_pred = clf_rnd.predict(X_train)
y_val_pred = clf_rnd.predict(X_val)

# f1_score takes (y_true, y_pred) in that order. Binary F1 is symmetric, so the
# printed values are unchanged, but the correct order keeps this cell safe if
# the metric is later swapped for precision/recall or an averaged F1.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_val, y_val_pred, pos_label=1))

F1 score train: 0.9999911459762889
F1 score val: 0.5984588149536604


In [11]:
# Model selection: randomized search over the forest size, tree depth and the
# minimum number of samples required to split a node.
param_distribs = {
    'min_samples_split': [2, 5, 10],
    'n_estimators': randint(low=8, high=50),
    'max_depth': [None, 10, 20, 30],
}

# Seed both the estimator and the search: without a fixed random_state the
# sampled candidates and the fitted forests differ on every run, so
# best_params_ cannot be reproduced.
clf_rnd_test = RandomForestClassifier(random_state=42)

# NOTE(review): candidates are ranked with 'f1_weighted', while the evaluation
# cells report the binary F1 of class 1 — confirm that this mismatch is intended.
# NOTE(review): the saved output reports 4 failed fits (one candidate x 4 folds)
# raised during parameter validation; none of the values listed above look
# invalid, so that output may predate this grid — re-run with
# error_score='raise' to confirm.
rnd_search = RandomizedSearchCV(
    clf_rnd_test,
    param_distributions=param_distribs,
    n_iter=8,
    cv=4,
    scoring='f1_weighted',
    random_state=42,
)
rnd_search.fit(X_train, y_train)

4 fits failed out of a total of 32.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/josecamacho/Des

In [12]:
# Show the hyper-parameter combination that scored best in the randomized search.
rnd_search.best_params_

{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 34}

In [14]:
# Choose the best model found by the randomized search.
# NOTE(review): clf_rnd_optimized is reassigned in the next cell, so the
# estimator taken here is not the one evaluated afterwards.
clf_rnd_optimized = rnd_search.best_estimator_

In [35]:
# Re-create the tuned forest with a fixed seed so its scores are repeatable.
# The three tuned values are copied by hand from the rnd_search.best_params_
# output; NOTE(review): they go stale if the search is re-run and picks a
# different candidate.
clf_rnd_optimized = RandomForestClassifier(
    n_estimators=34,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
clf_rnd_optimized.fit(X_train, y_train)

In [36]:
# Evaluate the tuned model on the training and validation splits.
y_train_pred = clf_rnd_optimized.predict(X_train)
y_val_pred = clf_rnd_optimized.predict(X_val)

# f1_score takes (y_true, y_pred) in that order. Binary F1 is symmetric, so the
# printed values are unchanged, but the correct order keeps this cell safe if
# the metric is later swapped for precision/recall or an averaged F1.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_val, y_val_pred, pos_label=1))

F1 score train: 0.687891743052788
F1 score val: 0.6135838595410442


In [40]:
# Get the most important features.
# Label every importance with the column name the fitted model actually saw.
# Zipping against list(df) is fragile: df still holds "target" (and any column
# split_features may drop), so names and scores only line up when df's column
# order happens to match X_train exactly; otherwise they are silently shifted.
# feature_names_in_ is only set when the model was fit on a DataFrame whose
# column names are all strings.
feature_names = clf_rnd_optimized.feature_names_in_
feature_importance = dict(zip(feature_names, clf_rnd_optimized.feature_importances_))
feature_importance_sorted = pd.Series(feature_importance).sort_values(ascending=False)
feature_importance_sorted

feature_122    0.163251
feature_199    0.040870
feature_198    0.038044
feature_171    0.029051
feature_200    0.023264
                 ...   
feature_41     0.000258
feature_102    0.000214
feature_110    0.000183
feature_35     0.000092
feature_115    0.000017
Length: 214, dtype: float64

In [41]:
# Names of the 30 most important features, in descending order of importance.
top_features = feature_importance_sorted.head(30)
columns = top_features.index.tolist()
columns

['feature_122',
 'feature_199',
 'feature_198',
 'feature_171',
 'feature_200',
 'feature_68',
 'feature_167',
 'feature_69',
 'feature_143',
 'feature_70',
 'feature_141',
 'feature_72',
 'feature_146',
 'feature_79',
 'feature_34',
 'feature_142',
 'feature_208',
 'feature_78',
 'feature_33',
 'feature_210',
 'feature_217',
 'feature_214',
 'feature_191',
 'feature_58',
 'feature_53',
 'feature_22',
 'feature_6',
 'feature_164',
 'feature_37',
 'feature_16']