Feature engineering

In [34]:
#Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from scipy.stats import randint

from src.utils.split_data import train_val_split, split_features

In [2]:
# Read the preprocessed dataset from the processed-data folder
DATA_PATH = "../../data/processed/"
processed_csv = DATA_PATH + "processed_data.csv"

df = pd.read_csv(processed_csv)

In [3]:
# Preview the loaded DataFrame (feature columns plus the "target" label)
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_87,feature_118,feature_119,feature_139,feature_144,feature_147,feature_158,feature_159,feature_195,target
0,0.661364,1.0,0.0,0.000120,0.310606,0.009740,0.180457,0.752510,0.001050,0.500000,...,2,1,0,0,3,3,5,11,4,0
1,0.672847,0.5,0.0,0.000131,0.327720,0.035065,0.685714,0.777475,0.003627,0.000000,...,0,0,0,0,1,1,6,6,2,0
2,0.728351,0.5,0.0,0.000068,0.245778,0.001299,0.571429,0.712884,0.000604,0.000000,...,1,1,1,4,3,4,6,11,4,0
3,0.575738,0.5,0.0,0.000000,0.259794,0.011688,0.228571,0.624854,0.000986,0.000000,...,1,1,1,1,2,5,0,5,5,0
4,0.505637,0.0,0.0,0.000049,0.332217,0.001948,0.380914,0.847067,0.001845,0.666667,...,1,1,5,7,3,2,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132927,0.532182,1.0,0.0,0.000031,0.203233,0.001299,0.000000,0.757268,0.000000,0.500000,...,0,0,2,1,1,5,3,6,2,1
132928,0.607332,1.0,0.0,0.000106,0.404031,0.023377,0.285714,0.873888,0.002322,0.000000,...,2,0,5,0,1,1,5,6,4,1
132929,0.677294,0.0,0.0,0.000000,0.200944,0.001948,0.190514,0.740053,0.001559,0.500000,...,0,0,3,7,3,4,0,13,2,1
132930,0.607917,1.0,0.0,0.001009,0.278486,0.012987,0.380914,0.754901,0.001145,1.000000,...,0,2,4,7,3,0,1,7,1,1


In [4]:
# Split the data into a training set and a validation set.
# The split logic lives in src.utils.split_data.train_val_split.
train_set, val_set = train_val_split(df)

In [5]:
# Separate each split into its features (X) and the "target" label (y)
X_train, y_train = split_features(train_set, "target")
X_val, y_val = split_features(val_set, "target")

In [27]:
# Train a baseline random forest (50 trees, fixed seed, all CPU cores);
# it is the starting point for the feature-selection step
clf_rnd = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train, y_train)

In [28]:
# Evaluate the baseline forest on the training and validation sets
y_train_pred = clf_rnd.predict(X_train)
y_val_pred = clf_rnd.predict(X_val)

# f1_score expects (y_true, y_pred); keyword arguments make the order explicit.
# Scores are reported for the positive class (label 1).
print("F1 score train:", f1_score(y_true=y_train, y_pred=y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1))

F1 score train: 0.9999911459762889
F1 score val: 0.5984588149536604


In [11]:
# Model selection: randomized search over random-forest hyper-parameters
param_distribs = {
    'min_samples_split': [2, 5, 10],
    'n_estimators': randint(low=8, high=50),
    'max_depth': [None, 10, 20, 30],
}

# Seed the estimator so each candidate forest is reproducible
clf_rnd_test = RandomForestClassifier(random_state=42)

# Seed the search as well, so the sampled candidates (and therefore
# best_params_) are the same on every run; n_jobs=-1 only parallelises the fits.
# NOTE(review): the search optimises 'f1_weighted', while the evaluation cells
# report the positive-class F1 (pos_label=1) - confirm this mismatch is intended.
rnd_search = RandomizedSearchCV(
    clf_rnd_test,
    param_distributions=param_distribs,
    n_iter=8,
    cv=4,
    scoring='f1_weighted',
    random_state=42,
    n_jobs=-1,
)
rnd_search.fit(X_train, y_train)

4 fits failed out of a total of 32.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/josecamacho/Desktop/Projects/datathon_baubap/venv/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/josecamacho/Des

In [12]:
# Show the hyper-parameter combination with the best cross-validated score
rnd_search.best_params_

{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 34}

In [14]:
# Choose the best model found by the randomized search
clf_rnd_optimized = rnd_search.best_estimator_

In [6]:
# Rebuild the optimized forest with hard-coded hyper-parameters
# (n_estimators=34, max_depth=10, min_samples_split=5) and a fixed seed.
# NOTE(review): presumably hard-coded so the search need not be re-run - confirm.
clf_rnd_optimized = RandomForestClassifier(n_estimators=34, max_depth=10, min_samples_split=5, random_state=42, n_jobs=-1)
clf_rnd_optimized.fit(X_train, y_train)

In [7]:
# Evaluate the optimized forest on the training and validation sets
y_train_pred = clf_rnd_optimized.predict(X_train)
y_val_pred = clf_rnd_optimized.predict(X_val)

# f1_score expects (y_true, y_pred); keyword arguments make the order explicit.
# Scores are reported for the positive class (label 1).
print("F1 score train:", f1_score(y_true=y_train, y_pred=y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1))

F1 score train: 0.687891743052788
F1 score val: 0.6135838595410442


In [8]:
# Get the most important features.
# Feature names are taken from X_train (the frame the forest was fitted on)
# rather than from df: df also carries the "target" column, so pairing its
# columns with the importances only lines up if "target" happens to be last
# and zip silently drops it.
importances = clf_rnd_optimized.feature_importances_
assert len(X_train.columns) == len(importances), "feature names and importances are misaligned"

feature_importance = {name: score for name, score in zip(X_train.columns, importances)}
feature_importance_sorted = pd.Series(feature_importance).sort_values(ascending=False)
feature_importance_sorted

feature_122    0.163251
feature_199    0.040870
feature_198    0.038044
feature_171    0.029051
feature_200    0.023264
                 ...   
feature_41     0.000258
feature_102    0.000214
feature_110    0.000183
feature_35     0.000092
feature_115    0.000017
Length: 214, dtype: float64

In [9]:
# Names of the 50 highest-ranked features, in descending order of importance
columns = feature_importance_sorted.index[:50].tolist()
columns

['feature_122',
 'feature_199',
 'feature_198',
 'feature_171',
 'feature_200',
 'feature_68',
 'feature_167',
 'feature_69',
 'feature_143',
 'feature_70',
 'feature_141',
 'feature_72',
 'feature_146',
 'feature_79',
 'feature_34',
 'feature_142',
 'feature_208',
 'feature_78',
 'feature_33',
 'feature_210',
 'feature_217',
 'feature_214',
 'feature_191',
 'feature_58',
 'feature_53',
 'feature_22',
 'feature_6',
 'feature_164',
 'feature_37',
 'feature_16',
 'feature_215',
 'feature_160',
 'feature_21',
 'feature_51',
 'feature_172',
 'feature_52',
 'feature_50',
 'feature_187',
 'feature_73',
 'feature_48',
 'feature_206',
 'feature_44',
 'feature_128',
 'feature_175',
 'feature_24',
 'feature_8',
 'feature_26',
 'feature_40',
 'feature_170',
 'feature_202']

In [10]:
# Restrict both splits to the selected feature columns (independent copies)
X_train_reduced, X_val_reduced = (
    split.loc[:, columns].copy() for split in (X_train, X_val)
)

In [11]:
# Preview the reduced training features
X_train_reduced

Unnamed: 0,feature_122,feature_199,feature_198,feature_171,feature_200,feature_68,feature_167,feature_69,feature_143,feature_70,...,feature_206,feature_44,feature_128,feature_175,feature_24,feature_8,feature_26,feature_40,feature_170,feature_202
74162,0.848317,0.954067,0.939043,0.000000,0.952651,0.022073,0.099188,0.025375,0.002364,0.956522,...,0.070730,1.0,0.2215,0.370730,0.3200,0.727612,0.055983,0.176018,0.496934,0.643334
23928,0.860013,0.897436,0.857143,0.095238,0.897436,0.023992,0.043231,0.027682,0.000000,0.960000,...,0.000000,0.0,0.7797,0.416139,0.2024,0.773332,0.096571,0.050605,0.358278,0.642390
52526,0.894873,0.897436,0.857143,0.000000,0.897436,0.035509,0.111079,0.042676,0.002364,1.000000,...,0.000000,0.0,0.0964,0.002461,0.0864,0.695802,0.075577,0.024202,0.001951,0.642390
96286,0.777969,0.897436,0.857143,0.095238,0.897436,0.024952,0.066732,0.023068,0.018913,0.769231,...,0.023511,0.0,0.1795,0.000000,0.2400,0.634308,0.104969,0.082508,0.000000,0.639257
128858,0.937397,0.948718,0.935671,0.142857,0.947010,0.117083,0.098076,0.114187,0.007092,0.811475,...,0.247605,0.0,0.1904,0.004415,0.2200,0.771397,0.214136,0.067107,0.006092,0.650236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,0.926747,0.954036,0.933214,0.095238,0.954036,0.036468,0.128104,0.042676,0.000000,0.973684,...,0.030846,0.0,0.0524,0.003098,0.2112,0.885799,0.083975,0.079208,0.025429,0.643044
119879,0.879609,0.953067,0.934629,0.047619,0.953067,0.063340,0.143389,0.057670,0.004728,0.757576,...,0.313143,0.0,0.0790,0.000824,0.3104,0.848689,0.041987,0.099010,0.004346,0.642528
103694,0.887663,0.897436,0.857143,0.095238,0.897436,0.039347,0.197140,0.042676,0.007092,0.902439,...,0.000000,0.0,0.1400,0.048946,0.2520,0.570190,0.044087,0.069307,0.108922,0.642390
131932,0.936765,0.954554,0.936700,0.047619,0.954554,0.023992,0.168032,0.027682,0.000000,0.960000,...,0.104641,1.0,0.0661,0.778283,0.1896,0.774389,0.332400,0.064906,0.878663,0.642528


In [12]:
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

In [None]:
# #Train the model
# rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=0.05, C=1000)
# rbf_kernel_svm_clf.fit(X_train_reduced, y_train)

In [None]:
# # Evaluate model
# y_train_pred = rbf_kernel_svm_clf.predict(X_train_reduced)
# y_val_pred = rbf_kernel_svm_clf.predict(X_val_reduced)
#
# print("F1 score train:", f1_score(y_train_pred, y_train, pos_label=1))
# print("F1 score val:", f1_score(y_val_pred, y_val, pos_label=1))

In [14]:
# Define a Bernoulli naive Bayes classifier with a near-zero smoothing term
# (alpha=1e-10) and train it on the reduced feature set.
# NOTE(review): BernoulliNB binarizes its inputs (default threshold 0.0), so
# continuous features are reduced to zero / non-zero - confirm this is intended.
nb_clf = BernoulliNB(alpha=1.0e-10)
nb_clf.fit(X_train_reduced, y_train)

In [15]:
# Evaluate the naive Bayes model on the reduced training and validation sets
y_train_pred = nb_clf.predict(X_train_reduced)
y_val_pred = nb_clf.predict(X_val_reduced)

# f1_score expects (y_true, y_pred); keyword arguments make the order explicit.
# Scores are reported for the positive class (label 1).
print("F1 score train:", f1_score(y_true=y_train, y_pred=y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1))

F1 score train: 0.5466918582274938
F1 score val: 0.5423019431988043


In [32]:
# Train a random forest (45 trees, depth <= 12, min_samples_split=8, fixed seed)
# on the reduced feature set
clf_rnd_wr = RandomForestClassifier(n_estimators=45, max_depth=12, min_samples_split=8, random_state=42, n_jobs=-1)
clf_rnd_wr.fit(X_train_reduced, y_train)

In [33]:
# Evaluate the reduced-feature forest on the training and validation sets
y_train_pred = clf_rnd_wr.predict(X_train_reduced)
y_val_pred = clf_rnd_wr.predict(X_val_reduced)

# f1_score expects (y_true, y_pred); keyword arguments make the order explicit.
# Scores are reported for the positive class (label 1).
print("F1 score train:", f1_score(y_true=y_train, y_pred=y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1))

F1 score train: 0.7042996335031732
F1 score val: 0.6250189998480011


In [35]:
# Shuffle the reduced training features and their labels together, with a fixed seed
X_shuffled, y_shuffled = shuffle(X_train_reduced, y_train, random_state=42)

In [36]:
# Fit a forest with the same hyper-parameters on the shuffled training data.
# This rebinds clf_rnd_wr, replacing any model previously stored under that name.
clf_rnd_wr = RandomForestClassifier(n_estimators=45, max_depth=12, min_samples_split=8, random_state=42, n_jobs=-1)
clf_rnd_wr.fit(X_shuffled, y_shuffled)

In [38]:
# Evaluate the forest trained on shuffled data (train metrics use the shuffled labels)
y_train_pred = clf_rnd_wr.predict(X_shuffled)
y_val_pred = clf_rnd_wr.predict(X_val_reduced)

# f1_score expects (y_true, y_pred); keyword arguments make the order explicit.
# Scores are reported for the positive class (label 1).
print("F1 score train:", f1_score(y_true=y_shuffled, y_pred=y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_true=y_val, y_pred=y_val_pred, pos_label=1))

F1 score train: 0.7070918746585708
F1 score val: 0.6205527561459765


In [39]:
# Split the full dataset into the label vector and the feature matrix
y = df["target"].copy()
X = df.drop(columns="target")