In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from scipy.stats import randint

from src.utils.split_data import train_val_split, split_features

In [2]:
# Read the two processed CSV files from the shared data directory.
# Note that `test_data` is the frame read from processed_validation_data.csv.
DATA_PATH = "../../data/processed/"

train_data, test_data = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ("processed_data.csv", "processed_validation_data.csv")
)

In [6]:
# Peel the "target" column off each frame with the project helper; the first
# element of each pair is used as the feature table, the second as the labels.
(X_train, y_train), (X_val, y_val) = (
    split_features(frame, "target") for frame in (train_data, test_data)
)

In [7]:
# Baseline random forest trained on every available feature.
clf_rnd = RandomForestClassifier(
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
    n_estimators=45,
    max_depth=12,
    min_samples_split=8,
)
clf_rnd.fit(X_train, y_train)

In [18]:
# Evaluate the baseline model on the data it was fitted on and on the
# validation frame.
y_train_pred = clf_rnd.predict(X_train)
y_val_pred = clf_rnd.predict(X_val)

# f1_score is documented as f1_score(y_true, y_pred): ground truth goes first.
# Binary F1 gives the same number either way, but keeping the documented order
# stays correct if the averaging mode or the metric is ever changed.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_val, y_val_pred, pos_label=1))

F1 score train: 0.7403505015904086
F1 score val: 0.7115824718896135


In [19]:
# Split the frame read from processed_validation_data.csv into two parts with
# the project helper train_val_split.
# NOTE(review): X_val / y_val were built from the whole of test_data, so
# test_set is presumably a subset of rows already scored as "val" — confirm
# this overlap is intended.
# NOTE(review): validation_set is not used by any of the visible cells.
validation_set, test_set = train_val_split(test_data)

In [22]:
# Separate the "target" column of the held-out split from the remaining columns.
# Presumably returns (features, labels) like the earlier split_features calls —
# verify against src.utils.split_data.
X_test, y_test = split_features(test_set, "target")

In [23]:
# Evaluate the baseline model on the training data and on the held-out test
# split produced by train_val_split.
y_train_pred = clf_rnd.predict(X_train)
y_test_pred = clf_rnd.predict(X_test)

# f1_score is documented as f1_score(y_true, y_pred): ground truth goes first.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label=1))
# This score is computed on X_test / y_test, so it is reported as "test"
# (it was previously printed under the "val" label).
print("F1 score test:", f1_score(y_test, y_test_pred, pos_label=1))

F1 score train: 0.7403505015904086
F1 score val: 0.7114700784446868


In [9]:
# Rank the features by the importance scores of the fitted baseline forest.
# The names are taken from X_train (the frame the model was fitted on) so that
# each score lines up with its own column. Zipping against list(train_data)
# would also walk over the "target" column, which presumably is still part of
# that frame, and shift every name that follows it by one position.
feature_importance = dict(zip(X_train.columns, clf_rnd.feature_importances_))
feature_importance_sorted = pd.Series(feature_importance).sort_values(ascending=False)
feature_importance_sorted

feature_122    0.133662
feature_199    0.029595
feature_198    0.028962
feature_171    0.020812
feature_200    0.020511
                 ...   
feature_192    0.000358
feature_41     0.000356
feature_89     0.000201
feature_115    0.000106
feature_35     0.000039
Length: 214, dtype: float64

In [10]:
# Names of the 50 highest-ranked features, most important first.
columns = feature_importance_sorted.index[:50].tolist()
columns

['feature_122',
 'feature_199',
 'feature_198',
 'feature_171',
 'feature_200',
 'feature_167',
 'feature_68',
 'feature_69',
 'feature_72',
 'feature_141',
 'feature_34',
 'feature_70',
 'feature_143',
 'feature_53',
 'feature_79',
 'feature_142',
 'feature_146',
 'feature_208',
 'feature_78',
 'feature_214',
 'feature_33',
 'feature_164',
 'feature_187',
 'feature_210',
 'feature_37',
 'feature_22',
 'feature_51',
 'feature_217',
 'feature_128',
 'feature_52',
 'feature_8',
 'feature_160',
 'feature_21',
 'feature_215',
 'feature_191',
 'feature_24',
 'feature_1',
 'feature_6',
 'feature_85',
 'feature_40',
 'feature_50',
 'feature_5',
 'feature_73',
 'feature_205',
 'feature_188',
 'feature_193',
 'feature_83',
 'feature_135',
 'feature_16',
 'feature_86']

In [11]:
# Keep only the selected feature columns; .copy() gives independent frames so
# later edits cannot touch X_train / X_val.
X_train_reduced = X_train.loc[:, columns].copy()
X_val_reduced = X_val.loc[:, columns].copy()

In [12]:
# Same forest configuration as the baseline, fitted on the reduced feature set.
clf_rnd_wr = RandomForestClassifier(
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
    n_estimators=45,
    max_depth=12,
    min_samples_split=8,
)
clf_rnd_wr.fit(X_train_reduced, y_train)

In [13]:
# Evaluate the reduced-feature model on its training data and on the
# validation frame restricted to the same columns.
y_train_pred = clf_rnd_wr.predict(X_train_reduced)
y_val_pred = clf_rnd_wr.predict(X_val_reduced)

# f1_score is documented as f1_score(y_true, y_pred): ground truth goes first.
print("F1 score train:", f1_score(y_train, y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_val, y_val_pred, pos_label=1))

F1 score train: 0.6907319453232803
F1 score val: 0.7099926067369116


In [16]:
# Same forest configuration as the baseline, but fitted on the validation
# frame (X_val / y_val) instead of the training frame.
clf_rnd_test = RandomForestClassifier(
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,
    n_estimators=45,
    max_depth=12,
    min_samples_split=8,
)
clf_rnd_test.fit(X_val, y_val)

In [17]:
# Evaluate the model that was fitted on the validation frame.
# The roles are reversed in this cell: clf_rnd_test was fitted on X_val, so
# X_val plays the part of "train" here and X_train is the unseen data.
# The variable names are kept as they were; read them by role, not by frame.
y_train_pred = clf_rnd_test.predict(X_val)  # predictions on the fitting data
y_val_pred = clf_rnd_test.predict(X_train)  # predictions on the unseen data

# f1_score is documented as f1_score(y_true, y_pred): ground truth goes first.
print("F1 score train:", f1_score(y_val, y_train_pred, pos_label=1))
print("F1 score val:", f1_score(y_train, y_val_pred, pos_label=1))

F1 score train: 0.9199289568578359
F1 score val: 0.6667034460621807
