In [None]:
# Bibliotecas
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

# Custom Libraries
from data import get_data, clean_data

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


# Models
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb


# Metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score



In [2]:
data = clean_data(get_data())

Removed 0.01888276947285602% contradictory rows and 8.300884533772024% duplicates


In [42]:
train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)  



In [43]:
train_y = train_df['target']
train_x = train_df.drop(columns=['target'])

val_y = val_df['target']
val_x = val_df.drop(columns=['target'])

test_y = test_df['target']
test_x = test_df.drop(columns=['target'])


scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)
test_x = scaler.transform(test_x)

In [64]:
def print_metrics(y_true, y_pred):
    print('Precision: ', precision_score(y_true, y_pred) * 100, '%')
    print('Recall: ', recall_score(y_true, y_pred) * 100, '%')
    print('F1 Score: ', f1_score(y_true, y_pred) * 100, '%')
    print('Accuracy: ', accuracy_score(y_true, y_pred) * 100, '%')


def metrics(model, train_x = train_x, train_y=train_y, val_x = val_x, val_y = val_y, test_x = test_x, test_y = test_y, treshold=0.5):
    y_pred_train = model.predict(train_x)
    y_pred_val = model.predict(val_x)
    y_pred_test = model.predict(test_x)

    print_metrics(train_y, y_pred_train > treshold)
    print_metrics(val_y, y_pred_val > treshold)
    print_metrics(test_y, y_pred_test >  treshold)


In [48]:
model = LinearRegression()
model.fit(train_x, train_y)

In [None]:
metrics(model)

Precision:  10.989010989010989 %
Recall:  0.1000100010001 %
F1 Score:  0.19821605550049554 %
Accuracy:  98.25416391441763 %
Precision:  3.125 %
Recall:  0.030807147258163897 %
F1 Score:  0.06101281269066504 %
Accuracy:  98.2961194588775 %
Precision:  25.0 %
Recall:  0.212636695018226 %
F1 Score:  0.42168674698795183 %
Accuracy:  98.28052510038073 %


In [51]:
model_tree = tree.DecisionTreeClassifier()
model_tree.fit(train_x, train_y)

In [None]:
metrics(model)

Precision:  10.989010989010989 %
Recall:  0.1000100010001 %
F1 Score:  0.19821605550049554 %
Accuracy:  98.25416391441763 %
Precision:  3.125 %
Recall:  0.030807147258163897 %
F1 Score:  0.06101281269066504 %
Accuracy:  98.2961194588775 %
Precision:  25.0 %
Recall:  0.212636695018226 %
F1 Score:  0.42168674698795183 %
Accuracy:  98.28052510038073 %


In [56]:
model = tree.DecisionTreeClassifier(max_depth=9)
model.fit(train_x, train_y)

In [None]:
metrics(model)

Precision:  81.5661740912523 %
Recall:  75.62756275627562 %
F1 Score:  78.48469122989103 %
Accuracy:  99.2812079035924 %
Precision:  78.35723598435462 %
Recall:  74.06038200862601 %
F1 Score:  76.14824200190054 %
Accuracy:  99.21671425673672 %
Precision:  79.2948717948718 %
Recall:  75.15188335358445 %
F1 Score:  77.16781035558327 %
Accuracy:  99.23856283937005 %


In [58]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(train_x, train_y)

In [None]:
metrics(model)

Precision:  100.0 %
Recall:  99.95999599959995 %
F1 Score:  99.97999399819946 %
Accuracy:  99.99930651992628 %
Precision:  81.15218115218116 %
Recall:  75.07701786814542 %
F1 Score:  77.99647943670988 %
Accuracy:  99.28484867397941 %
Precision:  82.31627296587926 %
Recall:  76.21506682867559 %
F1 Score:  79.14826498422714 %
Accuracy:  99.31241808309235 %


In [None]:
model = lgb.LGBMClassifier()
model.fit(train_x, train_y)
metrics(model)

[LightGBM] [Info] Number of positive: 9999, number of negative: 566802
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1257
[LightGBM] [Info] Number of data points in the train set: 576801, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017335 -> initscore=-4.037525
[LightGBM] [Info] Start training from score -4.037525
Precision:  79.15131094029694 %
Recall:  75.17751775177518 %
F1 Score:  77.11325400082069 %
Accuracy:  99.22642297776876 %
Precision:  75.58359621451103 %
Recall:  73.8139248305607 %
F1 Score:  74.68827930174564 %
Accuracy:  99.15534127021279 %
Precision:  77.0068450528936 %
Recall:  75.18226002430134 %
F1 Score:  76.08361512450045 %
Accuracy:  99.19071296315559 %


In [None]:
model = xgb.XGBClassifier()
model.fit(train_x, train_y)
metrics(model)

Precision:  86.51747940325095 %
Recall:  77.71777177717772 %
F1 Score:  81.88188188188188 %
Accuracy:  99.40378050662187 %
Precision:  80.99117447386287 %
Recall:  73.50585335797905 %
F1 Score:  77.0671834625323 %
Accuracy:  99.26144372149147 %
Precision:  81.90571049136787 %
Recall:  74.93924665856622 %
F1 Score:  78.26776649746193 %
Accuracy:  99.28745293028481 %
