In [1]:
# Standard library
from pathlib import Path

# Libraries
import joblib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Custom Libraries
from data import get_data, clean_data

# Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


# Metrics
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score



In [2]:
# Load the raw dataset first, then run it through the project's cleaning step.
# Keeping the raw frame under its own name makes the lineage explicit.
raw_data = get_data()
data = clean_data(raw_data)

Removed 0.01888276947285602% contradictory rows and 8.300884533772024% duplicates


In [3]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# (i.e. 20% of the total) for validation: a 60/20/20 train/val/test split.
train_val_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

# Separate the label column ('target') from the feature columns in every split.
train_x, train_y = train_df.drop(columns=['target']), train_df['target']
val_x, val_y = val_df.drop(columns=['target']), val_df['target']
test_x, test_y = test_df.drop(columns=['target']), test_df['target']

# The scaler is fitted on the training features only; validation and test
# features are transformed with the ranges learned from the training split.
scaler = MinMaxScaler().fit(train_x)
train_x, val_x, test_x = (
    scaler.transform(features) for features in (train_x, val_x, test_x)
)

In [5]:
def print_metrics(y_true, y_pred):
    """Print precision, recall, F1 score and accuracy as percentages.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
    """
    # (label, scoring function) pairs, printed in this fixed order.
    scorers = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1 Score: ', f1_score),
        ('Accuracy: ', accuracy_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_true, y_pred) * 100, '%')


def metrics(model, train_x = train_x, train_y=train_y, val_x = val_x, val_y = val_y, test_x = test_x, test_y = test_y, treshold=0.5):
    """Print classification metrics of a fitted model on train, val and test.

    Args:
        model: fitted classifier exposing ``predict`` and, optionally,
            ``predict_proba``.
        train_x, train_y: training features and labels.
        val_x, val_y: validation features and labels.
        test_x, test_y: test features and labels.
        treshold: decision threshold applied to the positive-class score.
            (The misspelled name is kept so existing keyword callers still work.)

    Note:
        The default arrays are bound when this ``def`` is executed, not when
        the function is called. If the splits are re-created, re-run this cell
        (or pass the arrays explicitly) to evaluate on the new data.
    """
    def positive_scores(features):
        # Thresholding hard labels from ``predict`` makes ``treshold``
        # meaningless (every value in [0, 1) yields the same predictions).
        # Use the positive-class probability when the model provides one.
        # Assumes a binary target whose positive class is the second entry of
        # ``model.classes_`` (e.g. labels 0/1) -- TODO confirm for this dataset.
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(features)[:, 1]
        # Models without probability estimates (e.g. SVC with its default
        # ``probability=False``) fall back to their hard predictions.
        return model.predict(features)

    splits = (
        ('Train', train_x, train_y),
        ('Val', val_x, val_y),
        ('Test', test_x, test_y),
    )
    for split_name, features, labels in splits:
        print(split_name)
        print_metrics(labels, positive_scores(features) > treshold)


In [7]:
# Logistic regression with default hyperparameters.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(train_x, train_y)
metrics(model)

Train
Precision:  26.865671641791046 %
Recall:  0.7200720072007201 %
F1 Score:  1.402551865199182 %
Accuracy:  98.24497530344087 %
Val
Precision:  33.33333333333333 %
Recall:  0.7701786814540973 %
F1 Score:  1.5055706112616682 %
Accuracy:  98.29872000915394 %
Test
Precision:  27.450980392156865 %
Recall:  0.850546780072904 %
F1 Score:  1.6499705362404242 %
Accuracy:  98.26388166517569 %


In [8]:
# Unconstrained decision tree.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same tree and therefore the same metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(train_x, train_y)
metrics(model)

Train
Precision:  100.0 %
Recall:  100.0 %
F1 Score:  100.0 %
Accuracy:  100.0 %
Val
Precision:  68.10551558752998 %
Recall:  69.99383857054838 %
F1 Score:  69.03676694013977 %
Accuracy:  98.94001570732367 %
Test
Precision:  69.33293087835798 %
Recall:  69.77521263669502 %
F1 Score:  69.55336866010599 %
Accuracy:  98.95406411883413 %


In [9]:
# Decision tree limited to depth 9.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same tree and therefore the same metrics.
model = DecisionTreeClassifier(max_depth=9, random_state=42)
model.fit(train_x, train_y)
metrics(model)

Train
Precision:  81.57298521954903 %
Recall:  75.61756175617562 %
F1 Score:  78.48245796138676 %
Accuracy:  99.2812079035924 %
Val
Precision:  78.42242503259452 %
Recall:  74.12199630314234 %
F1 Score:  76.211593284764 %
Accuracy:  99.21879469695789 %
Test
Precision:  79.23076923076923 %
Recall:  75.09113001215067 %
F1 Score:  77.10542732376794 %
Accuracy:  99.23648240996941 %


In [10]:
# Random forest of 100 trees with a fixed seed.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(train_x, train_y)
metrics(model)

Train
Precision:  100.0 %
Recall:  99.97999799979999 %
F1 Score:  99.98999799959992 %
Accuracy:  99.99965325996314 %
Val
Precision:  81.13333333333334 %
Recall:  74.98459642637091 %
F1 Score:  77.93788024335575 %
Accuracy:  99.28328834381355 %
Test
Precision:  82.17821782178217 %
Recall:  75.63791008505468 %
F1 Score:  78.77254033533691 %
Accuracy:  99.3020159360892 %


In [11]:
# Support vector classifier with default hyperparameters.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = SVC().fit(train_x, train_y)
metrics(model)

Train
Precision:  73.95993836671802 %
Recall:  4.8004800480048 %
F1 Score:  9.015777610818933 %
Accuracy:  98.3203912614576 %
Val
Precision:  71.875 %
Recall:  4.959950708564387 %
F1 Score:  9.279538904899136 %
Accuracy:  98.36269354595433 %
Test
Precision:  73.38403041825094 %
Recall:  5.8626974483596594 %
F1 Score:  10.857946554149086 %
Accuracy:  98.35177980735223 %


In [13]:
# Save the SVM model to disk.
# NOTE: this relies on ``model`` still being the SVC fitted in the cell above;
# run it right after that cell.
svm_model_path = Path('models') / 'svm_model.pkl'
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
svm_model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, svm_model_path)

['models/svm_model.pkl']

In [14]:
# Multinomial naive Bayes with default hyperparameters, trained on the
# MinMax-scaled features.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = MultinomialNB().fit(train_x, train_y)
metrics(model)

Train
Precision:  9.70873786407767 %
Recall:  0.2000200020002 %
F1 Score:  0.39196472317491426 %
Accuracy:  98.23769376266685 %
Val
Precision:  10.44776119402985 %
Recall:  0.21565003080714723 %
F1 Score:  0.42257772411711436 %
Accuracy:  98.28415692760588 %
Test
Precision:  8.333333333333332 %
Recall:  0.18226002430133656 %
F1 Score:  0.356718192627824 %
Accuracy:  98.25660016227349 %


In [15]:
# k-nearest-neighbours classifier with default hyperparameters.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = KNeighborsClassifier().fit(train_x, train_y)
metrics(model)

Train
Precision:  77.44397531666125 %
Recall:  47.6947694769477 %
F1 Score:  59.03323636813765 %
Accuracy:  98.8524638480169 %
Val
Precision:  58.7093389296957 %
Recall:  34.473197781885396 %
F1 Score:  43.43944099378882 %
Accuracy:  98.48439929889165 %
Test
Precision:  60.536980749746704 %
Recall:  36.30012150668287 %
F1 Score:  45.38549183440942 %
Accuracy:  98.50417126094825 %


In [16]:
# AdaBoost with default hyperparameters.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same ensemble and therefore the same metrics.
model = AdaBoostClassifier(random_state=42)
model.fit(train_x, train_y)
metrics(model)



Train
Precision:  69.03988566974147 %
Recall:  53.145314531453145 %
F1 Score:  60.05877034358047 %
Accuracy:  98.77462070974218 %
Val
Precision:  67.425431711146 %
Recall:  52.92667898952557 %
F1 Score:  59.302726958923024 %
Accuracy:  98.77358048963161 %
Test
Precision:  68.2824427480916 %
Recall:  54.34386391251519 %
F1 Score:  60.52097428958051 %
Accuracy:  98.7860694447334 %


In [17]:
# Bagging ensemble with default hyperparameters.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same ensemble and therefore the same metrics.
model = BaggingClassifier(random_state=42)
model.fit(train_x, train_y)
metrics(model)

Train
Precision:  99.3744787322769 %
Recall:  95.32953295329533 %
F1 Score:  97.3099892807922 %
Accuracy:  99.9086340002878 %
Val
Precision:  81.2804453723034 %
Recall:  71.96549599507087 %
F1 Score:  76.33986928104575 %
Accuracy:  99.24688063994341 %
Test
Precision:  82.65554396941258 %
Recall:  72.23572296476306 %
F1 Score:  77.09515318528123 %
Accuracy:  99.26508831422805 %


In [18]:
# Extremely randomized trees with default hyperparameters.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same ensemble and therefore the same metrics.
model = ExtraTreesClassifier(random_state=42)
model.fit(train_x, train_y)
metrics(model)

Train
Precision:  100.0 %
Recall:  100.0 %
F1 Score:  100.0 %
Accuracy:  100.0 %
Val
Precision:  78.42948717948718 %
Recall:  75.38508934072705 %
F1 Score:  76.87715991203268 %
Accuracy:  99.23439799861652 %
Test
Precision:  78.96913776646515 %
Recall:  75.39489671931956 %
F1 Score:  77.14063714063714 %
Accuracy:  99.23492208791895 %


In [19]:
# Gradient boosting with default hyperparameters.
# random_state is fixed (same seed as the data splits) so that re-running the
# notebook reproduces the same ensemble and therefore the same metrics.
model = GradientBoostingClassifier(random_state=42)
model.fit(train_x, train_y)
metrics(model)

Train
Precision:  79.8944647856989 %
Recall:  74.1974197419742 %
F1 Score:  76.94062743064558 %
Accuracy:  99.2290235280452 %
Val
Precision:  77.80327868852459 %
Recall:  73.10536044362293 %
F1 Score:  75.38119440914866 %
Accuracy:  99.19382941430406 %
Test
Precision:  78.8831835686778 %
Recall:  74.66585662211422 %
F1 Score:  76.71660424469412 %
Accuracy:  99.22399983356564 %


In [20]:
# XGBoost classifier with default hyperparameters.
# ``fit`` returns the estimator itself, so construction and fitting are chained.
model = XGBClassifier().fit(train_x, train_y)
metrics(model)

Train
Precision:  86.51747940325095 %
Recall:  77.71777177717772 %
F1 Score:  81.88188188188188 %
Accuracy:  99.40378050662187 %
Val
Precision:  80.99117447386287 %
Recall:  73.50585335797905 %
F1 Score:  77.0671834625323 %
Accuracy:  99.26144372149147 %
Test
Precision:  81.90571049136787 %
Recall:  74.93924665856622 %
F1 Score:  78.26776649746193 %
Accuracy:  99.28745293028481 %
