In [1]:
import pandas as pd
import warnings
import pickle

warnings.filterwarnings('ignore')

In [2]:
with open('../data/processed/data.pkl', 'rb') as f:
    data = pickle.load(f)

target = data['target']
manual_features = data['manual_features']
standard_features = data['standard_features']
minmax_features = data['minmax_features']

## Variables

- `target` = The result aimed by model
- `manual_features` = Set of manual predict variables without scalling
- `standard_features` = Set of scalled with padronization predict variables
- `minmax_features` = Set of scalled with normalization predict variables

## Algorithms

> Separation between train and test data

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
RANDOM = 5
x_train, x_test, y_train, y_test = train_test_split(minmax_features, target, test_size = 0.25, random_state=RANDOM)

#

### Naive Bayes

#### Complement Naive Bayes

In [5]:
from sklearn.naive_bayes import ComplementNB

cnb = ComplementNB()

In [6]:
cnb.fit(x_train, y_train)

In [7]:
predict_cnb = cnb.predict(x_test)
accuracy_score(y_test, predict_cnb)

0.8601398601398601

In [8]:
model = cnb.predict(x_train)
accuracy_score(y_train, model)

0.8661971830985915

#

#### Gaussian Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

In [10]:
gnb.fit(x_train, y_train)

In [11]:
predict_gnb = gnb.predict(x_test)
accuracy_score(y_test, predict_gnb)

0.9370629370629371

In [12]:
model = gnb.predict(x_train)
accuracy_score(y_train, model)

0.931924882629108

#

### SVM

In [13]:
from sklearn.svm import LinearSVC

lsvc = LinearSVC(penalty='l1',tol=0.0005, C=1.977, random_state=RANDOM)

In [14]:
lsvc.fit(x_train, y_train)

In [15]:
predict_lsvc = lsvc.predict(x_test)
accuracy_score(y_test, predict_lsvc)

0.986013986013986

In [16]:
model = lsvc.predict(x_train)
accuracy_score(y_train, model)

0.9859154929577465

In [17]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [18]:
kfold = KFold(n_splits = 50, shuffle=True, random_state = RANDOM)
model = LinearSVC(penalty='l1',tol=0.0005, C=1.977, random_state=RANDOM)

resultado = cross_val_score(model, minmax_features, target, cv = kfold)

print(f'Acuracia Media: {round((resultado.mean() * 100),2)}%')

Acuracia Media: 96.79%


#

## RandomForest

In [19]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=40, criterion='log_loss', max_depth=3, random_state=RANDOM)

In [20]:
rfc.fit(x_train, y_train)

In [21]:
predict_rfc = rfc.predict(x_test)
accuracy_score(y_test, predict_rfc)

0.972027972027972

In [22]:
model = rfc.predict(x_train)
accuracy_score(y_train, model)

0.9765258215962441

### XGBoost

In [23]:
from xgboost import XGBClassifier

xgb = XGBClassifier(max_depth=3, learning_rate=0.3, n_estimators=500, objective='binary:hinge', random_state=RANDOM, alpha=12)

In [24]:
xgb.fit(x_train, y_train)

In [25]:
predict_xgb = xgb.predict(x_test)
accuracy_score(y_test, predict_xgb)

0.986013986013986

In [26]:
model = xgb.predict(x_train)
accuracy_score(y_train, model)

0.9812206572769953

### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2',
                        tol=0.0006, C=1.95,
                        random_state=RANDOM, solver='liblinear', 
                        max_iter=200)


In [28]:
lr.fit(x_train, y_train)

In [29]:
predict_lr = lr.predict(x_test)
accuracy_score(y_test, predict_lr)

0.9790209790209791

In [30]:
model = lr.predict(x_train)
accuracy_score(y_train, model)

0.9671361502347418

#

- LinearSVC = 98.60% Test | 98.59% Train | `minmax_features` | **LinearSVC(penalty='l1',tol=0.0005, C=1.977, random_state=RANDOM)**
- GaussianNB = 95.10% Test | 93.19% Train | `manual_features` | **GaussianNB()**
- RandomForest = 97.20% Test | 97.65% Train | `manual_features` | **RandomForestClassifier(n_estimators=40, criterion='log_loss', max_depth=3, random_state=RANDOM)**
- XGBoost = 98.60% Test | 98.12% Train | `minmax_features` | **XGBClassifier(max_depth=3, learning_rate=0.3, n_estimators=500, objective='binary:hinge', random_state=RANDOM, alpha=12)**

#

## Chosen Algorithm - SVM -> LinearSVC

#

- LinearSVC = 98.60% Test | 98.59% Train | `minmax_features` | **LinearSVC(penalty='l1',tol=0.0005, C=1.977, random_state=RANDOM)**

#

In [31]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

params_svc = {
    'penalty': ['l1', 'l2'],
    'tol': [1e-3, 1e-4, 1e-5],
    'C': [0.1, 0.0005, 0.25, 0.002, 0.03, 0.01, 1, 10, 100, 500, 1.000, 1.977],
    'max_iter': [100, 200, 500, 1000, 2000, 2500]
}

grid_search = GridSearchCV(lsvc, params_svc, scoring='roc_auc', cv=4)
grid_search.fit(x_train, y_train)

In [32]:
result = pd.DataFrame(grid_search.cv_results_)
result.sort_values(by='mean_test_score', ascending=False, inplace=True)
result.iloc[:, 13:]

Unnamed: 0,mean_test_score,std_test_score,rank_test_score
375,0.994821,0.003431,1
363,0.994821,0.003431,1
365,0.994821,0.003431,1
369,0.994821,0.003431,1
370,0.994821,0.003431,1
...,...,...,...
180,0.500000,0.000000,379
181,0.500000,0.000000,379
182,0.500000,0.000000,379
186,0.500000,0.000000,379


In [33]:
import joblib

joblib.dump(lsvc, '../model/breast-guard-model.pkl')

['../model/breast-guard-model.pkl']