In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [41]:
df = pd.read_csv("dummy_classifier.csv")

In [42]:
#Configurando options do pandas para visualização
pd.set_option('display.max_columns', 500)

In [43]:
#Vamos comecar olhando descritivas da base, vamos notar que se trata de um dado complexo, com pouco contexto
#Esse tipo de dado é complexo para uma pessoa observar e encontrar padrões, por isso modelamos a informação
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,-0.003743,-0.001014,0.001999,-0.004118,0.003417,0.000189,-0.006234,-3.3e-05,-0.001269,-0.006136,0.49916
std,0.99737,0.999418,1.142076,1.232053,1.121812,0.999896,0.996697,1.003122,1.419729,1.000254,0.500002
min,-3.974071,-4.100143,-4.479927,-5.296495,-5.576663,-4.427324,-4.374059,-4.41353,-5.25938,-4.288019,0.0
25%,-0.67934,-0.675882,-0.845953,-1.005787,-0.820846,-0.673341,-0.68476,-0.675454,-0.982713,-0.683319,0.0
50%,9.7e-05,-0.000109,0.049964,-0.107243,0.199105,0.000445,-0.007576,-0.003183,0.053173,-0.008168,0.0
75%,0.667992,0.668436,0.843351,0.767612,0.847817,0.67489,0.66813,0.674857,1.114381,0.671837,1.0
max,4.173364,4.701476,5.048701,6.396706,4.013653,4.424013,5.487006,4.682031,5.178021,4.536913,1.0


In [45]:
#Baseado na descrição acima, percebemos que a coluna 10 é a unica que contém valores binários (0 ou 1)
#Sendo assim ela provavelmente representa nossa target - ou variável resposta
df["10"].value_counts()

0.0    50084
1.0    49916
Name: 10, dtype: int64

In [47]:
#Vamos dividir nossos dados em dois conjuntos: Treino e Test
#Esse processo é importante para validarmos o processo de modelagem: Devemos medir as métricas do modelo 
#treinado no conjunto de treino utilizando o conjunto de teste, pois são dados nunca vistos e simularia uma 
#aplicação na vida real
X_train, X_test, y_train, y_test = train_test_split(df.drop(["10"],axis=1), #Removemos a target do conjunto de Treino
                                                   df["10"],# Target
                                                   test_size = 0.2 #Costumamos usar regras simples, como 80% treino e 20% teste
                                                   )

In [48]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(80000, 10) (80000,) (20000, 10) (20000,)


In [49]:
#Modelo simples utilizando Regressão Logistica
lr = LogisticRegression()
lr.fit(X_train,y_train) #Treina o modelo

#Prediz a classe de cada amostra de acordo com o modelo treinado (lr) utilizando o conjunto de TESTE
test_performance = lr.predict(X_test)



In [50]:
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
#Reporta métricas de Precision, Recall, F1-score e Acurácia
#A coluna support indica a quantidade de amostras em cada métrica
#É interessante observamos sempre as métricas da linha 1.0, pois temos interesse em classificar positivos.
print(classification_report(y_test, test_performance)) 

              precision    recall  f1-score   support

         0.0       0.79      0.73      0.76      9964
         1.0       0.75      0.80      0.78     10036

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000



In [52]:
#Modelo de Árvore - A interface do sklearn permite com que os comandos sejam muito semelhantes entre os diferentes
#tipos de modelo.
tree = DecisionTreeClassifier()
tree.fit(X_train,y_train)

test_performance = tree.predict(X_test)

In [53]:
tree

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [54]:
print(classification_report(y_test, test_performance)) 

              precision    recall  f1-score   support

         0.0       0.69      0.70      0.70      9964
         1.0       0.70      0.69      0.70     10036

    accuracy                           0.70     20000
   macro avg       0.70      0.70      0.70     20000
weighted avg       0.70      0.70      0.70     20000



In [55]:
#Ensemble
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

test_performance = rf.predict(X_test)



In [56]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [57]:
print(classification_report(y_test, test_performance)) 

              precision    recall  f1-score   support

         0.0       0.77      0.82      0.79      9964
         1.0       0.81      0.76      0.78     10036

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

