In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [56]:
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    ax = ax or plt.gca()
    
    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    
    # Predict
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap, #clim=(y.min(), y.max()),
                           zorder=1)

    ax.set(xlim=xlim, ylim=ylim)

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,...,Q041,Q042,Q043,Q044,Q045,Q046,Q047,Q048,Q049,Q050
0,1,ed50e8aaa58e7a806c337585efee9ca41f1eb1ad,2016,4314902,Porto Alegre,43,RS,24,M,0.0,...,5.0,A,A,A,A,A,A,A,B,D
1,2,2c3acac4b33ec2b195d77e7c04a2d75727fad723,2016,2304707,Granja,23,CE,17,F,0.0,...,,A,A,C,A,B,A,A,C,A
2,3,f4545f8ccb9ff5c8aad7d32951b3f251a26e6568,2016,2304400,Fortaleza,23,CE,21,F,0.0,...,,A,A,A,A,C,A,A,B,A
3,4,3d6ec248fef899c414e77f82d5c6d2bffbeaf7fe,2016,3304557,Rio de Janeiro,33,RJ,25,F,0.0,...,5.0,C,A,A,A,A,D,A,A,A
4,5,bf896ac8d3ecadd6dba1dfbf50110afcbf5d3268,2016,1302603,Manaus,13,AM,28,M,0.0,...,,A,A,A,A,A,A,A,A,A


In [5]:
df_test.head()

Unnamed: 0,NU_INSCRICAO,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,...,NU_NOTA_COMP5,NU_NOTA_REDACAO,Q001,Q002,Q006,Q024,Q025,Q026,Q027,Q047
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,43,RS,19,F,1,1,1,3,1,...,,,E,E,H,B,B,C,D,A
1,177f281c68fa032aedbd842a745da68490926cd2,15,PA,24,M,3,2,1,4,1,...,,,B,C,B,A,A,C,D,A
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,29,BA,16,F,2,1,3,0,1,...,,,E,F,G,B,B,A,,D
3,5c356d810fa57671402502cd0933e5601a2ebf1e,41,PR,17,F,1,1,2,0,2,...,40.0,480.0,E,E,E,C,B,B,C,A
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,53,DF,19,F,1,1,1,1,1,...,80.0,720.0,E,E,E,B,B,B,D,A


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13730 entries, 0 to 13729
Columns: 167 entries, Unnamed: 0 to Q050
dtypes: float64(28), int64(79), object(60)
memory usage: 17.5+ MB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4570 entries, 0 to 4569
Data columns (total 43 columns):
NU_INSCRICAO              4570 non-null object
CO_UF_RESIDENCIA          4570 non-null int64
SG_UF_RESIDENCIA          4570 non-null object
NU_IDADE                  4570 non-null int64
TP_SEXO                   4570 non-null object
TP_COR_RACA               4570 non-null int64
TP_NACIONALIDADE          4570 non-null int64
TP_ST_CONCLUSAO           4570 non-null int64
TP_ANO_CONCLUIU           4570 non-null int64
TP_ESCOLA                 4570 non-null int64
TP_ENSINO                 1426 non-null float64
TP_DEPENDENCIA_ADM_ESC    1426 non-null float64
IN_BAIXA_VISAO            4570 non-null int64
IN_CEGUEIRA               4570 non-null int64
IN_SURDEZ                 4570 non-null int64
IN_DISLEXIA               4570 non-null int64
IN_DISCALCULIA            4570 non-null int64
IN_SABATISTA              4570 non-null int64
IN_GESTANTE               4570 non-null int64
IN_IDOSO    

In [8]:

df_enem = df_train.copy()
df_enem = df_enem[['NU_INSCRICAO','CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
       'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'TP_DEPENDENCIA_ADM_ESC',
       'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ', 'IN_DISLEXIA',
       'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE', 'IN_IDOSO',
       'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC', 'TP_PRESENCA_MT',
       'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
       'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047','IN_TREINEIRO']]

In [69]:
df_enem.corr()['IN_TREINEIRO']

#Features com correlações relevantes: TP_ST_CONCLUSAO, NU_IDADE

CO_UF_RESIDENCIA         -0.034944
NU_IDADE                 -0.295091
TP_COR_RACA               0.009676
TP_NACIONALIDADE         -0.015179
TP_ST_CONCLUSAO           0.533983
TP_ANO_CONCLUIU          -0.257710
TP_ESCOLA                -0.244562
TP_ENSINO                      NaN
TP_DEPENDENCIA_ADM_ESC         NaN
IN_BAIXA_VISAO           -0.013602
IN_CEGUEIRA                    NaN
IN_SURDEZ                -0.007374
IN_DISLEXIA              -0.003297
IN_DISCALCULIA           -0.003297
IN_SABATISTA              0.003824
IN_GESTANTE              -0.011893
IN_IDOSO                 -0.004663
TP_PRESENCA_CN            0.094692
TP_PRESENCA_CH            0.094692
TP_PRESENCA_LC            0.092454
TP_PRESENCA_MT            0.092454
NU_NOTA_CN               -0.037874
NU_NOTA_CH               -0.053460
NU_NOTA_LC               -0.028261
TP_LINGUA                -0.036395
TP_STATUS_REDACAO         0.006688
NU_NOTA_COMP1            -0.008709
NU_NOTA_COMP2            -0.023308
NU_NOTA_COMP3       

In [10]:
df_enem[['TP_ST_CONCLUSAO', 'NU_IDADE']].isna().sum()

TP_ST_CONCLUSAO    0
NU_IDADE           0
dtype: int64

In [70]:
# Padronizando as idades
df_enem['NU_IDADE_standardized'] = (df_enem['NU_IDADE'] - df_enem['NU_IDADE'].mean())/df_enem['NU_IDADE'].std() 
df_test['NU_IDADE_standardized'] = (df_test['NU_IDADE'] - df_test['NU_IDADE'].mean())/df_test['NU_IDADE'].std() 

In [12]:
X = df_enem[['TP_ST_CONCLUSAO', 'NU_IDADE_standardized']]
y = df_enem['IN_TREINEIRO']

In [71]:
# Separando o dataframe de treino para fit e para o predict 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Regressão Logística

In [72]:
# Treinando modelo de regressão logística
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
y_predict = logistic_regression.predict(X_test)l

In [37]:
# verificando métricas
print("Accuracy:",metrics.accuracy_score(y_test, y_predict))
print("Precision:",metrics.precision_score(y_test, y_predict))
print("Recall:",metrics.recall_score(y_test, y_predict))

Accuracy: 0.9895135450043694
Precision: 0.9433551198257081
Recall: 0.9774266365688488


### KNeighborsClassifier

In [54]:
knn = KNeighborsClassifier(n_neighbors=7)jppm

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [62]:
y_predict_knn = knn.predict(X_test)

In [63]:
# verificando métricas
print("Accuracy:",metrics.accuracy_score(y_test, y_predict_knn))
print("Precision:",metrics.precision_score(y_test, y_predict_knn))
print("Recall:",metrics.recall_score(y_test, y_predict_knn))

Accuracy: 0.997087095834547
Precision: 1.0
Recall: 0.9774266365688488


#### Observamos o melhor desempenho do modelo KNeighborsClassifier

### Aplicando o modelo nos csv de teste

In [73]:
X_test_csv = df_test[['TP_ST_CONCLUSAO' ,'NU_IDADE_standardized','NU_INSCRICAO']]
X_test_csv.head()

Unnamed: 0,TP_ST_CONCLUSAO,NU_IDADE_standardized,NU_INSCRICAO
0,1,-0.374815,ba0cc30ba34e7a46764c09dfc38ed83d15828897
1,1,0.361475,177f281c68fa032aedbd842a745da68490926cd2
2,3,-0.81659,6cf0d8b97597d7625cdedc7bdb6c0f052286c334
3,2,-0.669332,5c356d810fa57671402502cd0933e5601a2ebf1e
4,1,-0.374815,df47c07bd881c2db3f38c6048bf77c132ad0ceb3


In [77]:
# Fazendo treinamento para teste csv
y_predict_test_csv = knn.predict(X_test_csv[['TP_ST_CONCLUSAO', 'NU_IDADE_standardized']])
list(y_predict_test_csv)

[0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,


In [78]:
# Criando csv de resposta
answer = pd.DataFrame({'NU_INSCRICAO':X_test_csv['NU_INSCRICAO'],
             'IN_TREINEIRO':y_predict_test_csv})
answer

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
5,3f28749fb79fb059caf5aed79625a5addfd7a91a,0
6,bb2a0edddf3c59181a1496390aaaee7f32624d9d,1
7,cc7cab347fe5455aae983f3701ca40f84dc01949,0
8,95e9338f1da02f7bfa0e3194130afdccc0fb5457,1
9,155f84f2ee5b34e658f2adcc70f2ec83e37040cb,0


In [79]:
answer.to_csv('answer.csv', index=False)