In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# LoadEnem processes the dataset in the data folder
from load_enem import LoadEnem
# FullPipeline creates a pipeline for the numerical features, then processes them
# in a column transformer together with the categorical features.
from pipeline import FullPipeline
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import SGDClassifier

In [3]:
# Pandas display limits: show up to 999 rows / 999 columns before truncating
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

# LOADING DATA

In [4]:
# Load the train and test frames produced by LoadEnem.
train_prep, test_prep = LoadEnem().load()

# Target: IN_TREINEIRO, with missing labels replaced by 0.
# fillna() returns a new Series, so train_prep itself is left untouched.
train_label = train_prep['IN_TREINEIRO'].fillna(0)

# Keep only the columns that are also present in the test frame, so both
# frames expose the same features. (The earlier `drop('IN_TREINEIRO')`
# assignment was removed: its result was overwritten immediately by this line.)
# NOTE(review): this assumes test_prep does not contain IN_TREINEIRO — confirm.
train = train_prep[test_prep.columns].copy()

## 1) CHOOSING WHICH COLUMNS WILL BE PROCESSED

In [5]:
# Split the training frame by dtype: object columns are treated as
# categorical features, int64/float64 columns as numerical features.
feat_cat = train.select_dtypes(include=['object'])
feat_num = train.select_dtypes(include=['int64', 'float64'])

## 1.1) CATEGORICAL FEATURES

In [6]:
# Fraction of missing values per categorical column, relative to the number
# of rows in the training frame.
missing_ratio = feat_cat.isnull().sum() / train.shape[0]

# Columns with more than 50% missing values are not processed
# (the author notes these are Q027 and Q028).
drop_columns = missing_ratio[missing_ratio > 0.5].index.tolist()
featCat_clean = feat_cat.drop(columns=drop_columns).copy()

# Keep only the columns whose name starts with 'Q0'; TP_SEXO and
# SG_UF_RESIDENCIA are deliberately left out for bias reasons.
cat_columns = [col for col in featCat_clean.columns if col.startswith('Q0')]
featCat_clean = featCat_clean[cat_columns]

## 1.2) NUMERICAL FEATURES 

In [32]:
# Display the numerical feature frame for inspection (bare expression -> cell output).
feat_num

Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,TP_DEPENDENCIA_ADM_ESC,IN_BAIXA_VISAO,IN_CEGUEIRA,IN_SURDEZ,IN_DISLEXIA,IN_DISCALCULIA,IN_SABATISTA,IN_GESTANTE,IN_IDOSO,TP_PRESENCA_CN,TP_PRESENCA_CH,TP_PRESENCA_LC,TP_PRESENCA_MT,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,TP_LINGUA,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO
0,43,24,1,1,1,4,1,,,0,0,0,0,0,0,0,0,1,1,1,1,436.3,495.4,581.2,1,1.0,120.0,120.0,120.0,80.0,80.0,520.0
1,23,17,3,1,2,0,2,1.0,2.0,0,0,0,0,0,0,0,0,1,1,1,1,474.5,544.1,599.0,1,1.0,140.0,120.0,120.0,120.0,80.0,580.0
2,23,21,3,1,3,0,1,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,1,,,,,,,
3,33,25,0,1,1,9,1,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,0,,,,,,,
4,13,28,2,1,1,4,1,,,0,0,0,0,0,0,0,0,0,0,0,0,,,,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13725,35,20,1,1,1,3,1,,,0,0,0,0,0,0,0,0,1,1,1,1,438.2,419.7,394.5,0,1.0,120.0,120.0,80.0,80.0,40.0,440.0
13726,26,33,3,2,1,10,1,,,0,0,0,0,0,0,0,0,1,1,1,1,423.8,557.9,533.0,1,1.0,120.0,120.0,120.0,120.0,80.0,560.0
13727,43,15,1,1,3,0,1,,,0,0,0,0,0,0,0,0,1,1,1,1,460.5,528.9,569.3,0,1.0,120.0,120.0,120.0,120.0,120.0,600.0
13728,33,36,3,1,4,0,1,,,0,0,0,0,0,0,0,0,1,1,1,1,422.5,621.7,569.0,1,1.0,100.0,100.0,80.0,80.0,100.0,460.0


In [34]:
# These are the numerical columns chosen to train the model.
columns = [
    'NU_IDADE',
    'TP_ST_CONCLUSAO',
    'TP_ANO_CONCLUIU',
    'TP_ESCOLA',
    'TP_ENSINO',
    'TP_STATUS_REDACAO',
]
# Restrict the numerical features to that selection.
featNum_clean = feat_num[columns]

### 1.3) JOINING FEATURES

In [35]:
# Put the selected numerical and categorical features side by side,
# aligned on the row index (left join, which is DataFrame.join's default).
train_clean = featNum_clean.join(featCat_clean, how='left')

## 2) APPLYING PIPELINE

In [39]:
# Run the preprocessing pipeline on the training data.
# FullPipeline receives the joined training frame plus its numerical and
# categorical parts; full_pipeline() returns the object that is later passed
# to the classifier's fit().
pipe = FullPipeline(train_clean, featNum_clean, featCat_clean)
train_final = pipe.full_pipeline()
# The label's missing values are already replaced by 0 in the loading cell,
# so the duplicated fillna that used to live here was removed.

In [38]:
# Applying the pipeline to the test dataset.
# Note that `pipe` is rebound here: after this cell it refers to the test
# pipeline, not the one used for the training data.
# NOTE(review): a second FullPipeline is constructed from test_prep while the
# numerical/categorical arguments are still the *training* subsets
# (featNum_clean, featCat_clean). If FullPipeline fits its transformers on the
# frame it receives, the test set may be transformed with different fitted
# statistics than the training set — confirm against pipeline.py.
pipe = FullPipeline(test_prep,featNum_clean,featCat_clean)
final_test = pipe.full_pipeline()

## 3) TRAINING 

### 3.1 BINARY CLASSIFIER - STOCHASTIC GRADIENT DESCENT (SGD)

In [45]:
# Binary classifier trained with stochastic gradient descent;
# random_state is fixed so repeated runs use the same seed.
sgd_clf = SGDClassifier(random_state=42)
# fit() is left as the last expression so the fitted estimator is echoed as the cell output.
sgd_clf.fit(train_final,train_label)

SGDClassifier(random_state=42)

In [62]:
# Build the answer frame: one row per test registration id (NU_INSCRICAO),
# keeping test_prep's index. Double-bracket selection returns a one-column
# DataFrame directly; the previous `pd.DataFrame(..., columns={...})` passed a
# set as `columns`, which recent pandas versions reject with
# "ValueError: columns cannot be a set".
sgd_answer = test_prep[['NU_INSCRICAO']].copy()
# Predicted IN_TREINEIRO label for each row of the transformed test data.
sgd_answer['IN_TREINEIRO'] = sgd_clf.predict(final_test)

In [63]:
sgd_answer

Unnamed: 0,NU_INSCRICAO,IN_TREINEIRO
0,ba0cc30ba34e7a46764c09dfc38ed83d15828897,0
1,177f281c68fa032aedbd842a745da68490926cd2,0
2,6cf0d8b97597d7625cdedc7bdb6c0f052286c334,1
3,5c356d810fa57671402502cd0933e5601a2ebf1e,0
4,df47c07bd881c2db3f38c6048bf77c132ad0ceb3,0
...,...,...
4565,361b7fcd8867119550fe2af5aa729ffad89a7cf5,0
4566,d8a0e4c9e29494cc9bba2422bd79333931475ee1,0
4567,3f1c3388244df8d6521e983a809292d9f3bca643,0
4568,1778e9c4cef591beb6b986d191d15ed05de816b0,0


## SAVING THE ANSWER

In [1]:
# Write the predictions to disk. The frame is named `sgd_answer` (lower case);
# the previous `SGD_answer` spelling raised a NameError.
# NOTE(review): the row index is written as the first CSV column; pass
# index=False if only NU_INSCRICAO and IN_TREINEIRO are wanted — confirm.
sgd_answer.to_csv('sgd_answer.csv')