In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV

In [2]:
# Leitura do dataset
df = pd.read_csv('C:\\Users\\Luiz Guilherme\\codenation\\enem-2\\train.CSV')

In [3]:
# Eliminando colunas não presentes no test do treino
df_test = pd.read_csv('C:\\Users\\Luiz Guilherme\\codenation\\enem-2\\test.CSV')
colunas = list(df_test.columns)
colunas.append('NU_NOTA_MT')
df=df[colunas]

In [4]:
# Dropando colunas object desnecessárias
df = df.drop(['CO_PROVA_CN','CO_PROVA_CH','CO_PROVA_LC','CO_PROVA_MT','NU_INSCRICAO','SG_UF_RESIDENCIA'],axis=1)

# Dropando colunas com mais de 4000 valores faltantes
df = df.drop(df.columns[df.isna().sum()>4000],axis=1)

# Inputando 0 nos valores faltantes de notas
df=df.fillna(0)

In [5]:
# Encoding
df['TP_SEXO'] = df['TP_SEXO'].replace({'M':0, 'F':1})
colunas = ['Q001','Q002','Q006','Q024','Q025','Q026','Q047']
encoder = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,
           'J':10,'K':11,'L':12,'M':13,'N':14,'O':15,'P':16,'Q':17}
df[colunas] = df[colunas].replace(encoder)

In [6]:
df['LOG_NOTA_MT'] = np.log(df['NU_NOTA_MT']+0.1)
df['NU_NOTA_CN'] = np.log(df['NU_NOTA_CN']+0.1)
df['NU_NOTA_CH'] = np.log(df['NU_NOTA_CH']+0.1)
df['NU_NOTA_LC'] = np.log(df['NU_NOTA_LC']+0.1)
df['NU_NOTA_REDACAO'] = np.log(df['NU_NOTA_REDACAO']+0.1)

In [7]:
X = df.drop(['NU_NOTA_MT','LOG_NOTA_MT'],axis=1)
y = df['LOG_NOTA_MT']
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
X.columns = df.drop(['NU_NOTA_MT','LOG_NOTA_MT'],axis=1).columns

In [8]:
pca = PCA(n_components = 0.95)
n = pca.fit(X).n_components_

In [9]:
pca = PCA(n_components=n)
pca.fit(X).explained_variance_ratio_

array([0.30520739, 0.07947584, 0.07082119, 0.03631478, 0.0342917 ,
       0.03242346, 0.02834709, 0.02775822, 0.02743816, 0.02711615,
       0.02708373, 0.02700832, 0.02684652, 0.02649308, 0.0260626 ,
       0.02470781, 0.02370245, 0.02282034, 0.02055218, 0.01796548,
       0.01422516, 0.01319847, 0.01234556])

In [10]:
rfe = RFE(RidgeCV(),n_features_to_select=n, step=1)
z = list(zip(X.columns,rfe.fit(X,y).support_))
features = []
for i in range(0,len(z)):
    if z[i][1] == True:
        features.append(z[i][0])
np.array(features)

array(['NU_IDADE', 'TP_SEXO', 'IN_DISLEXIA', 'IN_DISCALCULIA',
       'IN_SABATISTA', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH',
       'TP_PRESENCA_LC', 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC',
       'TP_LINGUA', 'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q002',
       'Q006', 'Q024', 'Q026', 'Q047'], dtype='<U17')

In [11]:
X = X[features]

In [12]:
alphas = [6]
ridge_cv = RidgeCV(alphas=alphas,cv=5).fit(X,y)
y_pred = ridge_cv.predict(X)
y_exp = np.exp(y_pred)-0.1
r2_score(df['NU_NOTA_MT'].values, y_exp)

0.9042329560392024

In [13]:
ridge_cv.alpha_

6

In [14]:
ridge_cv.best_score_

0.9973204986116941

In [15]:
df_test = pd.read_csv('C:\\Users\\Luiz Guilherme\\codenation\\enem-2\\test.CSV')
inscricao = df_test['NU_INSCRICAO']
df_test = df_test[X.columns]
df_test=df_test.fillna(0)

In [16]:
df_test['TP_SEXO'] = df_test['TP_SEXO'].replace({'M':0, 'F':1})
colunas = ['Q002','Q006','Q024','Q026','Q047']
encoder = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,
           'J':10,'K':11,'L':12,'M':13,'N':14,'O':15,'P':16,'Q':17}
df_test[colunas] = df_test[colunas].replace(encoder)
df_test['NU_NOTA_CN'] = np.log(df_test['NU_NOTA_CN']+0.1)
df_test['NU_NOTA_CH'] = np.log(df_test['NU_NOTA_CH']+0.1)
df_test['NU_NOTA_LC'] = np.log(df_test['NU_NOTA_LC']+0.1)
df_test['NU_NOTA_REDACAO'] = np.log(df_test['NU_NOTA_REDACAO']+0.1)
df_test = scaler.fit_transform(df_test)

In [17]:
y_pred = ridge_cv.predict(df_test)
y_exp = np.exp(y_pred)-0.1

In [18]:
answer = pd.DataFrame({'NU_INSCRICAO':inscricao,'NU_NOTA_MT':y_exp.round(1)})

In [19]:
answer.to_csv('answer.csv', index=False)

In [20]:
pd.read_csv('C:\\Users\\Luiz Guilherme\\codenation\\enem-2\\answer.csv')

Unnamed: 0,NU_INSCRICAO,NU_NOTA_MT
0,73ff9fcc02f0a99919906c942c2e1a1042cdcf98,390.4
1,71a95f9f1b91a82c65ad94abbdf9f54e6066f968,483.4
2,b38a03232f43b11c9d0788abaf060f7366053b6d,566.8
3,70b682d9a3636be23f6120fa9d6b164eb3c6002d,-0.0
4,715494628a50142ce8cb17191cfe6d0f3cae0934,555.1
...,...,...
4571,dac0f22429c7f8e3931d0abaf5dfc8e5c772a48b,451.4
4572,a75fa8770257e7c9368d059fe53d9ef431f4bdef,444.7
4573,655fa6306720ff16e825903b5422a46608a77545,596.0
4574,1f4bc3e3d56212d500625bfe8ac78ccff4362293,508.8
