### IMPORT

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from load_enem import LoadEnem
from pipeline import FullPipeline
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Let pandas render up to 999 rows and 999 columns when a frame is displayed,
# so wide/long frames are not truncated while exploring.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

# LOADING DATA

In [3]:
# Load the raw train and test frames through the project loader.
train_prep, test_prep = LoadEnem().load()

# Target column: NU_NOTA_MT. Missing targets are replaced with 0.
# fillna returns a new Series, so train_prep itself is left untouched.
train_label = train_prep['NU_NOTA_MT'].fillna(0)

# Feature frame: restrict the training data to the columns that also exist in
# the test set, and make sure the target is never part of the features.
# (Previously the target was dropped in one statement and the result was then
# overwritten by the column selection, so the drop never took effect.)
train = train_prep[test_prep.columns].drop(columns='NU_NOTA_MT', errors='ignore')

## 1) CHOOSING WHICH COLUMNS WILL BE PROCESSED

In [4]:
# Split the training features by dtype:
#   feat_cat -> columns stored as 'object'
#   feat_num -> columns stored as 'int64' or 'float64'
feat_cat = train.select_dtypes(include=['object'])
feat_num = train.select_dtypes(include=['int64', 'float64'])

### 1.1) CATEGORICAL FEATURES

In [5]:
# Categorical features: keep only columns whose name starts with 'Q0', after
# discarding every column with more than 50% missing values.

# Fraction of missing values per categorical column (relative to the number of
# training rows).
cat_missing_share = feat_cat.isnull().sum() / train.shape[0]
drop_columns = cat_missing_share.index[cat_missing_share > 0.5].tolist()

# Author's note: with this dataset the 50% rule removes Q027 and Q028.
featCat_clean = feat_cat.drop(columns=drop_columns)

# Of the remaining columns, keep the ones prefixed with 'Q0'.
cat_columns = featCat_clean.columns[featCat_clean.columns.str.startswith('Q0')].tolist()
featCat_clean = featCat_clean[cat_columns]

### 1.2) NUMERICAL FEATURES

In [6]:
# Numerical features: keep only the columns whose names start with
# 'TP_PRESENCA', 'NU_NOTA' or 'Q0', in that order.
num_names = feat_num.columns
presenca = num_names[num_names.str.startswith('TP_PRESENCA')].tolist()
nota = num_names[num_names.str.startswith('NU_NOTA')].tolist()
questao = num_names[num_names.str.startswith('Q0')].tolist()

# Concatenate the three groups and narrow feat_num down to them.
columns = [*presenca, *nota, *questao]
feat_num = feat_num[columns]

In [7]:
# Fraction of missing values per numerical column (relative to the number of
# training rows); columns above 50% are discarded.
num_missing_share = feat_num.isnull().sum() / train.shape[0]
drop_columns = num_missing_share.index[num_missing_share > 0.5].tolist()

# Numerical features that survive the missing-value filter.
featNum_clean = feat_num.drop(columns=drop_columns)

### 1.3) JOINING FEATURES

In [8]:
# Put the cleaned numerical and categorical features side by side, aligned on
# the row index (left join, numerical columns first).
train_clean = featNum_clean.join(other=featCat_clean, how='left')

## 2) APPLYING PIPELINE

In [9]:
# Run the project pipeline on the training features.
# NOTE(review): FullPipeline is defined in pipeline.py and is opaque from here;
# it receives the joined frame plus the numerical and categorical sub-frames.
pipe = FullPipeline(train_clean, featNum_clean, featCat_clean)
train_final = pipe.full_pipeline()

# train_label already had its missing values replaced with 0 in the loading
# cell, so the second in-place fillna that used to live here was a no-op and
# has been removed.

In [10]:
# Apply the same pipeline to the test dataset.
# `pipe` is rebound here, so after this cell it refers to the test pipeline.
# NOTE(review): featNum_clean / featCat_clean are the frames derived from the
# TRAINING data; presumably FullPipeline only uses them to decide which columns
# to process — confirm in pipeline.py that no training rows leak into the
# test transformation.
pipe = FullPipeline(test_prep,featNum_clean,featCat_clean)
final_test = pipe.full_pipeline()

## 3) TRAINING 

In [11]:
# Fit a decision tree on the processed training data and predict the test set.
# random_state is fixed so that Restart & Run All reproduces the same tree
# (scikit-learn shuffles candidate features at each split).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(train_final, train_label)

# Column labels must be an ordered list: a set has no defined order and is
# rejected by the DataFrame constructor in pandas 2.x.
predictions_tree = pd.DataFrame(tree_reg.predict(final_test), columns=['NU_NOTA_MT'])

Unnamed: 0,NU_NOTA_MT
0,414.4
1,475.4
2,709.3
3,0.0
4,649.9
...,...
4571,494.9
4572,412.2
4573,606.6
4574,575.4


In [12]:
# Fit a random forest on the processed training data and predict the test set.
# random_state is fixed because the forest is stochastic (bootstrap samples and
# feature sub-sampling); without a seed every run yields different predictions.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(train_final, train_label)

# Column labels must be an ordered list: a set has no defined order and is
# rejected by the DataFrame constructor in pandas 2.x.
predictions_forest = pd.DataFrame(forest_reg.predict(final_test), columns=['NU_NOTA_MT'])

## 4) SAVING THE ANSWER

In [14]:
# Pair each test NU_INSCRICAO with its decision-tree prediction and write the
# result to disk. The join aligns on the row index.
# NOTE(review): this assumes test_prep uses the same 0..n-1 index as the
# predictions frame — confirm, otherwise rows would be misaligned.
answer_tree = test_prep[['NU_INSCRICAO']].join(predictions_tree)
answer_tree.to_csv('answer_tree.csv')

In [15]:
# Pair each test NU_INSCRICAO with its random-forest prediction and write the
# result to disk. The join aligns on the row index.
# predictions_forest is already a DataFrame with the single NU_NOTA_MT column,
# so it is joined directly; re-wrapping it with a set of column labels was
# redundant and fails on pandas 2.x.
# NOTE(review): this assumes test_prep uses the same 0..n-1 index as the
# predictions frame — confirm, otherwise rows would be misaligned.
answer_forest = test_prep[['NU_INSCRICAO']].join(predictions_forest)
answer_forest.to_csv('answer_forest.csv')