## Bibliotecas

In [1]:
import pandas
import matplotlib.pyplot as plot
import seaborn as sns
import numpy
import sklearn

## Lendo os Dados

In [2]:
# CSV file MICRODADOS_ENEM_2019_SAMPLE_43278 served from the alura-cursos GitHub repository
ENEM_SAMPLE_URL = (
    'https://raw.githubusercontent.com/alura-cursos/imersao-dados-2-2020/master/'
    'MICRODADOS_ENEM_2019_SAMPLE_43278.csv'
)

dataset = pandas.read_csv(ENEM_SAMPLE_URL)

### Variáveis importantes

In [3]:
# Names of the five score columns used throughout the notebook
tests = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_MT',
    'NU_NOTA_LC',
    'NU_NOTA_REDACAO',
]

# Row-wise total over the five score columns (stored as a new column on `dataset`)
dataset['NU_NOTA_TOTAL'] = dataset.loc[:, tests].sum(axis=1)

# Keep only the rows whose total score is different from zero
students_without_zero_score = dataset.loc[dataset['NU_NOTA_TOTAL'] != 0]

## Modelo de Previsão 

In [4]:
# Independent variables (x): the four score columns other than NU_NOTA_MT
input_tests = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO']

# Dependent variable (y): the score column the model will predict
output_test = 'NU_NOTA_MT'

# Keep only the score columns and drop every row with a NaN in any of them
students_without_zero_score = students_without_zero_score.loc[:, tests].dropna(axis=0, how='any')

input_scores = students_without_zero_score.loc[:, input_tests]
output_score = students_without_zero_score.loc[:, output_test]

input_scores, output_score

(        NU_NOTA_CN  NU_NOTA_CH  NU_NOTA_LC  NU_NOTA_REDACAO
 0            435.6       512.3       488.6            420.0
 2            423.2       499.1       441.0            560.0
 3            426.2       578.1       551.5            500.0
 4            516.5       571.3       511.2            780.0
 5            559.6       618.7       607.5            900.0
 ...            ...         ...         ...              ...
 127373       480.4       501.5       485.3            520.0
 127375       502.4       559.6       525.7            740.0
 127376       449.1       380.9       450.3            520.0
 127378       564.7       416.2       517.3            620.0
 127379       466.7       425.5       477.1            480.0
 
 [92537 rows x 4 columns], 0         432.4
 2         427.2
 3         499.9
 4         424.5
 5         615.6
           ...  
 127373    386.7
 127375    539.7
 127376    418.2
 127378    455.7
 127379    438.5
 Name: NU_NOTA_MT, Length: 92537, dtype: float64)

In [5]:
# Nomenclatura padrão
x = input_scores
y = output_score

In [6]:
from sklearn.model_selection import train_test_split

# Fixed seed so that every run selects the same train/test rows
SEED = 1313

# random_state=SEED is what actually makes the shuffled split repeatable;
# without it the seed above has no effect on which rows land in each subset.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=SEED,
)

print(f"Notas no total: {len(x)} | Notas de treino: {len(x_train)} | Notas de teste: {len(x_test)}")

Notas no total: 92537 | Notas de treino: 69402 | Notas de teste: 23135


In [7]:
from sklearn.svm import LinearSVR

# Linear support vector regressor, seeded with the notebook-wide SEED, fitted on the training split
model = LinearSVR(random_state=SEED).fit(x_train, y_train)

# Predictions of the target column for the held-out test rows
math_predictions = model.predict(x_test)





In [8]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the LinearSVR predictions against the true test targets
mean_squared_error(y_true=y_test, y_pred=math_predictions)

7225.609540617956

## Insights da Aula

### Predição com Decision Tree

In [9]:
from sklearn.tree import DecisionTreeRegressor


# Re-split with the notebook-wide SEED so this cell yields the same rows on every run
# (an unseeded split makes the reported error change between executions).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=SEED,
)

# Depth-limited regression tree; random_state makes the fit deterministic
tree_model = DecisionTreeRegressor(max_depth=3, random_state=SEED)
tree_model.fit(x_train, y_train)
tree_model_math_predictions = tree_model.predict(x_test)

# Mean squared error of the tree predictions on the test split
mean_squared_error(y_test, tree_model_math_predictions)

6028.130241332906

In [10]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# 10-fold splitter with shuffling; random_state=SEED pins the fold assignment so the
# cross-validated score is the same on every run.
number_of_splits = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Depth-limited regression tree, seeded so each fold's fit is deterministic
tree_model = DecisionTreeRegressor(max_depth=5, random_state=SEED)

# The score comes back negated because scikit-learn scorers follow "greater is better"
result = cross_validate(tree_model, x, y, cv=number_of_splits, scoring='neg_mean_squared_error')

# Flip the sign to obtain positive per-fold mean squared errors
absolute_results = result['test_score'] * -1

score_mean = absolute_results.mean()
print(score_mean)

5680.679906954913


### Desvio padrão e Intervalo de Confiança do Modelo

In [11]:
from scipy import stats

def print_confidence_interval(results):
    """Print the 95% normal interval built from cross-validation scores.

    Args:
        results: Mapping with a 'test_score' entry holding an array of
            negated scores (e.g. from scoring='neg_mean_squared_error').

    Returns:
        None. The lower and upper limits are written to standard output.
    """
    # Undo the negation so the scores are positive error values
    fold_errors = -1 * results['test_score']

    center = fold_errors.mean()
    spread = fold_errors.std()

    # Central 95% interval of a normal distribution with the scores' mean and standard deviation
    bounds = stats.norm.interval(0.95, loc=center, scale=spread)
    lower_limit, upper_limit = bounds

    print(f'Intervalo de confiança obtido no modelo: {lower_limit} - {upper_limit}')

In [12]:
print_confidence_interval(result)

Intervalo de confiança obtido no modelo: 5496.263192928652 - 5865.096620981175
