### Hipótese escolhida (1)
Estudantes com Acesso_Internet = Ruim/Instável apresentam maior risco de evasão (Risco_Evasao = 1), considerando também Faltas_Percentual, Trabalho_Horas_Semanais e Deslocamento_Minutos.

Objetivo: treinar um KNN Classifier para `Risco_Evasao`, comparar k × distâncias (euclidiana/manhattan), avaliar desempenho e analisar matriz de confusão por estratos de `Acesso_Internet`. Autor: Luigi Garotti. Curso: 2º semestre.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Dataset location, resolved relative to the current working directory.
CSV_PATH = 'dataset_educacao_graduacao_brasil_500.csv'

# Load the raw table, then print a quick structural overview:
# shape, dtypes/non-null counts, the first rows, and missing values
# per column (columns with the most missing values first).
df = pd.read_csv(CSV_PATH)

print('Formato:', df.shape)

print('\nInfo:')
df.info()

print('\nHead:')
preview = df.head()
display(preview)

print('\nNulos:')
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

# Column to predict.
TARGET = 'Risco_Evasao'

# Candidate predictors, split by how they are preprocessed later
# (numeric columns vs. categorical columns).
base_features_num = [
    'Faltas_Percentual',
    'Trabalho_Horas_Semanais',
    'Deslocamento_Minutos',
    'Horas_Estudo_Semanais',
    'Nota_ENEM',
]
base_features_cat = [
    'Acesso_Internet',
    'Modalidade',
    'Tipo_IES',
]

# Keep only the candidates that actually exist in the loaded table,
# preserving the order in which they were declared above.
available_columns = set(df.columns)
features_num = [name for name in base_features_num if name in available_columns]
features_cat = [name for name in base_features_cat if name in available_columns]

# Data preparation: drop unlabeled rows, fill missing categories, split
# into train/test and define the preprocessing applied inside every model.

# Rows without a label cannot be used for supervised training/evaluation.
df_model = df.dropna(subset=[TARGET]).copy()

# Missing categories get an explicit constant label. The constant does not
# depend on the data, so filling before the split leaks nothing, and it
# keeps 'Desconhecido' as a regular category value in X_test.
for col in features_cat:
    df_model[col] = df_model[col].fillna('Desconhecido')

# NOTE: numeric columns are intentionally NOT imputed here. Computing the
# median over all rows before splitting would let test rows influence the
# values used for training (data leakage). Numeric imputation is done by
# the SimpleImputer inside `preprocess`, which learns its medians only from
# the rows passed to `fit`. As a consequence, `df_model` and `X` may still
# contain NaN in numeric columns; always feed them through `preprocess`.
X = df_model[features_num + features_cat]
y = df_model[TARGET].astype(int)

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Training-split medians, kept under the original name for inspection only;
# the pipeline's SimpleImputer recomputes its own statistics at fit time.
num_impute_values = X_train[features_num].median(numeric_only=True)

# Numeric columns: median-impute, then standardize (KNN is distance-based,
# so features must share a comparable scale).
# Categorical columns: one-hot encode; categories unseen during fit are
# encoded as all zeros instead of raising.
preprocess = ColumnTransformer([
    ('num', Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
    ]), features_num),
    ('cat', OneHotEncoder(handle_unknown='ignore'), features_cat),
])

# Grid over neighbor counts and distance metrics.
# For each metric, every k is fitted on the training split and scored on
# both splits; one accuracy-vs-k figure is drawn per metric, and all rows
# are gathered in `results` / `results_df`.
k_values = [3, 5, 11, 21]
metrics = ['euclidean', 'manhattan']
results = []

for metric in metrics:
    rows_for_metric = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
        model = Pipeline([('prep', preprocess), ('knn', knn)]).fit(X_train, y_train)
        row = {
            'metric': metric,
            'k': k,
            'acc_train': model.score(X_train, y_train),
            'acc_test': model.score(X_test, y_test),
        }
        rows_for_metric.append(row)
        results.append(row)

    # Accuracy curves for the metric just evaluated, in k_values order.
    train_scores = [entry['acc_train'] for entry in rows_for_metric]
    test_scores = [entry['acc_test'] for entry in rows_for_metric]

    plt.figure(figsize=(6, 4))
    for scores, marker, label in (
        (train_scores, 'o', 'Treino'),
        (test_scores, 's', 'Teste'),
    ):
        plt.plot(k_values, scores, marker=marker, label=label)
    plt.title(f'Acuracia vs k ({metric})')
    plt.xlabel('k')
    plt.ylabel('Acuracia')
    plt.ylim(0, 1)
    plt.xticks(k_values)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

# One row per (metric, k), ordered by metric and then by k.
results_table = pd.DataFrame(results)
results_df = results_table.sort_values(by=['metric', 'k'])
display(results_df)

# Best configuration.
# The winner is chosen by 5-fold cross-validated accuracy computed on the
# training split only. Choosing it by test accuracy and then reporting that
# same test accuracy would make the reported figure optimistically biased,
# because the test split would have been used for model selection.
# With an integer `cv` and a classifier, scikit-learn uses stratified folds
# without shuffling, so these scores are deterministic.
cv_means = []
for row in results_df.itertuples(index=False):
    candidate = Pipeline([
        ('prep', preprocess),
        ('knn', KNeighborsClassifier(n_neighbors=int(row.k), metric=row.metric)),
    ])
    cv_means.append(cross_val_score(candidate, X_train, y_train, cv=5).mean())

# Stored next to the train/test accuracies so all three can be compared.
results_df['acc_cv'] = cv_means

# Stable sort: ties keep the existing row order of results_df, so the
# selected configuration is deterministic.
best = results_df.sort_values('acc_cv', ascending=False, kind='stable').iloc[0]
best_k, best_metric = int(best.k), best.metric
print(f'Melhor: k={best_k}, metric={best_metric}, acc_cv={best.acc_cv:.3f}')

# Refit the selected configuration on the full training split.
best_model = Pipeline([
    ('prep', preprocess),
    ('knn', KNeighborsClassifier(n_neighbors=best_k, metric=best_metric))
])

best_model.fit(X_train, y_train)

# The test split is used here once, for the final performance report.
y_pred = best_model.predict(X_test)
print('Acuracia (teste):', round(accuracy_score(y_test, y_pred), 4))
cm = confusion_matrix(y_test, y_pred)
print('Matriz de confusao:\n', cm)
print('\nRelatorio:\n', classification_report(y_test, y_pred, digits=3))

# Confusion matrix per Acesso_Internet stratum of the test split.
if 'Acesso_Internet' in X_test.columns:
    # Pin the class order to the classes seen in the overall test results.
    # Without explicit labels, a stratum whose true and predicted values
    # contain a single class yields a 1x1 matrix, and the plot's tick labels
    # no longer identify the real class. With fixed labels every stratum
    # produces a matrix of the same shape and axis meaning.
    class_labels = np.union1d(y_test, y_pred)

    internet_test = X_test['Acesso_Internet']
    for cat in internet_test.unique():
        # Positional boolean mask: y_pred is a plain array aligned with
        # the row order of X_test / y_test, not with their index labels.
        mask = (internet_test == cat).to_numpy()
        if mask.sum() == 0:
            # Only possible when `cat` is NaN, since `== NaN` matches no row.
            continue
        cm_g = confusion_matrix(y_test[mask], y_pred[mask], labels=class_labels)
        print(f'\nEstrato: {cat} | n={mask.sum()}')
        print(cm_g)
        ConfusionMatrixDisplay(cm_g, display_labels=class_labels).plot(colorbar=False)
        plt.title(f'Matriz de confusao - {cat}')
        plt.show()
else:
    print('Sem coluna Acesso_Internet em X_test')
