In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib

import warnings
warnings.filterwarnings("ignore")

In [65]:
df = pd.read_excel('https://github.com/MichaelJourdain93/Datathon_Passos_Magicos/raw/main/Datasets/dt_curated_passos_magicos.xlsx', engine='openpyxl')

In [66]:
df.head()

Unnamed: 0,Ano Letivo,RA,Fase,Turma,Nome,Idade,Gênero,Ano ingresso,Instituição de ensino,Pedra,...,Mat,Por,Ing,Fase Ideal,Defasagem,Destaque IEG,Destaque IDA,Destaque IPV,Atingiu PV,REC AV
0,2022,RA-1,7,A,Aluno-1,19,Feminino,2016,Escola Pública,Quartzo,...,2.7,3.5,6.0,Fase 8,-1,Melhorar entrega,Empenhar mais,Integra-se mais,Não,Manter atual com bolsa
1,2022,RA-112,4,A,Aluno-112,15,Feminino,2016,Rede Decisão,Ametista,...,9.0,7.5,9.1,Fase 5,-1,Boa entrega,Boas notas,Boa integração,Sim,Promovido fase
2,2022,RA-106,4,A,Aluno-106,15,Feminino,2016,Rede Decisão,Ametista,...,7.8,7.2,7.2,Fase 5,-1,Boa entrega,Empenhar mais,Boa integração,Sim,Promovido fase
3,2022,RA-105,4,A,Aluno-105,17,Masculino,2016,Rede Decisão,Agata,...,7.2,6.2,6.8,Fase 7,-3,Boa entrega,Empenhar mais,Integra-se mais,Não,Promovido fase
4,2022,RA-267,3,I,Aluno-267,15,Feminino,2016,Escola Pública,Agata,...,2.3,4.8,2.7,Fase 5,-2,Melhorar entrega,Empenhar mais,Integra-se mais,Não,Manter atual com bolsa


In [67]:
df_model = df[['Ano Letivo','Pedra','INDE','IAA','IEG','IPS','IDA','IPV','IAN','IPP','Idade']]

In [68]:
# 0 = Em risco; 1 = Não está em risco
df_model['RISCO'] = df_model['Pedra'].apply(lambda x: 1 if x != 'Quartzo' else 0).drop(columns=['Pedra'])

In [69]:
df_model_new = df_model.fillna(0)

In [70]:
df_model_new

Unnamed: 0,Ano Letivo,Pedra,INDE,IAA,IEG,IPS,IDA,IPV,IAN,IPP,Idade,RISCO
0,2022,Quartzo,5.8,8.3,4.1,5.6,4.0,7.3,5.0,8.3,19,0
1,2022,Ametista,7.8,7.9,7.8,6.3,8.5,8.8,5.0,8.6,15,1
2,2022,Ametista,8.0,9.2,8.0,9.4,7.4,8.4,5.0,8.8,15,1
3,2022,Agata,6.8,7.9,8.2,5.6,6.7,7.4,2.5,7.4,17,1
4,2022,Agata,6.0,10.0,7.4,5.0,3.3,6.3,5.0,6.0,15,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3025,2024,Ametista,7.6,9.0,8.2,7.5,7.8,7.5,5.0,7.5,9,1
3026,2024,Topázio,8.4,6.8,8.3,7.5,9.0,8.5,10.0,7.5,7,1
3027,2024,Topázio,8.2,7.4,8.5,7.5,9.0,7.4,10.0,7.5,7,1
3028,2024,Ametista,7.5,10.0,8.3,7.5,6.5,8.3,5.0,6.2,9,1


Ordem dos dados que devem ser passado na previsão
ANO
IAA
IEG
IPS
IDA
IPP
IPV
IAN

In [71]:
df_model_new['RISCO'].value_counts()

Unnamed: 0_level_0,count
RISCO,Unnamed: 1_level_1
1,2548
0,482


In [72]:
X = df_model_new.drop(columns=['RISCO','Pedra','Idade','INDE', 'Ano Letivo'])
y = df_model_new['RISCO']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [74]:
smote = SMOTE(random_state=42)
X_train_resempled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [75]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [76]:
results = {}
for name, model in models.items():
    # Validação cruzada para avaliar desempenho
    cv_scores = cross_val_score(model, X_train_resempled, y_train_resampled, cv=5)
    model.fit(X_train_resempled, y_train_resampled)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1_Score = f1_score(y_test, y_pred)
    precision_Score = precision_score(y_test, y_pred)
    recall_Score = recall_score(y_test, y_pred)
    results[name] = {
        "Cross-Validation Score (Mean)": np.mean(cv_scores),
        "Acurácia": round(accuracy * 100,2),
        "f1 score": f1_Score,
        "precision score": precision_Score,
        "recall score" : recall_Score
    }

In [77]:
# Gradient Boosting
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
bst.fit(X_train_resempled, y_train_resampled)
preds = bst.predict(X_test)
accuracy_bst = accuracy_score(y_test, preds)
f1_Score_bst = f1_score(y_test, preds)
precision_Score_bst = precision_score(y_test, y_pred)
recall_Score_bst = recall_score(y_test, y_pred)

In [78]:
results['Gradient Boosting'] = {
    "Cross-Validation Score (Mean)": 0,
    "Acurácia": round(accuracy_bst * 100,2),
    "f1 score": f1_Score_bst,
    "precision score": precision_Score_bst,
    "recall score" : recall_Score_bst
}

In [79]:
results_df = pd.DataFrame(results).T
results_df

Unnamed: 0,Cross-Validation Score (Mean),Acurácia,f1 score,precision score,recall score
Random Forest,0.986835,97.36,0.983806,0.989817,0.977867
Support Vector Machine,0.986347,98.51,0.990863,1.0,0.981891
Logistic Regression,0.988054,99.17,0.994944,1.0,0.98994
Decision Tree,0.971479,94.55,0.966767,0.967742,0.965795
K-Nearest Neighbors,0.982202,96.53,0.97868,0.987705,0.969819
Gradient Boosting,0.0,88.78,0.927966,0.987705,0.969819


In [80]:
results_df.to_csv("model_passosmag.csv", sep=';')

In [81]:
# Melhor modelo baseado na métrica de validação cruzada
best_model_name = results_df["f1 score"].idxmax()
best_model = models[best_model_name]
print(f"\nMelhor modelo: {best_model_name}")


Melhor modelo: Logistic Regression


In [82]:
filename = f"../best_model_{best_model_name.replace(' ', '_').lower()}.pkl"
joblib.dump(best_model, filename)
print(f"Melhor modelo salvo como: {filename}")

Melhor modelo salvo como: ../best_model_logistic_regression.pkl


In [83]:
loaded_model = joblib.load(filename)

In [84]:
dados = {'IAA':2,
'IEG':3,
'IPS':4,
'IDA':5,
'IPP':6,
'IPV':7,
'IAN':1}

df_modelo = pd.DataFrame([dados])

In [85]:
# Reorder columns of df_modelo to match the training data (X_train_resempled)
df_modelo = df_modelo[X_train_resempled.columns] # Use the column order from X_train_resempled

pred = loaded_model.predict(df_modelo)


In [86]:
display(pred[0])

0