In [None]:
# Importing The Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, precision_score, precision_recall_fscore_support, roc_auc_score, log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Reading The Dataset
# NOTE(review): absolute, machine-specific Windows path -- the script only runs
# where this exact file exists; consider making it configurable.
data = pd.read_csv(r'F:\brightfuture\ml_models\dropout_prediction\dropout_dataset_final.csv')

# Columns that are removed before modelling. Every name listed here is dropped;
# whatever else the CSV contains (including 'Target', which the preprocessing
# step reads) is kept.
# Names that contain an apostrophe are written with double quotes: inside a
# single-quoted literal the apostrophe would terminate the string and make the
# whole list a syntax error.
# 'Nacionality' is spelled as-is on purpose -- it has to match the CSV header.
columns = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Gender',
    'Age at enrollment',
    'International',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
    'Scholarship holder',
]

# Raises KeyError if any listed column is missing from the CSV.
data = data.drop(columns=columns)

# Data Preprocessing
# Keep only rows whose outcome is not 'Enrolled', then turn the remaining
# labels into a binary target: Dropout -> 0, Graduate -> 1.
# Any label outside this mapping becomes NaN (Series.map semantics).
# NOTE: despite its name, `data_no_outliers` has had no outlier removal
# applied -- it is just an independent copy of the filtered frame.
target_encoding = {'Dropout': 0, 'Graduate': 1}
not_enrolled = data['Target'].ne('Enrolled')

data_filtered = data.loc[not_enrolled]
data_no_outliers = data_filtered.copy()  # copy so the assignment below does not touch a view of `data`
data_no_outliers['Target'] = data_no_outliers['Target'].map(target_encoding)

# Model Selection and Training
# Separate the label column from the feature columns.
y = data_no_outliers['Target']
X = data_no_outliers.drop(columns='Target')

# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions (the test rows never influence the fit).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler
# Written to the current working directory.
joblib.dump(scaler, "scaler.joblib")

# Random Forest
# Seed the forest so the grid search and the resulting model are reproducible,
# using the same seed value as the train/test split.
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 4 x 4 = 16 combinations, each scored by accuracy
# with 5-fold cross-validation on the training data.
param_grid = {'n_estimators': [50, 100, 150, 200], 'max_depth': [5, 10, 20, 30]}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

# GridSearchCV refits the winning combination on the full training set by
# default (refit=True) and exposes it as `best_estimator_`. Reuse that model
# instead of training another, differently-seeded forest from `best_params`,
# so the evaluated/saved model is exactly the one the search selected.
best_rf = grid_search.best_estimator_

# Predicting the model
# Score the tuned forest on the held-out test partition.
y_pred = best_rf.predict(X_test)

# Printing the accuracy of the model
# Precision, recall and F1 are averaged across classes weighted by class
# support; the fourth return value is discarded.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=y_pred,
    average='weighted',
)

# One metric per line, rounded to two decimals.
report_lines = [
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
]
print("\n".join(report_lines))

# Saving the model
# Written to the current working directory.
joblib.dump(best_rf, "dropout_model.joblib")
