In [9]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt

In [12]:
# Location of the raw CSV, relative to this notebook.
# Note: despite its name, `df` holds a filesystem path, not a DataFrame.
df = Path(
    "../Resources/Student performance (Polytechnic Institute of Portalegre).csv"
)


In [13]:
# Read the raw student-performance table from the path stored in `df`.
# NOTE(review): the rendered preview shows an "Unnamed: 0" header; if that is a
# saved row index inside the CSV it is currently loaded as a regular column and
# will end up in the feature matrix -- confirm against the file.
performance_df = pd.read_csv(df)
# Preview the first five rows
performance_df.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [None]:
# Define the mapping from class label to integer code
mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}
# Apply the mapping to the 'Target' column.
# Values that are not mapping keys (for example the integer codes left behind by
# an earlier run of this cell) pass through unchanged, so re-running the cell is
# idempotent instead of silently turning the whole column into NaN.
encoded_target = performance_df['Target'].map(lambda label: mapping.get(label, label))
# Fail loudly on anything that is neither a known label nor a known code
# (this also catches missing values, which would otherwise surface much later).
unexpected_values = set(encoded_target.unique()) - set(mapping.values())
if unexpected_values:
    raise ValueError(
        f"Unexpected values in 'Target': {sorted(unexpected_values, key=str)}"
    )
performance_df['Target'] = encoded_target.astype(int)
# Convert the 'Target' column to an array
target_array = performance_df['Target'].values
print(target_array)

In [None]:
# Feature matrix: every column except the label.
# `drop` returns a new frame, so `performance_df` itself is left untouched.
X = performance_df.drop(columns="Target")
X.head()

In [None]:
# Define target vector as a 1-D NumPy array.
# `Series.ravel()` is deprecated in recent pandas releases; a Series is already
# one-dimensional, so `to_numpy()` yields the same array without the warning.
y = performance_df["Target"].to_numpy()
y[:5]

In [None]:
# Partition features and labels into train and test subsets.
# The fixed random_state keeps the partition reproducible across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [None]:
# Creating StandardScaler instance (not yet fitted; fitting happens in a later cell)
scaler = StandardScaler()

In [None]:
# Fitting the StandardScaler on the training split only, so the test split
# does not influence the scaling parameters
X_scaler = scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

In [None]:
# Create a random forest classifier with 500 trees; the fixed random_state
# makes the fitted forest reproducible
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [None]:
# Quick look at the training labels before fitting
print(y_train)

In [None]:
# Fitting the model on the scaled training features and training labels;
# the result of `fit` is bound back to `rf_model`
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the scaled testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
# Class codes and display names, in the order used when 'Target' was encoded
# (Dropout -> 0, Enrolled -> 1, Graduate -> 2).
class_codes = [0, 1, 2]
class_names = ["Dropout", "Enrolled", "Graduate"]

# Passing `labels` pins the matrix to 3x3 in a known order. Without it the
# matrix only contains classes that actually occur in y_test/predictions, and
# the fixed three-name index below would then fail with a shape mismatch.
cm = confusion_matrix(y_test, predictions, labels=class_codes)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {name}" for name in class_names],
    columns=[f"Predicted {name}" for name in class_names],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Report the evaluation results: confusion matrix, accuracy, per-class metrics
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(report_text)

In [None]:
# The fitted forest exposes one importance score per input column
importances = rf_model.feature_importances_
# Pair each score with its column name and list them from most to least important
sorted(zip(importances, X.columns), reverse=True)

In [None]:
# Visualize the features by importance
# Build the importance table directly, indexed by feature name. The index is
# left unnamed so pandas does not print a stray axis label on the bar chart.
importances_df = pd.DataFrame(
    {'Feature Importances': rf_model.feature_importances_},
    index=X.columns,
).sort_values(by='Feature Importances', ascending=False)

# Ascending order puts the most important feature at the top of a horizontal bar chart
importances_sorted = importances_df.sort_values(by='Feature Importances')
ax = importances_sorted.plot(
    kind='barh', color='lightgreen', title='Features Importances', legend=False
)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.tight_layout()

# Make sure the output directory exists; savefig does not create it and would
# otherwise raise FileNotFoundError on a fresh checkout.
figure_path = Path('Figures/Features_Importances.PNG')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path)
plt.show()