In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, balanced_accuracy_score

# Load dataset
df = pd.read_csv("student_dropout.csv")

# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Exclude 'Dropped_Out', 'Final_Grade', 'Grade_1', and 'Grade_2' from features (X)
X = df.drop(["Dropped_Out", "Final_Grade", "Grade_1", "Grade_2"], axis=1)  # Drop the specified columns from the feature set
y = df["Dropped_Out"]  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42, class_weight='balanced'), param_grid, cv=5)
grid_search.fit(X_resampled, y_resampled)

# Best parameters from GridSearchCV
print("Best parameters found: ", grid_search.best_params_)

# Train the model with the best parameters
dtree = grid_search.best_estimator_
dtree.fit(X_resampled, y_resampled)

# Get feature importances from the decision tree model
feature_importances = dtree.feature_importances_

# Create a DataFrame with features and their importance scores
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by the importance score in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Display the top important features
print("Top 10 Important Features:")
print(importance_df.head(10))

# Evaluate the model using the test set
y_pred = dtree.predict(X_test)

# Print classification report for detailed evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate and print balanced accuracy score
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"\nBalanced Accuracy: {balanced_accuracy}")


Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Top 10 Important Features:
                        Feature  Importance
20       Wants_Higher_Education    0.214881
0                        School    0.104122
3                       Address    0.053598
14           Number_of_Failures    0.051273
29           Number_of_Absences    0.050689
2                           Age    0.048365
10   Reason_for_Choosing_School    0.048029
26  Weekend_Alcohol_Consumption    0.046675
19             Attended_Nursery    0.045654
11                     Guardian    0.043416

Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.83      0.86       169
        True       0.26      0.38      0.31        26

    accuracy                           0.77       195
   macro avg       0.58      0.61      0.58       195
weighted avg       0.81      0.77      0.79       195


Balanced Accuracy: 0.606508875739645
