# Student Dropout Prediction System
## Machine Learning Model Development


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the dataset
# Read the raw records from 'dataset.csv' and print a quick overview of
# their size and schema.
df = pd.read_csv('dataset.csv')
column_names = df.columns.tolist()

print(f"Dataset shape: {df.shape}")
print(f"\nColumn names:\n{column_names}")
print("\nFirst few rows:")
# Bare trailing expression: in a notebook cell this displays the preview.
df.head()


In [None]:
# Check for missing values
# First the number of nulls in every column, then how many rows carry each
# distinct value of the 'Target' label column.
print("Missing values:")
null_counts = df.isnull().sum()
print(null_counts)

print("\nTarget distribution:")
label_counts = df['Target'].value_counts()
print(label_counts)


In [None]:
# Feature Engineering
# Derive per-semester and overall indicators from the curricular-unit columns.
enrolled_1st = df['Curricular units 1st sem (enrolled)']
enrolled_2nd = df['Curricular units 2nd sem (enrolled)']
approved_1st = df['Curricular units 1st sem (approved)']
approved_2nd = df['Curricular units 2nd sem (approved)']

# "Attendance" here is the share of enrolled units that were approved,
# expressed as a percentage. A zero enrolment count is swapped for 1 in the
# denominator so the division never hits zero.
safe_enrolled_1st = enrolled_1st.replace(0, 1)
safe_enrolled_2nd = enrolled_2nd.replace(0, 1)
df['Attendance_1st_sem'] = (approved_1st / safe_enrolled_1st) * 100
df['Attendance_2nd_sem'] = (approved_2nd / safe_enrolled_2nd) * 100

# Mean of the two semester percentages; rows where it is undefined become 0.
attendance_mean = (df['Attendance_1st_sem'] + df['Attendance_2nd_sem']) / 2
df['Overall_Attendance'] = attendance_mean.fillna(0)

# Backlog: units enrolled in but not approved, per semester and in total.
df['Backlog_1st_sem'] = enrolled_1st - approved_1st
df['Backlog_2nd_sem'] = enrolled_2nd - approved_2nd
df['Total_Backlog'] = df['Backlog_1st_sem'] + df['Backlog_2nd_sem']

# Semester grades with missing values treated as 0, then their mean.
df['Avg_Grade_1st'] = df['Curricular units 1st sem (grade)'].fillna(0)
df['Avg_Grade_2nd'] = df['Curricular units 2nd sem (grade)'].fillna(0)
grade_mean = (df['Avg_Grade_1st'] + df['Avg_Grade_2nd']) / 2
df['Overall_Grade'] = grade_mean.fillna(0)

# Number of units enrolled in across both semesters.
df['Total_Enrolled'] = enrolled_1st + enrolled_2nd

print("Feature engineering completed!")


In [None]:
# Select relevant features for prediction
# The order of this list fixes the column order of X, and the same list is
# what gets saved alongside the model, so keep it stable.
features = [
    # Columns taken straight from the dataset. The trailing notes are the
    # author's interpretation of each column.
    'Age at enrollment',
    'Course',                   # Degree
    'Tuition fees up to date',  # Fees status
    'Debtor',                   # Treated as a family income indicator
    'Scholarship holder',
    'Gender',
    'Previous qualification',
    # Engineered columns
    'Overall_Attendance',
    'Total_Backlog',
    'Overall_Grade',
    'Total_Enrolled',
    # Remaining dataset columns
    'Daytime/evening attendance',
    'Displaced',
    'Educational special needs',
    'International',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 2nd sem (enrolled)',
]

# Prepare features and target as independent copies so later changes do not
# write back into df.
y = df['Target'].copy()
X = df[features].copy()

feature_shape, target_shape = X.shape, y.shape
print(f"Features shape: {feature_shape}")
print(f"Target shape: {target_shape}")


In [None]:
# Encode target variable for binary classification:
# rows labelled 'Dropout' become 1, every other label becomes 0.
y_binary = y.eq('Dropout').astype(int)

# The mean of a 0/1 column is the fraction of ones, i.e. the dropout share.
dropout_pct = y_binary.mean() * 100
print(f"Dropout rate: {dropout_pct:.2f}%")
print("\nTarget distribution:")
print(y_binary.value_counts())


In [None]:
# Split the data: 80% for training, 20% held out for testing.
# Stratifying on the binary label keeps its class proportions in both parts;
# the fixed seed makes the partition repeatable.
split_parts = train_test_split(
    X,
    y_binary,
    test_size=0.2,
    random_state=42,
    stratify=y_binary,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


In [None]:
# Train Random Forest Classifier
# Hyperparameters are gathered in one place so they are easy to inspect.
rf_params = {
    'n_estimators': 100,      # number of trees in the forest
    'max_depth': 20,          # cap on the depth of each tree
    'min_samples_split': 5,   # samples required to split an internal node
    'min_samples_leaf': 2,    # samples required at a leaf
    'random_state': 42,       # fixed seed
    'n_jobs': -1,             # -1 asks scikit-learn to use all processors
}
rf_model = RandomForestClassifier(**rf_params)

rf_model.fit(X_train, y_train)
print("Model trained successfully!")


In [None]:
# Make predictions on the held-out test set
y_pred = rf_model.predict(X_test)
# Column 1 of predict_proba is the probability assigned to label 1 (dropout).
class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Evaluate the model: overall accuracy, per-class report, confusion matrix
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(report_text)
print("\nConfusion Matrix:")
print(confusion)


In [None]:
# Feature importance
# Pair each feature name with the importance the fitted forest gave it,
# then list them from most to least important.
importance_table = pd.DataFrame(
    {
        'Feature': features,
        'Importance': rf_model.feature_importances_,
    }
)
feature_importance = importance_table.sort_values('Importance', ascending=False)

print("Feature Importance:")
print(feature_importance)


In [None]:
# Save the model and feature list
# Both the fitted classifier and the ordered feature names are written to
# the working directory.
model_path = 'dropout_model.pkl'
features_path = 'model_features.pkl'

joblib.dump(rf_model, model_path)
joblib.dump(features, features_path)

print("Model saved successfully!")
print(f"\nModel file: {model_path}")
print(f"Features file: {features_path}")


In [None]:
# Test prediction example
# Take the first test row as a one-row DataFrame and score it.
sample_data = X_test.iloc[:1]
# Row 0, column 1: probability of label 1 (dropout), scaled to a percentage.
sample_probabilities = rf_model.predict_proba(sample_data)
prediction = sample_probabilities[0, 1] * 100

print("Sample prediction:")
print(f"Dropout probability: {prediction:.2f}%")
print("\nSample data:")
print(sample_data)
