In [1]:
import pandas as pd

# Load cleaned dataset
df = pd.read_csv("cleaned_student_data.csv")

# Display first few rows
df.head()


Unnamed: 0,Target_num,Curricular units 1st sem (grade),Admission grade,Previous qualification (grade),Age at enrollment
0,0,0.0,127.3,122.0,20
1,2,14.0,142.5,160.0,19
2,0,0.0,124.8,122.0,19
3,2,13.428571,119.6,122.0,20
4,2,12.333333,141.5,102.5,34


In [2]:
from sklearn.model_selection import train_test_split

# Features and target
X = df.drop('Target_num', axis=1)
y = df['Target_num']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, acc, f1))

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "F1 Score"])
results_df.sort_values(by="Accuracy", ascending=False, inplace=True)
results_df


Unnamed: 0,Model,Accuracy,F1 Score
0,Logistic Regression,0.625,0.55642
3,SVM,0.611446,0.529938
2,Random Forest,0.593373,0.575322
4,KNN,0.568524,0.555733
1,Decision Tree,0.520331,0.522845


In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Define model parameters for tuning
param_grids = {
    "Logistic Regression": {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear']
    },
    "Decision Tree": {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    },
    "Random Forest": {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15, None]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly']
    },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }
}

best_models = []
for name, model in models.items():
    print(f"üîç Tuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    
    best_model = grid.best_estimator_
    best_score = grid.best_score_
    print(f"‚úÖ Best params for {name}: {grid.best_params_}")
    print(f"CV Accuracy: {best_score:.4f}\n")
    
    best_models.append((name, best_model, best_score))

# Display summary of best models
best_df = pd.DataFrame(best_models, columns=["Model", "Best Model", "CV Accuracy"])
best_df.sort_values(by="CV Accuracy", ascending=False, inplace=True)
best_df


üîç Tuning Logistic Regression...
‚úÖ Best params for Logistic Regression: {'C': 0.1, 'solver': 'lbfgs'}
CV Accuracy: 0.6237

üîç Tuning Decision Tree...
‚úÖ Best params for Decision Tree: {'max_depth': 5, 'min_samples_split': 10}
CV Accuracy: 0.6356

üîç Tuning Random Forest...
‚úÖ Best params for Random Forest: {'max_depth': 10, 'n_estimators': 100}
CV Accuracy: 0.6434

üîç Tuning SVM...


In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Separate features and target
X = df_cleaned.drop('Target_num', axis=1)
y = df_cleaned['Target_num']

# Show original class distribution
print("Original class distribution:", Counter(y))

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Show new balanced class distribution
print("Balanced class distribution:", Counter(y_resampled))


In [None]:
# ==========================
# üì¶ MODEL TRAINING PIPELINE WITH SMOTE
# ==========================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

# ==========================
# 1Ô∏è‚É£ Load cleaned dataset
# ==========================
df = pd.read_csv("cleaned_student_data.csv")

# Separate features and target
X = df.drop("Target_num", axis=1)
y = df["Target_num"]

# ==========================
# 2Ô∏è‚É£ Split dataset
# ==========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# ==========================
# 3Ô∏è‚É£ Apply SMOTE
# ==========================
print("Before SMOTE:", Counter(y_train))
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_train_resampled))

# ==========================
# 4Ô∏è‚É£ Standardize features
# ==========================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# ==========================
# 5Ô∏è‚É£ Define models
# ==========================
models = {
    "üå≤ Random Forest": RandomForestClassifier(random_state=42),
    "üìâ Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "üå≥ Decision Tree": DecisionTreeClassifier(random_state=42),
    "‚öôÔ∏è Support Vector Machine": SVC(kernel='rbf', random_state=42),
    "üë• K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5)
}

# ==========================
# 6Ô∏è‚É£ Train and Evaluate each model
# ==========================
results = []

for name, model in models.items():
    print(f"\n===== Training {name} =====")
    model.fit(X_train_scaled, y_train_resampled)
    y_pred = model.predict(X_test_scaled)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    results.append({"Model": name, "Accuracy": acc})

# ==========================
# 7Ô∏è‚É£ Compare Model Accuracies
# ==========================
results_df = pd.DataFrame(results)
print("\n\nüìä Model Accuracy Comparison:")
print(results_df)

# ==========================
# 8Ô∏è‚É£ Visualize accuracy comparison
# ==========================
plt.figure(figsize=(8,5))
sns.barplot(data=results_df, x="Model", y="Accuracy")
plt.title("Model Accuracy Comparison (After SMOTE)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
