In [36]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)


# ======================================================
# 2. Load Dataset
# ======================================================
# Raw string literal: the Windows path contains backslashes ("\D", "\E") that
# are invalid escape sequences in a normal string and trigger a SyntaxWarning
# on recent Python versions. The resulting path value is unchanged.
# The file is semicolon-delimited.
df = pd.read_csv(r"D:\Downloads\EngStudent_Db(Eng_Stats).csv", sep=";")


# ======================================================
# 3. DATA CLEANING 
# ======================================================


# Swap "," for "." in every object-dtype column so that decimal-comma values
# can later be parsed as numbers.
text_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in text_columns:
    df[name] = df[name].str.replace(",", ".", regex=False)

# Yes / No answers -> 1 / 0 (any other value becomes NaN).
yes_no_map = {"Yes": 1, "No": 0}
for flag_column in ("feeling_lost", "Schedule_effect"):
    df[flag_column] = df[flag_column].map(yes_no_map)

# Studying consistency -> integer code, stored in a new column.
study_map = {
    "Daily": 0,
    "Few times during the week": 1,
    "Day before exam": 2,
}
df["Studying_Consistency_Encoded"] = df["Studying_Consistency"].map(study_map)

# Gender -> integer code, stored in a new column.
gender_map = {"M": 0, "F": 1}
df["Gender_Encoded"] = df["Gender"].map(gender_map)

# Force these columns to a numeric dtype; values that cannot be parsed are
# coerced to NaN instead of raising.
numeric_cols = [
    "Stress_level",
    "Burnout_Score",
    "Average_hours_sleep",
    "Class attendance",
    "Troubled_Modules",
    "Performance_score",
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")


# ======================================================
# 4. Features
# ======================================================
# Predictor columns, in the order they appear in X.
features = [
    "Stress_level", "Burnout_Score", "Average_hours_sleep",
    "Class attendance", "Studying_Consistency_Encoded",
    "feeling_lost", "Schedule_effect",
    "Troubled_Modules", "Gender_Encoded",
]

# Rows without a target value cannot be used, so keep only the rows where
# Performance_score is present -- for both the predictors and the target.
mask = df["Performance_score"].notna()
X = df.loc[mask, features]
y = df.loc[mask, "Performance_score"]


# ======================================================
# 5. Train/Test Split
# ======================================================
# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split
# reproducible.
split_options = {"test_size": 0.25, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# ======================================================
# 6. LOGISTIC REGRESSION
# ======================================================
# Median-impute missing values, standardize the features, then fit a
# logistic regression.
log_reg_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000)),
]
log_reg = Pipeline(steps=log_reg_steps)

# Pipeline.fit returns the fitted pipeline, so predict can be chained.
y_pred_log = log_reg.fit(X_train, y_train).predict(X_test)

# Report the held-out metrics in a fixed order.
print("\n===== LOGISTIC REGRESSION =====")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))

# Pair each feature with its fitted coefficient (first row of coef_) and
# list them from the largest coefficient to the smallest.
log_coef = log_reg.named_steps["model"].coef_[0]
coef_df = pd.DataFrame({"Feature": features, "Coefficient": log_coef})
coef_df = coef_df.sort_values(by="Coefficient", ascending=False)

print("\nLogistic Regression Coefficients:")
print(coef_df)


# ======================================================
# 7. DECISION TREE
# ======================================================
# Median-impute missing values, then fit a shallow decision tree
# (depth capped at 4, at least 10 samples per leaf, fixed seed).
tree_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=10,
    random_state=42,
)
tree = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", tree_model),
])

# Pipeline.fit returns the fitted pipeline, so predict can be chained.
y_pred_tree = tree.fit(X_train, y_train).predict(X_test)

# Report the held-out metrics in a fixed order.
print("\n===== DECISION TREE =====")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))


# ======================================================
# 8. RANDOM FOREST
# ======================================================
# Median-impute missing values, then fit a random forest of 200 trees
# (depth capped at 6, at least 10 samples per leaf, fixed seed).
forest_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    min_samples_leaf=10,
    random_state=42,
)
rf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", forest_model),
])

# Pipeline.fit returns the fitted pipeline, so predict can be chained.
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)

# Report the held-out metrics in a fixed order.
print("\n===== RANDOM FOREST =====")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Pair each feature with the forest's importance score and list them from
# the most to the least important.
rf_importance = rf.named_steps["model"].feature_importances_
importance_df = pd.DataFrame({"Feature": features, "Importance": rf_importance})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

print("\nRandom Forest Feature Importance:")
print(importance_df)



===== LOGISTIC REGRESSION =====
Accuracy: 0.7931034482758621
Precision: 0.7333333333333333
Recall: 0.8461538461538461
F1 Score: 0.7857142857142857
Confusion Matrix:
 [[12  4]
 [ 2 11]]

Logistic Regression Coefficients:
                        Feature  Coefficient
1                 Burnout_Score     0.586117
5                  feeling_lost     0.510894
8                Gender_Encoded     0.495754
0                  Stress_level     0.455024
4  Studying_Consistency_Encoded     0.432625
6               Schedule_effect     0.238372
3              Class attendance     0.043694
7              Troubled_Modules    -0.102027
2           Average_hours_sleep    -0.569232

===== DECISION TREE =====
Accuracy: 0.7586206896551724
Precision: 0.8
Recall: 0.6153846153846154
F1 Score: 0.6956521739130435
Confusion Matrix:
 [[14  2]
 [ 5  8]]

===== RANDOM FOREST =====
Accuracy: 0.8620689655172413
Precision: 0.9090909090909091
Recall: 0.7692307692307693
F1 Score: 0.8333333333333334
Confusion Matrix:
 [[1

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values