In [None]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# ---------------- Load dataset ----------------
# The CSV location can be overridden with the STUDENT_MAT_CSV environment
# variable so the notebook can run on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "STUDENT_MAT_CSV",
    r"C:\HOPE\my progress\Amazon ML cohort\week 3 project\student+performance\student\student-mat.csv",
)
# The file is read as semicolon-delimited, hence sep=";".
df = pd.read_csv(DATA_PATH, sep=";")

In [5]:
# ---------------- Create binary target ----------------
# Pass is 1 when the final grade G3 is at least 10 and 0 otherwise. G3 is
# removed in the same step so the label's source column cannot be a feature.
# NOTE(review): G1 and G2 stay in the feature set -- confirm that is intended
# and that they would be available at prediction time.
passed = df['G3'].ge(10).astype(int)
df = df.assign(Pass=passed).drop(columns=['G3'])

In [7]:

# ---------------- Encode categorical variables ----------------
# Every object-dtype column is replaced by integer codes. The fitted encoder
# for each column is kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back to the original labels later.
# NOTE(review): scikit-learn documents LabelEncoder for target values; the
# integer codes impose an arbitrary order on nominal features -- confirm this
# is acceptable for the linear model.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_cols}
for name, encoder in label_encoders.items():
    df[name] = encoder.transform(df[name])

In [9]:
# ---------------- Split into train/test ----------------
# Separate the label from the features, then hold out 20% of the rows for
# evaluation. The fixed random_state keeps the split reproducible.
y = df['Pass']
X = df.loc[:, df.columns != 'Pass']
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

In [10]:
# ---------------- Logistic Regression ----------------
# Fit a logistic regression on the training split and report accuracy plus
# per-class precision/recall/F1 on the held-out test split.
print("===== Logistic Regression =====")
# max_iter is raised to 1000 to give the solver more iterations.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, y_pred_log)
print("Accuracy:", log_accuracy)
print(classification_report(y_test, y_pred_log))

===== Logistic Regression =====
Accuracy: 0.9493670886075949
              precision    recall  f1-score   support

           0       0.90      0.96      0.93        27
           1       0.98      0.94      0.96        52

    accuracy                           0.95        79
   macro avg       0.94      0.95      0.94        79
weighted avg       0.95      0.95      0.95        79



In [11]:
# ---------------- Random Forest ----------------
# Fit a 100-tree random forest on the same training split and report the same
# metrics as the logistic regression cell. random_state makes the forest
# reproducible.
print("\n===== Random Forest =====")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


===== Random Forest =====
Accuracy: 0.9113924050632911
              precision    recall  f1-score   support

           0       0.83      0.93      0.88        27
           1       0.96      0.90      0.93        52

    accuracy                           0.91        79
   macro avg       0.90      0.91      0.90        79
weighted avg       0.92      0.91      0.91        79



In [12]:
# ---------------- Feature Importance from Random Forest ----------------
# Label the forest's importance scores with the feature column names, then
# show the ten highest-scoring features in descending order.
importances = pd.Series(data=rf_model.feature_importances_, index=X.columns)
top_10 = importances.sort_values(ascending=False).iloc[:10]
print("\nTop 10 Important Features (Random Forest):")
print(top_10)


Top 10 Important Features (Random Forest):
G2          0.362918
G1          0.206461
absences    0.040848
age         0.030221
failures    0.029940
goout       0.025687
Fedu        0.023092
Mjob        0.019468
freetime    0.019435
health      0.017991
dtype: float64


In [None]:
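# Which model performed best and why?
# In our case, Logistic Regression outperformed the Random Forest classifier on the held-out test set:
# accuracy 0.95 vs. 0.91, with higher recall for both classes (0.96/0.94 vs. 0.93/0.90).
# A likely reason is that G2 and G1 dominate the feature importances, and passing is close to a simple
# threshold on those grades, so a model that is linear in the log-odds space already fits the problem well.
# Random Forest can capture nonlinear relationships and complex interactions between variables — for example,
# how study time and number of failures together influence performance — without us manually engineering those features,
# but on a dataset this small that extra flexibility did not translate into better test performance.
# With only 79 test students, the gap amounts to about three predictions, so it should be read with caution.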

In [None]:
# Which one was fastest?
# Training time was not measured in this notebook, but Logistic Regression is expected to be faster to train and
# predict: it is a simple linear model that fits a small number of parameters, so the computational cost is minimal.
# Random Forest should still be reasonably fast on a dataset this small,
# but it is slower because it builds 100 decision trees.

In [None]:
#Which one was most interpretable?
#Logistic Regression is more interpretable. 
# The coefficients directly tell us how much each feature increases or decreases the probability of passing, 
# which is valuable when we need to explain results to non-technical stakeholders. 
# Random Forest can give us feature importances, but it’s more of a black box, 
# so explaining individual predictions is harder.

In [None]:
# When would you use one over the other?
# If my priority is explainability and I believe the data follows a roughly linear relationship,
# I’d use Logistic Regression. It’s quick, transparent, and easy to communicate.
# If my goal is maximizing predictive performance and I expect nonlinear patterns or complex feature interactions,
# I’d use Random Forest. It copes with a mix of numeric and encoded categorical features without scaling,
# is robust to outliers, and often gives higher accuracy in such cases — although not on this dataset,
# where Logistic Regression scored higher.