In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Goal: Predict whether a student will PASS (1) or FAIL (0) the exam.
# Inputs (features): study_hours, attendance, previous_score
# Output (label): pass_exam

In [5]:
data = {
    "study_hours":      [1, 2, 3, 4, 5, 6, 2, 3, 7, 8, 1, 5, 4, 6, 7, 3, 2, 8, 9, 4],
    "attendance_pct":   [50, 60, 65, 70, 75, 80, 55, 60, 85, 90, 40, 78, 72, 88, 92, 66, 58, 95, 96, 70],
    "previous_score":   [35, 40, 45, 50, 55, 60, 38, 42, 70, 75, 30, 58, 52, 72, 78, 46, 41, 82, 85, 50],
    # 1 = pass, 0 = fail (our label)
    "pass_exam":        [0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1]
}

In [35]:
df = pd.DataFrame(data)
print("First few rows of the dataset:")
print(df.tail())

First few rows of the dataset:
    study_hours  attendance_pct  previous_score  pass_exam
15            3              66              46          0
16            2              58              41          0
17            8              95              82          1
18            9              96              85          1
19            4              70              50          1


In [9]:
# Clean Data
print("\nChecking for missing values:")
print(df.isna().sum())


Checking for missing values:
study_hours       0
attendance_pct    0
previous_score    0
pass_exam         0
dtype: int64


In [11]:
# Remove duplicates (if any)
df = df.drop_duplicates()

In [15]:
# Feature Engineering ===== Very important 
df["study_efficiency"] = df["previous_score"] / df["study_hours"]

print("Data with engineered feature (study_efficiency):")
print(df.head())

Data with engineered feature (study_efficiency):
   study_hours  attendance_pct  previous_score  pass_exam  study_efficiency
0            1              50              35          0              35.0
1            2              60              40          0              20.0
2            3              65              45          0              15.0
3            4              70              50          0              12.5
4            5              75              55          1              11.0


In [17]:
# Separate features (X) and label (y)
feature_cols = ["study_hours", "attendance_pct", "previous_score", "study_efficiency"]
X = df[feature_cols]
y = df["pass_exam"]

In [19]:
# Train / Validation / Test Split
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
)
# 0.1765 of 85% â‰ˆ 15% of total, so we get ~70/15/15

print(f"\nTrain size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")


Train size: 13, Validation size: 4, Test size: 3


In [23]:
#Select Model & Build Pipeline

model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression())
])

In [25]:
# Train the Model

model.fit(X_train, y_train)

# Evaluate on validation set (used to tune hyperparameters in real projects)
y_val_pred = model.predict(X_val)

In [27]:
print("\nValidation performance:")
print(confusion_matrix(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred, digits=3))


Validation performance:
[[2 0]
 [0 2]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         2
           1      1.000     1.000     1.000         2

    accuracy                          1.000         4
   macro avg      1.000     1.000     1.000         4
weighted avg      1.000     1.000     1.000         4



In [29]:
# Final evaluation on the unseen test set
y_test_pred = model.predict(X_test)

print("\nTest performance (on unseen data):")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=3))


Test performance (on unseen data):
[[1 0]
 [0 2]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         1
           1      1.000     1.000     1.000         2

    accuracy                          1.000         3
   macro avg      1.000     1.000     1.000         3
weighted avg      1.000     1.000     1.000         3



In [31]:
# Deploy / Monitor (Simulation)
# ==============================
# For demo, we'll simulate "deployment" by predicting for a new student.

new_student = pd.DataFrame({
    "study_hours":      [5],
    "attendance_pct":   [80],
    "previous_score":   [55],
    "study_efficiency": [55/5]
})

prediction = model.predict(new_student)[0]
proba = model.predict_proba(new_student)[0, 1]  # probability of passing

In [33]:
print("\nNew student data:")
print(new_student)

print(f"\nModel prediction (1 = pass, 0 = fail): {prediction}")
print(f"Predicted probability of passing: {proba:.2f}")


New student data:
   study_hours  attendance_pct  previous_score  study_efficiency
0            5              80              55              11.0

Model prediction (1 = pass, 0 = fail): 1
Predicted probability of passing: 0.83
