# DS 424 — Lab Final (Topic A)
ID:221-35-938

## Student Exam Result (Tabular Classification)
This notebook follows the brief exactly: load & inspect the data, run EDA (≥2 plots), split 80/20, standardize, build a compact ANN (two optimizers × two learning rates, ≤20 epochs), evaluate on the test set (accuracy, precision, recall, F1), plot the confusion matrix, and close with a 3–5 line conclusion.

> If `student_performance.csv` is missing, this notebook will create a synthetic dataset with similar semantics so you can run end-to-end.

In [None]:
import os, warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import tensorflow as tf
from tensorflow import keras
# Seed Python's `random`, NumPy and TensorFlow in a single call. Seeding only
# NumPy and TensorFlow leaves Python's `random` module unseeded.
# NOTE(review): newer Keras releases appear to draw default weight-initializer
# seeds from Python's `random`; confirm against the installed version.
keras.utils.set_random_seed(42)
print('TensorFlow:', tf.__version__)

### 1) Load & Inspect

In [None]:
# Load the exam dataset from disk, or fabricate (and persist) a synthetic
# stand-in with the same column names when the CSV is not present.
csv_path = 'student_performance.csv'

if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(f"[INFO] Loaded '{csv_path}' with shape {df.shape}.")
else:
    generator = np.random.default_rng(42)
    n_rows = 300

    # Draw order is kept fixed (four features, then noise) so the generated
    # data stays the same for a given seed.
    hours_study = generator.uniform(0, 10, size=n_rows)
    attendance_pct = generator.uniform(50, 100, size=n_rows)
    previous_score = generator.uniform(0, 100, size=n_rows)
    sleep_hours = generator.uniform(4, 9, size=n_rows)
    noise = generator.normal(0, 1, n_rows)

    # Linear score plus noise; splitting at the median gives balanced classes.
    score = 0.6*hours_study + 0.04*attendance_pct + 0.03*previous_score - 0.1*(8 - sleep_hours) + noise
    labels = (score > np.median(score)).astype(int)

    df = pd.DataFrame({
        'Hours_Study': hours_study,
        'Attendance': attendance_pct,
        'Previous_Score': previous_score,
        'Sleep_Hours': sleep_hours,
        'Exam_Result': labels,
    })
    df.to_csv(csv_path, index=False)
    print(f"[INFO] '{csv_path}' not found. Created a synthetic dataset with shape {df.shape}.")

# Quick inspection: first rows, dimensions, column dtypes, and null counts.
display(df.head())
print('\nShape:', df.shape)
print('\nDtypes:\n', df.dtypes)
print('\nMissing values per column:\n', df.isna().sum())

### 2) EDA — Two Plots

In [None]:
# Two side-by-side EDA panels: the distribution of study hours, and study
# hours against attendance with points coloured by the exam outcome.
fig, (ax_hist, ax_scatter) = plt.subplots(1, 2, figsize=(10, 4))

ax_hist.hist(df['Hours_Study'], bins=20)
ax_hist.set_title('Histogram: Hours_Study')
ax_hist.set_xlabel('Hours_Study')
ax_hist.set_ylabel('Count')

scatter = ax_scatter.scatter(df['Hours_Study'], df['Attendance'], c=df['Exam_Result'], s=20)
ax_scatter.set_title('Hours_Study vs Attendance (colored by Exam_Result)')
ax_scatter.set_xlabel('Hours_Study')
ax_scatter.set_ylabel('Attendance (%)')
# legend_elements() returns one handle/label pair per distinct colour value.
legend_handles, legend_labels = scatter.legend_elements()
ax_scatter.legend(legend_handles, legend_labels, title='Exam_Result', loc='best')

fig.tight_layout()
plt.show()

### 3) Preprocess — Train/Test Split + Standardize Numeric Features

In [None]:
# Every column except the label is treated as a model input.
target = 'Exam_Result'
feature_cols = [column for column in df.columns if column != target]
X = df[feature_cols].values.astype('float32')
y = df[target].values.astype('int32')

# Stratified 80/20 hold-out so both splits keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn mean/std on the training split only, then apply the same transform
# to both splits so no test-set statistics reach the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train).astype('float32')
X_test_scaled = scaler.transform(X_test).astype('float32')
print('Train shape:', X_train_scaled.shape, ' Test shape:', X_test_scaled.shape)

### 4) Modeling — Compact Keras ANN; Optimizers × Learning Rates (≤20 epochs)

In [None]:
def build_ann(input_dim: int, lr: float=1e-3, optimizer_name: str='adam'):
    """Build and compile a small feed-forward binary classifier.

    The network is Dense(16, relu) -> Dense(8, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss and an accuracy metric.

    Args:
        input_dim: Number of input features per sample.
        lr: Learning rate passed to the optimizer.
        optimizer_name: 'adam' or 'sgd' (case-insensitive). SGD is created
            with momentum 0.9.

    Returns:
        The compiled Keras model.

    Raises:
        ValueError: If ``optimizer_name`` is neither 'adam' nor 'sgd'.
    """
    # Validate before building anything: previously any unrecognised name
    # silently fell through to SGD, which mislabels the resulting metrics.
    optimizer_key = optimizer_name.lower()
    if optimizer_key not in ('adam', 'sgd'):
        raise ValueError(
            f"Unsupported optimizer_name {optimizer_name!r}; expected 'adam' or 'sgd'."
        )

    model = keras.Sequential([
        keras.layers.Input(shape=(input_dim,)),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(8, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    if optimizer_key == 'adam':
        opt = keras.optimizers.Adam(learning_rate=lr)
    else:
        opt = keras.optimizers.SGD(learning_rate=lr, momentum=0.9)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Grid from the brief: two optimizers x two learning rates, 20 epochs each.
configs = [('adam', 1e-3), ('adam', 1e-2), ('sgd', 1e-3), ('sgd', 1e-2)]
results = []
for opt_name, lr in configs:
    # Re-seed before every run so each configuration starts from the same RNG
    # state; otherwise differences between rows partly reflect whatever random
    # state the previous run left behind rather than the optimizer / lr.
    keras.utils.set_random_seed(42)
    model = build_ann(X_train_scaled.shape[1], lr=lr, optimizer_name=opt_name)
    h = model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=20, batch_size=32, verbose=0)

    # Threshold the sigmoid output at 0.5 to get hard 0/1 predictions.
    proba = model.predict(X_test_scaled, verbose=0).ravel()
    y_pred = (proba >= 0.5).astype(int)
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary', zero_division=0)
    # Pin the label order so the matrix is always 2x2, which is what the
    # plot below indexes.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    results.append({
        'optimizer': opt_name,
        'lr': lr,
        'val_accuracy': h.history['val_accuracy'][-1],
        'val_loss': h.history['val_loss'][-1],
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'cm': cm,
        'history': h.history,
    })

# Scalar metrics only; the confusion matrix and history stay in `results`.
metrics_df = pd.DataFrame([{k: v for k, v in r.items() if k not in ('cm', 'history')} for r in results])

# Choose the winning configuration on validation data (final-epoch accuracy,
# ties broken by lower validation loss). Ranking by test F1, as before, leaks
# the test set into model selection and makes the reported test metrics
# optimistic. Test metrics are still shown for every configuration.
ranked = metrics_df.sort_values(['val_accuracy', 'val_loss'], ascending=[False, True])
display(ranked)
best_idx = ranked.index[0]
best = results[int(best_idx)]
print(f"Best: optimizer={best['optimizer']}  lr={best['lr']}")
print('Test: ACC={:.3f}  PREC={:.3f}  REC={:.3f}  F1={:.3f}'.format(best['accuracy'],best['precision'],best['recall'],best['f1']))

# Confusion matrix of the selected configuration on the test split.
plt.figure(figsize=(4,4))
plt.imshow(best['cm'], interpolation='nearest')
plt.title('Confusion Matrix (Best Config)')
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, ['0','1']); plt.yticks(tick_marks, ['0','1'])
# Rows are true labels and columns are predictions, so text goes at (x=col, y=row).
for (i, j), count in np.ndenumerate(best['cm']):
    plt.text(j, i, int(count), ha='center', va='center')
plt.ylabel('True label'); plt.xlabel('Predicted label')
plt.tight_layout(); plt.show()

### 5) Conclusion (3–5 lines)

In [None]:
# Summarise the selected run: its test metrics plus a rough overfitting check
# based on the final-epoch gap between training and validation accuracy.
best_history = best['history']
train_acc_last = best_history['accuracy'][-1]
val_acc_last = best_history['val_accuracy'][-1]
gap = train_acc_last - val_acc_last

# A train-over-validation gap above 0.03 is flagged as possible overfitting.
if gap > 0.03:
    fit_comment = 'Slight overfitting suspected.'
else:
    fit_comment = 'No strong signs of overfitting.'

summary_parts = [
    f"Best config: {best['optimizer'].upper()} @ lr={best['lr']}. ",
    f"Test ACC={best['accuracy']:.3f}, PREC={best['precision']:.3f}, REC={best['recall']:.3f}, F1={best['f1']:.3f}. ",
    f"Val acc {val_acc_last:.3f} vs train {train_acc_last:.3f}. {fit_comment}",
]
print(''.join(summary_parts))