# Student Score Prediction: EDA and Modeling

This notebook demonstrates the complete workflow for predicting student exam scores:
1. Data loading and preview
2. Exploratory data analysis (EDA)
3. Data preparation (cleaning and train/test split)
4. Linear regression baseline
5. Polynomial regression with degree selection
6. Model comparison
7. What we learned

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Import our custom modules
import sys
sys.path.append('..')
from src.data import resolve_data_path, load_data, clean_data, split_data
from src.features import build_poly
from src.modeling import train_linear, compute_metrics, cv_select_poly_degree
from src.plots import scatter_xy, pred_vs_actual, residuals
from src.config import DEFAULT_DATA_PATH, DEMO_DATA_PATH, DEFAULT_TARGET, DEFAULT_FEATURES

# Plot defaults: stock matplotlib style, then a shared figure size and resolution
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 100,
})

## 1. Data Loading and Preview

In [None]:
# Choose the dataset file from the default and demo locations.
# NOTE(review): presumably creates/falls back to the demo file when the default
# is missing — confirm in src.data.resolve_data_path.
data_path = resolve_data_path(DEFAULT_DATA_PATH, DEMO_DATA_PATH)
print(f"Using dataset: {data_path}")

# Read the dataset and report its dimensions
df = load_data(data_path)
n_rows, n_cols = df.shape[0], df.shape[1]
print(f"Dataset loaded successfully: {n_rows} rows, {n_cols} columns")

In [None]:
# Overview of the raw frame: schema, a preview of the first rows, summary statistics
rule = "=" * 40

print("Dataset Info:")
print(rule)
df.info()

# Each remaining section is a heading, a rule, then the printed table.
# The table is produced lazily so the output order matches a straight-line script.
for heading, make_table in (
    ("\nFirst few rows:", df.head),
    ("\nBasic statistics:", df.describe),
):
    print(heading)
    print(rule)
    print(make_table())

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Plot histograms of key variables.
# Only columns that actually exist in the frame get a panel. Panels are filled
# in order and every panel left over is hidden, so a missing column never
# leaves an empty set of axes in the middle of the grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

columns_to_plot = ['study_hours', 'sleep_hours', 'attendance', 'participation', 'final_score']
available_cols = [col for col in columns_to_plot if col in df.columns]

for panel, col in zip(axes, available_cols):
    panel.hist(df[col], bins=20, alpha=0.7, edgecolor='black')
    panel.set_title(f'{col.replace("_", " ").title()} Distribution')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')
    panel.grid(True, alpha=0.3)

# Hide every panel that did not receive a histogram (not just the last one)
for panel in axes[len(available_cols):]:
    panel.set_visible(False)

plt.tight_layout()
# y > 1 places the title above the panels; bbox_inches='tight' keeps it in the saved file
plt.suptitle('Variable Distributions', fontsize=16, y=1.02)
plt.savefig('../outputs/figures/notebook_histograms.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Scatter plot: Study Hours vs Final Score
plt.figure(figsize=(10, 6))
plt.scatter(df['study_hours'], df['final_score'], alpha=0.6, edgecolors='black', linewidth=0.5)
plt.xlabel('Study Hours')
plt.ylabel('Final Score')
plt.title('Study Hours vs Final Score')
plt.grid(True, alpha=0.3)

# Add trend line.
# This cell runs on the raw frame (clean_data() is only called later), and a
# least-squares fit cannot cope with NaNs, so fit only on rows where both
# values are present.
paired = df[['study_hours', 'final_score']].dropna()
z = np.polyfit(paired['study_hours'], paired['final_score'], 1)
p = np.poly1d(z)

# A straight line is fully described by its two end points
x_line = np.array([paired['study_hours'].min(), paired['study_hours'].max()])
plt.plot(x_line, p(x_line), "r--", alpha=0.8, linewidth=2, label='Trend Line')
plt.legend()
plt.savefig('../outputs/figures/notebook_scatter.png', dpi=150, bbox_inches='tight')
plt.show()

# Calculate correlation (Series.corr already ignores rows with missing values)
correlation = df['study_hours'].corr(df['final_score'])
print(f"Correlation between Study Hours and Final Score: {correlation:.3f}")

In [None]:
# Correlation matrix (numerical only)
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()

print("Correlation Matrix:")
print("=" * 50)
print(correlation_matrix.round(3))

# Rank the other numeric columns by how strongly they correlate with final_score.
# The ranking uses |r|, but the signed coefficient is what gets printed, so a
# negative relationship is not mistaken for a positive one.
if 'final_score' in correlation_matrix.columns:
    target_corr = correlation_matrix['final_score'].drop('final_score')
    ranking = target_corr.abs().sort_values(ascending=False).index
    correlations = target_corr.loc[ranking]
    print(f"\nStrongest correlations with Final Score:")
    print("=" * 50)
    for feature, corr in correlations.items():
        print(f"{feature}: {corr:+.3f}")

## 3. Data Preparation

In [None]:
# Modelling inputs: a single predictor to begin with, and the column to predict
target = 'final_score'
features = ['study_hours']

print(f"Using features: {features}")
print(f"Target variable: {target}")

# Clean the raw frame, then hold out 20% of it as a test split (fixed seed)
df_clean = clean_data(df, target, features)
split_parts = split_data(df_clean, features, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

## 4. Linear Regression Baseline

In [None]:
# Fit the baseline on the training split and score it on the test split
print("Training Linear Regression Model...")
print("=" * 40)

linear_model = train_linear(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
metrics_linear = compute_metrics(y_test, y_pred_linear)

print("Linear Regression Results:")
for name, score in metrics_linear.items():
    print(f"{name.upper()}: {score:.4f}")

# Report the fitted parameters, pairing each coefficient with its feature by position
print("\nModel coefficients:")
print(f"Intercept: {linear_model.intercept_:.4f}")
for idx in range(len(linear_model.coef_)):
    print(f"{features[idx]}: {linear_model.coef_[idx]:.4f}")

In [None]:
# Diagnostic plots for the linear model: predicted-vs-actual (left), residuals (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
marker_style = dict(alpha=0.6, edgecolors='black', linewidth=0.5)

# Left panel: points lying on the dashed diagonal are predicted exactly
ax1.scatter(y_test, y_pred_linear, **marker_style)
min_val = min(y_test.min(), y_pred_linear.min())
max_val = max(y_test.max(), y_pred_linear.max())
diagonal = [min_val, max_val]
ax1.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')
ax1.set(
    xlabel='Actual Values',
    ylabel='Predicted Values',
    title='Linear Regression: Predictions vs Actual',
)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: residual = actual - predicted, plotted against the prediction
residuals_linear = y_test - y_pred_linear
ax2.scatter(y_pred_linear, residuals_linear, **marker_style)
ax2.axhline(y=0, color='r', linestyle='--', lw=2)
ax2.set(
    xlabel='Predicted Values',
    ylabel='Residuals',
    title='Linear Regression: Residual Plot',
)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/notebook_linear_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Polynomial Regression

In [None]:
# Select best polynomial degree using cross-validation
print("Selecting optimal polynomial degree...")
print("=" * 40)

# Run the search on the TRAINING split only. The test split is used further
# down to score the chosen model, so letting it influence the choice of degree
# would leak information and make the reported test metrics optimistic.
# np.asarray gives the helper plain arrays, as the previous stacked input did.
X_cv = np.asarray(X_train)
y_cv = np.asarray(y_train)

cv_result = cv_select_poly_degree(X_cv, y_cv, degrees=[2, 3, 4, 5], k=5, random_state=42)
best_degree = cv_result['best_degree']

print(f"Cross-validation results:")
for degree, result in cv_result['cv_results'].items():
    print(f"Degree {degree}: RMSE = {result['mean_rmse']:.4f} ± {result['std_rmse']:.4f}")

print(f"\nBest degree selected: {best_degree}")

In [None]:
# Refit using the degree chosen above and score on the test split
print(f"\nTraining Polynomial Regression (degree={best_degree})...")
print("=" * 50)

# Expand both splits with the same degree (train first, then test)
X_train_poly, X_test_poly = (build_poly(part, best_degree) for part in (X_train, X_test))

# Linear regression on the polynomial features
poly_model = train_linear(X_train_poly, y_train)
y_pred_poly = poly_model.predict(X_test_poly)
metrics_poly = compute_metrics(y_test, y_pred_poly)

print(f"Polynomial Regression Results (degree={best_degree}):")
for name, score in metrics_poly.items():
    print(f"{name.upper()}: {score:.4f}")

n_poly_features = X_train_poly.shape[1]
print(f"\nNumber of polynomial features: {n_poly_features}")

In [None]:
# Diagnostic plots for the polynomial model: predicted-vs-actual (left), residuals (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
marker_style = dict(alpha=0.6, edgecolors='black', linewidth=0.5, color='orange')

# Left panel: points lying on the dashed diagonal are predicted exactly
ax1.scatter(y_test, y_pred_poly, **marker_style)
min_val = min(y_test.min(), y_pred_poly.min())
max_val = max(y_test.max(), y_pred_poly.max())
diagonal = [min_val, max_val]
ax1.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Prediction')
ax1.set(
    xlabel='Actual Values',
    ylabel='Predicted Values',
    title=f'Polynomial Regression (deg={best_degree}): Predictions vs Actual',
)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: residual = actual - predicted, plotted against the prediction
residuals_poly = y_test - y_pred_poly
ax2.scatter(y_pred_poly, residuals_poly, **marker_style)
ax2.axhline(y=0, color='r', linestyle='--', lw=2)
ax2.set(
    xlabel='Predicted Values',
    ylabel='Residuals',
    title=f'Polynomial Regression (deg={best_degree}): Residual Plot',
)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/notebook_poly_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Model Comparison

In [None]:
# Compare metrics between linear and polynomial models
print("Model Comparison:")
print("=" * 50)

metrics_df = pd.DataFrame({
    'Linear': [metrics_linear['mae'], metrics_linear['mse'], metrics_linear['rmse'], metrics_linear['r2']],
    'Polynomial': [metrics_poly['mae'], metrics_poly['mse'], metrics_poly['rmse'], metrics_poly['r2']]
}, index=['MAE', 'MSE', 'RMSE', 'R²'])

print(metrics_df.round(4))


def _percent_of_baseline(delta, baseline):
    """Express ``delta`` as a percentage of ``|baseline|``.

    Dividing by the magnitude keeps the sign of the result equal to the sign of
    ``delta`` even when the baseline is negative, which can happen for R².
    Returns NaN when the baseline is zero, where a relative change is undefined.
    """
    if baseline == 0:
        return float('nan')
    return delta / abs(baseline) * 100


# Calculate improvement; a positive number always means the polynomial model is better
improvement = {}
for metric in ['mae', 'mse', 'rmse']:
    # For error metrics, lower is better
    improvement[metric] = _percent_of_baseline(
        metrics_linear[metric] - metrics_poly[metric], metrics_linear[metric]
    )

# For R², higher is better
improvement['r2'] = _percent_of_baseline(
    metrics_poly['r2'] - metrics_linear['r2'], metrics_linear['r2']
)

print(f"\nImprovement from Linear to Polynomial:")
for metric, imp in improvement.items():
    direction = "reduction" if metric != 'r2' else "increase"
    print(f"{metric.upper()}: {imp:+.2f}% {direction}")

In [None]:
# Grouped bar chart: one pair of bars (linear, polynomial) per metric
metrics = ['MAE', 'MSE', 'RMSE', 'R²']
metric_keys = ('mae', 'mse', 'rmse', 'r2')
linear_values = [metrics_linear[key] for key in metric_keys]
poly_values = [metrics_poly[key] for key in metric_keys]

x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(12, 6))
# Shift each series half a bar-width off the tick so the pair sits side by side
bars1 = ax.bar(x - width/2, linear_values, width, label='Linear', alpha=0.8)
bars2 = ax.bar(x + width/2, poly_values, width, label='Polynomial', alpha=0.8)

ax.set(xlabel='Metrics', ylabel='Values', title='Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.grid(True, alpha=0.3)


def add_value_labels(bars):
    """Annotate each bar in ``bars`` with its height, formatted to 3 decimals.

    The label is anchored at the bar's height, centred horizontally on the bar
    and nudged 3 points upward. Draws on the module-level ``ax``.
    """
    for rect in bars:
        top = rect.get_height()
        centre = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f'{top:.3f}',
            xy=(centre, top),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=9,
        )


for bar_group in (bars1, bars2):
    add_value_labels(bar_group)

plt.tight_layout()
plt.savefig('../outputs/figures/notebook_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. What We Learned

### Key Findings:

1. **Linear Baseline**: The linear regression model provides a solid baseline for predicting student scores based on study hours.

2. **Polynomial Enhancement**: Adding polynomial features can capture non-linear relationships in the data, potentially improving prediction accuracy.

3. **Bias-Variance Tradeoff**: Higher polynomial degrees may improve training performance but risk overfitting. Cross-validation helps select the optimal degree.

4. **Feature Importance**: Study hours typically show the strongest correlation with final scores, but other factors like sleep, attendance, and participation can also be significant.

### Recommendations:

- **Model Selection**: Use cross-validation to select the optimal polynomial degree
- **Feature Engineering**: Consider interactions between features and domain-specific transformations
- **Data Collection**: Gather more diverse features that might influence student performance
- **Regularization**: For higher-degree polynomials, consider Ridge or Lasso regression to prevent overfitting

### Next Steps:

- Collect more data points to improve model robustness
- Experiment with other algorithms (Random Forest, Gradient Boosting, Neural Networks)
- Perform feature selection to identify the most important predictors
- Validate the model on completely unseen data