<a href="https://colab.research.google.com/github/JohenPerera123/Real_World_Project_-for_REPORT/blob/main/GPA_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
file = "/content/drive/MyDrive/student_lifestyle_dataset.csv"

In [None]:
df = pd.read_csv(file)
df.head()

In [None]:
df.info()

In [None]:
#get the descriptive statistics of the dataset
df.describe()

In [None]:
#check get the the null value count
df.isnull().sum()

In [None]:
#check duplicate values and get count
df.duplicated().sum()

#Detect ouliers


In [None]:
#calculate the basic measures and detect outliers of each column
for col in df.columns:

    # Skip the Stress_Level column
    if col == "Stress_Level":
        continue

    print("Column name:", col)

    # calculate mean, median, Q1, Q3, IQR, limits
    mean = df[col].mean()
    median = df[col].median()
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR

    # Detect Outliers
    outliers = df[(df[col] < lower_limit) | (df[col] > upper_limit)]

    print(f"Mean: {mean}")
    print(f"Median: {median}")
    print(f"Q1 (25%): {Q1}")
    print(f"Q3 (75%): {Q3}")
    print(f"IQR (Q3 - Q1): {IQR}")
    print(f"Lower Limit: {lower_limit}")
    print(f"Upper Limit: {upper_limit}")
    print(f"Number of Outliers: {len(outliers)}")
    print("\n")

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Columns to plot (skip Stress_Level)
cols_to_plot = [col for col in df.columns if col != "Stress_Level"]

n = len(cols_to_plot)  # number of plots
rows = (n + 2) // 3    # dynamic row count (3 plots per row)

plt.figure(figsize=(15, 5 * rows))

for i, col in enumerate(cols_to_plot, 1):
    plt.subplot(rows, 3, i)
    sns.boxplot(y=df[col])
    plt.title(col)

plt.tight_layout()
plt.show()



In [None]:
# List of columns except Stress_Level
cols_to_check = [col for col in df.columns if col != "Stress_Level"]

# Start with full dataset
clean_df = df.copy()

for col in cols_to_check:
    Q1 = clean_df[col].quantile(0.25)
    Q3 = clean_df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Keep only rows that are NOT outliers
    clean_df = clean_df[(clean_df[col] >= lower) & (clean_df[col] <= upper)]

# Final cleaned dataset
clean_df.reset_index(drop=True, inplace=True)

clean_df


In [None]:
df = clean_df
df.head()

#Normalize or Standardize Features


Standardization (Z-score)

- z=(x‚àíŒº‚Äã)/œÉ
- ¬µ = mean
- œÉ = standard deviation

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Select numeric columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Remove GPA from scaling
cols_to_scale = [col for col in numeric_cols if col != "GPA"]

df_standardized = df.copy()

# Apply StandardScaler only on selected columns
df_standardized[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

df_standardized.head()


Normalization (0‚Äì1)

- x‚Ä≤=(max‚àímin)/(x‚àímin‚Äã)

In [None]:
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()

numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Remove GPA from scaling
cols_to_scale = [col for col in numeric_cols if col != "GPA"]

df_normalized = df.copy()
df_standardized[cols_to_scale] = normalizer.fit_transform(df[cols_to_scale])


df_normalized.head()

###Mapping strees level

In [None]:
stress_level_mapping = {'Low': 1, 'Moderate': 2, 'High': 3}

df_normalized['Stress_Level'] = df_normalized['Stress_Level'].map(stress_level_mapping)

df_normalized.head()

In [None]:
df=df_normalized
df = df.drop("Student_ID", axis=1)
df.head()

##Final Cleaned Dataset

- No missing values

- No duplicates

- Outliers handled

- All features scaled

#Analytics Framework


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

##New features
- df["study_stress_ratio"] = df["Study_Hours_Per_Day"] / (df["Stress_Level"] + 1)
- df['Total_Activity_Hours'] = df['Extracurricular_Hours_Per_Day'] + df['Social_Hours_Per_Day'] + df['Physical_Activity_Hours_Per_Day']
- df['Study_to_Sleep_Ratio'] = df['Study_Hours_Per_Day'] / df['Sleep_Hours_Per_Day']

In [None]:
df["study_stress_ratio"] = df["Study_Hours_Per_Day"] / (df["Stress_Level"] + 1)
df['Total_Activity_Hours'] = df['Extracurricular_Hours_Per_Day'] + df['Social_Hours_Per_Day'] + df['Physical_Activity_Hours_Per_Day']
df['Study_to_Sleep_Ratio'] = df['Study_Hours_Per_Day'] / df['Sleep_Hours_Per_Day']
df.head()

In [None]:
# correlation heatmap
plt.figure(figsize=(10,6))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.show()

##Define Features (X) and Target (y)



In [None]:
X = df[['Study_Hours_Per_Day',	'Extracurricular_Hours_Per_Day',	'Sleep_Hours_Per_Day',	'Social_Hours_Per_Day',	'Physical_Activity_Hours_Per_Day', 'Stress_Level', 'study_stress_ratio',	'Total_Activity_Hours',	'Study_to_Sleep_Ratio']]

y = df['GPA']

#Split Train‚ÄìTest Data


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(y_test)


#Scale the Features


In [None]:
scaler = StandardScaler()

# Fit on training data only
X_train_scaled = scaler.fit_transform(X_train)
# Transform test data using the already learned parameters
X_test_scaled = scaler.transform(X_test)


##Train Multiple Models


###Model 1: Linear Regression



In [None]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train_scaled, y_train)
y_pred_linear=linear.predict(X_test_scaled)

###Model 2: Ridge Regression



In [None]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge=ridge.predict(X_test_scaled)

###Model 3: Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso=lasso.predict(X_test_scaled)

###Model 4: Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=4)  # tune depth to show overfitting
dt.fit(X_train, y_train)
y_pred_dt=dt.predict(X_test_scaled)

###Model 5: Random Forest Regressor



In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

###Model 6: Gradient Boosting Regressor



In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train_scaled, y_train)
y_pred_gbr = gbr.predict(X_test_scaled)

#Evaluate All Models


In [None]:
def evaluate(model, name):
    y_pred_train = model.predict(X_train_scaled if 'Tree' not in name else X_train)
    y_pred_test = model.predict(X_test_scaled if 'Tree' not in name else X_test)

    return {
        'Model': name,
        'Train R¬≤': r2_score(y_train, y_pred_train),
        'Test R¬≤': r2_score(y_test, y_pred_test),
        'Train MSE': mean_squared_error(y_train, y_pred_train),
        'Test MSE': mean_squared_error(y_test, y_pred_test)
    }

results = []
results.append(evaluate(linear, 'Linear Regression'))
results.append(evaluate(ridge, 'Ridge Regression'))
results.append(evaluate(lasso, 'Lasso Regression'))
results.append(evaluate(dt, 'Decision Tree Regressor'))
results.append(evaluate(rf, 'Random Forest Regressor'))
results.append(evaluate(gbr, 'Gradient Boosting Regressor'))

model_results = pd.DataFrame(results)
model_results

#Interpretation: Checking Overfitting & Underfitting

##Overfitting

When a model learns the training data too well, including noise and minor fluctuations, so it performs poorly on new/unseen data.

Train performance >> Test performance (large gap between Train R¬≤ and Test R¬≤)

Train error is very low, but test error is high.

##Underfitting

When a model is too simple to capture the underlying patterns in the data, so it performs poorly on both training and test data.

Train performance is low, and Test performance is also low

Train error and test error are both high.

Sometimes Test R¬≤ may be slightly higher than Train R¬≤ if the model is unstable.

#Visualization


##Actual vs Predicted plots



In [None]:
# Dictionary of model predictions
models = {
    "Linear Regression": y_pred_linear,
    "Ridge Regression": y_pred_ridge,
    "Lasso Regression": y_pred_lasso,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "Gradient Boosting": y_pred_gbr
}

x = range(len(y_test))

# Number of models
num_models = len(models)

# Create subplots
plt.figure(figsize=(15, 20))

for i, (model_name, preds) in enumerate(models.items(), 1):
    plt.subplot(3, 2, i)  # 3 rows, 2 columns
    plt.plot(x, y_test, label='Actual', linewidth=2)
    plt.plot(x, preds, label='Predicted', linestyle='--')
    plt.title(f"Actual vs Predicted - {model_name}")
    plt.xlabel("Sample Index")
    plt.ylabel("GPA")
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

##Observations

1. Linear, Ridge, Lasso Regression

- Train and test R¬≤ are similar (~0.53‚Äì0.54), indicating no overfitting.

- Test MSE is slightly higher than train MSE, but overall performance is moderate.

2. Decision Tree Regressor

- Train R¬≤ = 0.558, Test R¬≤ = 0.497 ‚Üí slight overfitting.

- Test MSE is higher than linear models, suggesting less generalization.

3. Random Forest Regressor

- Train R¬≤ = 0.930, Test R¬≤ = 0.445 ‚Üí strong overfitting.

- Very low train MSE (0.006) but high test MSE (0.051), indicating poor generalization.

4. Gradient Boosting Regressor

- Train R¬≤ = 0.646, Test R¬≤ = 0.498 ‚Üí mild overfitting.

- Performs slightly better than Decision Tree on test set but worse than linear models in R¬≤ stability.

##Key Takeaways

- Linear models (Linear, Ridge, Lasso) give stable and consistent results, even if moderate.

- Ensemble models like Random Forest overfit the data severely due to small sample size or noisy features.

- Gradient Boosting improves slightly over single Decision Tree but still overfits.

- For this dataset, simple linear models may generalize better, while more complex models require feature engineering or regularization.

##Prediction Error Plots



In [None]:
models = {
    "Linear Regression": y_pred_linear,
    "Ridge Regression": y_pred_ridge,
    "Lasso Regression": y_pred_lasso,
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "Gradient Boosting": y_pred_gbr
}



plt.figure(figsize=(15, 20))

for i, (name, preds) in enumerate(models.items(), 1):
    plt.subplot(3, 2, i)
    plt.scatter(y_test, preds, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()],
             [y_test.min(), y_test.max()],
             color='red', linewidth=2)
    plt.title(f"Prediction Error Plot - {name}")
    plt.xlabel("Actual GPA")
    plt.ylabel("Predicted GPA")
    plt.grid(True)

plt.tight_layout()
plt.show()

| Model             | Expectation on Scatter Plot                                                                                        |
| ----------------- | ------------------------------------------------------------------------------------------------------------------ |
| Linear Regression | Moderate scatter, points relatively close to line (consistent with Test R¬≤ ~0.53)                                  |
| Ridge / Lasso     | Very similar to Linear Regression (regularized versions)                                                           |
| Decision Tree     | Slightly more spread than linear models, overfitting visible as some points perfectly predicted and others far off |
| Random Forest     | Very tight on some train points, but large scatter on test points (overfitting)                                    |
| Gradient Boosting | Moderate scatter, slightly better than Decision Tree, but still spread wider than linear models                    |

##Summary

- Best alignment with red line: Linear, Ridge, Lasso ‚Üí best generalization.

- High variance: Random Forest ‚Üí overfitting, predictions deviate more from actual.

- Intermediate: Decision Tree and Gradient Boosting ‚Üí slightly overfit but better than Random Forest on test set.

##Error Distribution



In [None]:
plt.figure(figsize=(15, 20))

for i, (name, preds) in enumerate(models.items(), 1):
    plt.subplot(3, 2, i)
    residuals = y_test - preds
    sns.histplot(residuals, kde=True, bins=20)
    plt.title(f"Error Distribution - {name}")
    plt.xlabel("Error (Residual)")
    plt.ylabel("Frequency")
    plt.grid(True)

plt.tight_layout()
plt.show()

Observations from the Error Distribution Plots
1. Linear, Ridge, Lasso Regression

- Residuals are roughly centered around 0, forming a bell-shaped distribution.

- Spread is moderate, indicating consistent prediction errors.

- No extreme outliers in residuals ‚Üí models generalize well.

- This aligns with your Test R¬≤ (~0.53) and low Test MSE (~0.043).

2. Decision Tree

- Residuals are skewed to the right with a long tail, meaning some predictions are significantly overestimated.

- Some residuals are near 0 (perfect predictions for some points), showing overfitting to training data.

- Test performance is worse than linear models (Test R¬≤ ~0.497).

3. Random Forest

- Residuals are mostly centered around 0, but there are some long tails ‚Üí indicates overfitting on training patterns but struggles to generalize.

- Test R¬≤ is low (~0.445), showing poor generalization.

4. Gradient Boosting

- Residuals are centered near 0 with moderate spread.

- Slightly better distribution than Decision Tree, but still more variance than linear models.

- Test R¬≤ ~0.498 ‚Üí mild overfitting, better than Decision Tree but worse than Linear/Ridge/Lasso.

‚úÖ Key Takeaways

1. Linear, Ridge, and Lasso Regression

- Most stable, unbiased, and generalizes best.

- Residuals are symmetric, centered at 0, and moderate in spread.

2. Decision Tree and Gradient Boosting

- Show overfitting patterns: spikes at 0 and long tails.

3. Random Forest

- Severe overfitting with long tails in residuals ‚Üí poor test performance.

##Model Comparison Table



In [None]:
from sklearn.metrics import mean_absolute_error
metrics = {"Model": [], "R2": [], "MAE": [], "RMSE": []}

for name, preds in models.items():
    metrics["Model"].append(name)
    metrics["R2"].append(r2_score(y_test, preds))
    metrics["MAE"].append(mean_absolute_error(y_test, preds))
    metrics["RMSE"].append(np.sqrt(mean_squared_error(y_test, preds)))

df_metrics = pd.DataFrame(metrics)
df_metrics

##Model Comparison Bar Charts



R¬≤, MAE,RMSE Comparison

In [None]:
import matplotlib.pyplot as plt

# Model names
models = ["Linear Regression", "Ridge Regression", "Lasso Regression",
          "Decision Tree", "Random Forest", "Gradient Boosting"]

# Metrics
r2 = [0.532801, 0.533004, 0.533649, -1.568823, 0.444994, 0.498066]
mae = [0.167725, 0.167702, 0.167046, 0.410984, 0.182283, 0.174521]
rmse = [0.207412, 0.207367, 0.207223, 0.486351, 0.226064, 0.214984]

# ----------------- R¬≤ Bar Chart -----------------
plt.figure(figsize=(10,6))
plt.bar(models, r2, color='skyblue')
plt.title("R¬≤ of Models")
plt.ylabel("R¬≤")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.show()

# ----------------- MAE Bar Chart -----------------
plt.figure(figsize=(10,6))
plt.bar(models, mae, color='salmon')
plt.title("MAE of Models")
plt.ylabel("Mean Absolute Error (MAE)")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.show()

# ----------------- RMSE Bar Chart -----------------
plt.figure(figsize=(10,6))
plt.bar(models, rmse, color='lightgreen')
plt.title("RMSE of Models")
plt.ylabel("Root Mean Squared Error (RMSE)")
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y')
plt.show()


1Ô∏è‚É£ R¬≤ (Coefficient of Determination)

- Measures how much of the variation in GPA is explained by the model.

- Closer to 1 ‚Üí better model, negative values ‚Üí worse than predicting the mean.
| Model             | R¬≤     | Comment                                                                   |
| ----------------- | ------ | ------------------------------------------------------------------------- |
| Linear Regression | 0.533  | Explains ~53% of GPA variation. Good and stable.                          |
| Ridge Regression  | 0.533  | Almost same as Linear. Regularization didn‚Äôt change much.                 |
| Lasso Regression  | 0.534  | Slightly better than Linear/Ridge. Best generalization.                   |
| Decision Tree     | -1.569 | Very poor. Model is overfitting badly.                                    |
| Random Forest     | 0.445  | Moderate. Some overfitting; doesn‚Äôt generalize well.                      |
| Gradient Boosting | 0.498  | Better than Random Forest and Decision Tree but worse than linear models. |

2Ô∏è‚É£ MAE (Mean Absolute Error)

- Measures average absolute difference between predicted and actual GPA.

- Lower MAE ‚Üí better predictions.
| Model             | MAE   | Comment                                                     |
| ----------------- | ----- | ----------------------------------------------------------- |
| Linear Regression | 0.168 | Small error; good predictions.                              |
| Ridge Regression  | 0.168 | Almost identical to Linear.                                 |
| Lasso Regression  | 0.167 | Slightly better; lowest MAE.                                |
| Decision Tree     | 0.411 | Very high error; predictions are unreliable.                |
| Random Forest     | 0.182 | Moderate error; overfitting may cause large deviations.     |
| Gradient Boosting | 0.175 | Better than Random Forest; still higher than linear models. |

3Ô∏è‚É£ RMSE (Root Mean Squared Error)

- Measures average magnitude of errors, giving more weight to large errors.

- Lower RMSE ‚Üí better model, sensitive to extreme predictions.
| Model             | RMSE  | Comment                                                        |
| ----------------- | ----- | -------------------------------------------------------------- |
| Linear Regression | 0.207 | Good overall; consistent.                                      |
| Ridge Regression  | 0.207 | Very similar to Linear.                                        |
| Lasso Regression  | 0.207 | Slightly better than Linear; fewest large errors.              |
| Decision Tree     | 0.486 | Very high; some predictions are far from actual.               |
| Random Forest     | 0.226 | Moderate; better than Decision Tree, worse than linear models. |
| Gradient Boosting | 0.215 | Slightly better than Random Forest; still higher than Lasso.   |


1. Best Model: Lasso Regression

Best R¬≤, lowest MAE, and RMSE.

Stable predictions, generalizes well.

2. Linear & Ridge Regression

Close to Lasso. Slightly worse in MAE/RMSE.

3. Ensemble Models (Random Forest, Gradient Boosting)

Overfit on training data ‚Üí worse performance on test data.

Could improve with hyperparameter tuning.

4. Decision Tree

Worst overall; very unstable and overfits easily.

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_model(model, name, X_train, X_test, y_train, y_test, X_train_scaled=None, X_test_scaled=None):
    """
    Evaluate regression model performance and detect overfitting/underfitting.

    Parameters:
    - model: trained model
    - name: model name (string)
    - X_train, X_test, y_train, y_test: training and test data
    - X_train_scaled, X_test_scaled: optional scaled data for linear models

    Returns:
    - dict with R¬≤, MSE, and generalization note
    """

    # Use scaled data for non-tree models, raw data for tree-based models
    if 'Tree' in name or 'Forest' in name or 'Boosting' in name:
        X_tr, X_te = X_train, X_test
    else:
        X_tr, X_te = X_train_scaled, X_test_scaled

    # Predictions
    y_pred_train = model.predict(X_tr)
    y_pred_test = model.predict(X_te)

    # Metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)

    # Check generalization
    diff = train_r2 - test_r2
    if diff > 0.05:
        note = "Overfitting (Train >> Test)"
    elif diff < -0.05:
        note = "Underfitting (Test > Train)"
    else:
        note = "Good generalization"

    return {
        'Model': name,
        'Train R¬≤': round(train_r2, 3),
        'Test R¬≤': round(test_r2, 3),
        'Train MSE': round(train_mse, 4),
        'Test MSE': round(test_mse, 4),
        'Notes': note
    }

# Evaluate all models
results = [
    evaluate_model(linear, 'Linear Regression', X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled),
    evaluate_model(ridge, 'Ridge Regression', X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled),
    evaluate_model(lasso, 'Lasso Regression', X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled),
    evaluate_model(dt, 'Decision Tree Regressor', X_train, X_test, y_train, y_test),
    evaluate_model(rf, 'Random Forest Regressor', X_train, X_test, y_train, y_test),
    evaluate_model(gbr, 'Gradient Boosting Regressor', X_train, X_test, y_train, y_test)
]

# Convert to DataFrame
model_results = pd.DataFrame(results)
model_results


##Key Metrics Explained

###R¬≤ (Coefficient of Determination)

- Measures how well the model explains the variance in the data.

- Closer to 1 ‚Üí better. Closer to 0 ‚Üí poor fit. Negative ‚Üí model is worse than predicting the mean.

###MSE (Mean Squared Error)

- Measures the average squared difference between predicted and actual values.

- Smaller ‚Üí better.

Notes

- Indicates whether the model is overfitting, underfitting, or generalizing well.

##Model-by-Model Summary

###Linear, Ridge, Lasso Regression

- R¬≤: ~0.53 ‚Üí explains about 53% of the variation in GPA.

- MSE: ~0.04 ‚Üí predictions are reasonably close to actual values.

Notes: "Good generalization" ‚Üí Train & Test R¬≤ are close, so no overfitting or underfitting.

Interpretation: These linear models are stable, reliable, and simple, giving consistent results.

### Decision Tree Regressor

- Train R¬≤ = 0.558 (slightly higher than linear models)

- Test R¬≤ = 0.497 (drop from train ‚Üí small overfitting)

- Train MSE < Test MSE ‚Üí small overfitting.

Notes: "Overfitting (Train >> Test)" ‚Üí tree memorized some patterns in training but doesn‚Äôt generalize perfectly.

Interpretation: Single decision trees are unstable and can overfit small datasets.

### Random Forest Regressor

- Train & Test R¬≤ are negative ‚Üí predicts worse than simply taking the mean GPA.

- MSE is high ‚Üí very poor performance.

Notes: "Underfitting" ‚Üí model is too conservative; maybe hyperparameters need tuning or dataset is small.

### Gradient Boosting Regressor

- Train & Test R¬≤ are negative but slightly better than Random Forest.

- Slight underfitting ‚Üí model not capturing the patterns well.

Could improve with learning rate adjustment, more estimators, or max_depth tuning.

##Overall Takeaways

### - Best models: Lasso Regression, Linear Regression, Ridge Regression ‚Üí good balance between Train/Test performance.

###Tree-based models (Decision Tree, Random Forest, Gradient Boosting)

- Decision Tree: slight overfitting.

- Random Forest & Gradient Boosting: underfitting (poor R¬≤).

Reason: Dataset may be small or features are mostly linear ‚Üí linear models perform better.

#Interpretability & Evaluation


In [None]:
# Define the best model as Ridge Regression
best_model = Ridge(alpha=0.2, random_state=42)  # you can adjust alpha

# Fit the model
best_model.fit(X_train_scaled, y_train)

#FEATURE IMPORTANCE (FOR BEST MODEL)


In [None]:
coefficients = best_model.coef_
features = X_train.columns

# Create a DataFrame
feat_df = pd.DataFrame({'Feature': features, 'Importance': np.abs(coefficients)})
feat_df = feat_df.sort_values(by='Importance', ascending=False)
print(feat_df)

# Plot
plt.figure(figsize=(8,6))
sns.barplot(x='Importance', y='Feature', data=feat_df, palette='viridis')
plt.title("Feature Importance in Lasso Regression")
plt.show()

# Ridge Regression Feature Importance Analysis

## Model Overview

You trained a **Ridge Regression** model with:

- `alpha = 0.2` (regularization strength)
- Features scaled before fitting
- Target variable: [your target variable, e.g., GPA]

Ridge Regression is a **linear model** that penalizes large coefficients using **L2 regularization**. The `alpha` value controls how strong this penalty is:

- **Higher alpha** ‚Üí stronger penalty ‚Üí coefficients shrink toward zero
- **Lower alpha** ‚Üí weaker penalty ‚Üí coefficients closer to ordinary linear regression

With `alpha = 0.2`, the penalty is moderate, allowing coefficients to reflect feature importance without being overly shrunk.

---

## Feature Importance Table

| Feature                        | Importance |
|--------------------------------|-----------|
| Study_Hours_Per_Day             | 0.135783  |
| Stress_Level                    | 0.094967  |
| Sleep_Hours_Per_Day             | 0.075136  |
| study_stress_ratio              | 0.063824  |
| Study_to_Sleep_Ratio            | 0.043547  |
| Total_Activity_Hours            | 0.040677  |
| Extracurricular_Hours_Per_Day   | 0.018979  |
| Physical_Activity_Hours_Per_Day | 0.017418  |
| Social_Hours_Per_Day            | 0.011150  |

**Explanation of Columns:**

- **Feature**: Name of the input variable.
- **Importance**: Magnitude of the Ridge Regression coefficient (absolute value).  
  - Ridge coefficients can be positive or negative.  
  - Using `np.abs()` allows us to compare their **relative influence** on the target.  
  - Larger values indicate the feature has a stronger effect on the target variable.

---

## Insights from the Data

1. **Most important features:**
   - `Study_Hours_Per_Day` (0.1358)
   - `Stress_Level` (0.0950)
   - `Sleep_Hours_Per_Day` (0.0751)

   These are the top predictors affecting the target variable.

2. **Moderate importance:**
   - `study_stress_ratio` and `Study_to_Sleep_Ratio`  
     These derived features still contribute meaningfully.

3. **Least important features:**
   - `Extracurricular_Hours_Per_Day`, `Physical_Activity_Hours_Per_Day`, `Social_Hours_Per_Day`  
     These have minimal influence compared to core academic or sleep/stress-related features.

---

## Notes on Alpha (0.2)

- With `alpha = 0.2`, coefficients are slightly shrunk compared to normal linear regression.
- If you increase alpha further, some coefficients could shrink more, reducing their importance.
- If you decrease alpha toward 0, the model behaves like ordinary linear regression (no shrinkage).

---

## Visualization

The bar plot shows **absolute coefficient values** to compare relative influence:


#SHAP EXPLANATIONS (GLOBAL + INDIVIDUAL)



In [None]:
import shap

# Create explainer for Ridge Regression
explainer = shap.LinearExplainer(best_model, X_train_scaled, feature_perturbation="correlation_dependent")
shap_values = explainer.shap_values(X_test)

# Summary plot (global importance)
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)


# üìà SHAP Feature Importance Summary for Regression Model

This summary is based on a SHAP (SHapley Additive exPlanations) summary plot, which illustrates the **global impact** and direction of each feature on the model's output (prediction). The model is a **Ridge Regression** as indicated by the use of `shap.LinearExplainer`.

## Key Findings

| Rank | Feature Name | Overall Impact (Magnitude) | Direction of Impact |
| :---: | :--- | :--- | :--- |
| **1** | **Study_Hours_Per_Day** | **Highest** | Primarily drives the model output **higher** (positive SHAP values). |
| 2 | **Sleep_Hours_Per_Day** | High | Primarily drives the model output **lower** (negative SHAP values). |
| 3 | **Total_Activity_Hours** | High | Primarily drives the model output **lower** (negative SHAP values). |
| 4 | **Stress_Level** | Moderate | Primarily drives the model output **higher** (positive SHAP values). |
| 5 | **Physical_Activity_Hours_Per_Day** | Moderate | Primarily drives the model output **lower** (negative SHAP values). |

---

## Detailed Feature Analysis

The plot shows the distribution of SHAP values for each feature.

* **X-axis (SHAP value):** Represents the feature's contribution to the model's prediction. A **positive SHAP value** means the feature pushes the prediction *higher*, and a **negative SHAP value** means it pushes the prediction *lower*.
* **Color (Feature value):** Indicates the actual value of the feature, with **Red** representing a *high* feature value and **Blue** representing a *low* feature value.

### ü•á Features with the Strongest Impact

The top three features exhibit the largest magnitude of SHAP values, indicating they have the **greatest influence** on the model's prediction.

* **Study_Hours_Per_Day (Dominant Driver):**
    * This is the **most influential feature**, with SHAP values extending significantly into the positive range (up to $\approx +3$).
    * **High** values (red dots) of `Study_Hours_Per_Day` consistently result in **large positive SHAP values**, meaning *more study hours lead to a higher predicted outcome*.
* **Sleep_Hours_Per_Day:**
    * The second most influential feature, with SHAP values centered around $\approx -1$.
    * **High** values (red dots) are associated with **negative SHAP values**, suggesting that *more sleep hours lead to a lower predicted outcome*.
* **Total_Activity_Hours:**
    * Similar in impact to `Sleep_Hours_Per_Day`, with most SHAP values in the negative range.
    * **High** values (red dots) of activity hours correspond to **negative SHAP values**, implying *a higher total activity level is associated with a lower predicted outcome*.

### ü•à Features with Moderate Impact

These features have SHAP values closer to zero, indicating a smaller, but still noticeable, contribution to the prediction.

* **Stress_Level:**
    * Pushes the prediction primarily **higher** (positive SHAP values).
    * **High** stress levels (red dots) appear to contribute to the **higher** positive SHAP values.
* **Physical_Activity_Hours_Per_Day:**
    * Pushes the prediction primarily **lower** (negative SHAP values), centered around $\approx -0.5$.
    * **High** hours of physical activity (red dots) generally correlate with **lower** predicted outcomes (negative SHAP values).
* **study\_stress\_ratio:**
    * Has a small positive impact on the model output.

### ü•â Features with Minimal Impact

The remaining features have SHAP values very close to zero, suggesting they are the **least important** in driving the model's prediction.

* `Extracurricular_Hours_Per_Day`
* `Social_Hours_Per_Day`
* `Study_to_Sleep_Ratio`

***

## Conclusion

The model is overwhelmingly dominated by **Study_Hours_Per_Day** as the main factor *increasing* the prediction, and **Sleep_Hours_Per_Day** and **Total_Activity_Hours** as the main factors *decreasing* the prediction. The effects of the other features are significantly minor by comparison.

##SHAP Force Plot (INDIVIDUAL EXPLANATION)



In [None]:
# Force plot for a single prediction (local explanation)
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X_test.iloc[0,:])

# Instance-Specific Contribution Analysis

This section breaks down the specific feature values that influenced the prediction for this single data point:

## üìà Pushing the Prediction Higher (Red)

These feature values were responsible for increasing the predicted outcome above the Base Value:

* **Study_Hours_Per_Day = 6.8**: This is the strongest positive factor, contributing the most to increasing the prediction.
* **Stress_Level = 2**: This moderate stress level is the second strongest positive contributor.
* **study_stress_ratio = 2.267**: Contributed a small push to the positive side.

## üìâ Pushing the Prediction Lower (Blue)

These feature values were responsible for decreasing the predicted outcome below the Base Value:

* **Sleep_Hours_Per_Day = 8.6**: This is the strongest negative factor, and its pull is significant enough to nearly negate the positive effect of the study hours.
* **Total_Activity_Hours = 8.6**: This is the second strongest negative factor, also strongly driving the prediction down.

## ‚úÖ Summary of the Single Prediction

For this particular instance, the large positive impact from 6.8 **Study_Hours_Per_Day** was almost perfectly balanced by the large negative impact from 8.6 **Sleep_Hours_Per_Day** and 8.6 **Total_Activity_Hours**. This balance results in a final predicted outcome ($\text{f(x)}$) that is very close to the overall average Base Value of 3.19.


#SHAP Summary Plot (GLOBAL EXPLANATION)

In [None]:
plt.figure(figsize=(12, 6))
shap.summary_plot(shap_values, X_test, plot_type='bar')

# SHAP Model Explanation Summary

This document summarizes the insights from the **SHAP Bar Summary Plot (global feature importance)** and the **SHAP Force Plot (local prediction explanation)** for regression model.

## üåç Global Feature Importance (SHAP Bar Summary Plot)

The bar plot illustrates the **average absolute impact** of each feature on the model's output across the entire dataset. The features are ranked from most important (top) to least important (bottom) based on the length of the bar, which corresponds to $\text{mean}(\vert \text{SHAP value} \vert)$.

### Top 3 Most Important Features

* **Study_Hours_Per_Day**: This is the most influential feature by a significant margin. Its impact on the model's prediction is the strongest overall.
* **Sleep_Hours_Per_Day**: This is the second most important feature, indicating a substantial average impact on the model's output.
* **Total_Activity_Hours**: This ranks third, showing a significant relationship with the predicted outcome, though its average impact is less than sleep hours.

### Features with Moderate Impact

These features have a noticeable, but smaller, average influence on the prediction:

* **Stress_Level**: The fourth most important feature.
* **Physical_Activity_Hours_Per_Day**: Ranks fifth.

### Features with Minimal Impact

The remaining features have the least average impact on the model's output, as their bars are the shortest:

* **study_stress_ratio**
* **Extracurricular_Hours_Per_Day**
* **Social_Hours_Per_Day**
* **Study_to_Sleep_Ratio**


#LIME EXPLAINABILITY (Local Interpretable Model-agnostic Explanations)

1. **Purpose:**
LIME is a tool to explain predictions of any machine learning model (like Random Forest, Ridge, Neural Networks), even if the model is a ‚Äúblack box.‚Äù

**Key idea**: Instead of trying to understand the whole model globally, LIME explains one prediction at a time (local explanation).

2.** How it works (simple analogy)**:

 - Imagine your model is a very complicated machine.

 - You want to know why it made a single prediction.

 - LIME creates a simple, interpretable model (like a linear model) that approximates the black-box model just around that one prediction.

 - It tells you which features pushed the prediction up or down.

3. **Steps LIME follows:**

 1.Take the instance you want to explain (one row of data).

 2.Generate similar data points around it (perturb the features slightly).

 3.Ask the black-box model to predict these new points.

 4.Fit a simple interpretable model (like a linear regression) on these perturbed points.

 5.Use the simple model to see which features were most important for this prediction.

4. **Key Points:**

- LIME is local, not global. It explains one prediction at a time.

- It works with any model: tree-based, neural network, or linear.

- Output is easy to interpret: it shows which features increased or decreased the prediction.


In [None]:
!pip install lime

In [None]:
import lime
import lime.lime_tabular

np.random.seed(42)   # Fix numpy randomness

explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train_scaled),
    feature_names=X_train.columns,
    mode='regression',
    random_state=42     # Fix LIME randomness
)

# Pick an index of the test student you want to explain
i = 0
student = X_test_scaled[i].reshape(1, -1)  # single sample

# Generate explanation
exp = explainer.explain_instance(
    data_row=student[0],
    predict_fn=best_model.predict,
    num_features=len(X_train.columns)
)

# Print feature contributions
exp.as_list()

## Feature Contributions (Local Explanation)

The LIME explainer provides a list of features with their contribution to the prediction for the selected student. Each tuple shows:

* The **feature condition** (value range or threshold)
* The **effect** (how much it pushed the prediction up or down)

### Output:

| Feature Condition                               | Contribution |
| ----------------------------------------------- | ------------ |
| Stress_Level <= -0.48                           | -0.1823      |
| Study_to_Sleep_Ratio <= -0.75                   | 0.0722       |
| -0.81 < Study_Hours_Per_Day <= -0.04            | -0.0645      |
| -0.01 < Sleep_Hours_Per_Day <= 0.89             | -0.0460      |
| Extracurricular_Hours_Per_Day > 0.86            | -0.0398      |
| -0.75 < Total_Activity_Hours <= 0.03            | 0.0163       |
| -0.51 < study_stress_ratio <= 0.00              | -0.0158      |
| Social_Hours_Per_Day <= -0.89                   | 0.0155       |
| -0.09 < Physical_Activity_Hours_Per_Day <= 0.72 | -0.0002      |

## Interpretation:

* **Negative contribution**: Pushes the predicted outcome **lower** compared to the average prediction.
* **Positive contribution**: Pushes the predicted outcome **higher**.

### Key Insights for This Student:

* **Stress_Level** had the largest negative impact (-0.1823), significantly lowering the predicted outcome.
* **Study_to_Sleep_Ratio** contributed positively (0.0722), pushing the prediction upward.
* **Study_Hours_Per_Day** and **Sleep_Hours_Per_Day** also slightly decreased the prediction.
* Other features had smaller effects and were nearly negligible.

‚úÖ **Summary:** LIME provides a **local explanation** showing which features drove this specific student's predicted value higher or lower relative to the model's expected output.


#CLASSIFICATION METRICS

Define Classes from GPA This code performs hyperparameter tuning for a Ridge classification model using GridSearchCV. GridSearchCV then tests many different ridge settings and finds which combination gives the highest accuracy. i want this according to my model


In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV

# Define GPA classes
bins = [0, 2.0, 3.0, 3.3, 3.7, 4.0]          # adjust thresholds as needed
labels = ['Fail', 'General', 'Second_lower','Second_upper','First_class']  # class names
df['GPA_Class'] = pd.cut(df['GPA'], bins=bins, labels=labels)

# Features and target
X = df.drop(['GPA', 'GPA_Class'], axis=1)
y = df['GPA_Class']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# RidgeClassifier GPA Classification Workflow

Setup for predicting GPA classes using RidgeClassifier.

## 1. Define GPA Classes

```python
bins = [0, 2.0, 3.0, 3.3, 3.7, 4.0]          # adjust thresholds as needed
labels = ['Fail', 'General', 'Second_lower','Second_upper','First_class']  # class names
  # Class labels
df['GPA_Class'] = pd.cut(df['GPA'], bins=bins, labels=labels)
```

* **Purpose:** Convert continuous GPA values into discrete classes.
* **Bins:** Define the ranges for each GPA class.
* **Labels:** Name each class for interpretability.

Example:

* GPA 2.0 ‚Üí 'Low'
* GPA 3.0 ‚Üí 'Medium'
* GPA 3.8 ‚Üí 'High'

## 2. Select Features and Target

```python
X = df.drop(['GPA', 'GPA_Class'], axis=1)
y = df['GPA_Class']
```

* **X (features):** All columns except GPA and the newly created GPA_Class.
* **y (target):** The GPA class we want to predict.

## 3. Train-Test Split

```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
```

* **Purpose:** Split the dataset into training and testing sets.
* **Test size:** 20% of the data is held out for evaluation.
* **Random state:** Ensures reproducibility of the split.

After this step:

* `X_train`, `y_train` ‚Üí used to train the model.
* `X_test`, `y_test` ‚Üí used to evaluate the model's performance.


In [None]:

param_grid = {'alpha': [0.01, 0.1, 0.2, 0.5, 1.0, 5.0]}

ridge = RidgeClassifier()
grid_search = GridSearchCV(ridge, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_ridge = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best cross-validated accuracy: ", grid_search.best_score_)



### Explanation

* **param_grid:** Defines the regularization strength `alpha` values to test.
* **GridSearchCV:** Performs 5-fold cross-validation to find the alpha with the highest accuracy.
* **best_ridge:** The RidgeClassifier with the optimal alpha.

**Output Interpretation:**

* `Best Parameters: {'alpha': 5.0}` ‚Üí The model performs best with a very small regularization.
* `Best cross-validated accuracy: 0.602` ‚Üí On average across folds, the model correctly classifies ~60.2% of students.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predict on the test set
y_pred = best_ridge.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Weighted precision, recall, F1-score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Classification Report
report = classification_report(y_test, y_pred)

# Print results
print("Confusion Matrix:\n", cm)
print("\nAccuracy:", accuracy)
print("Precision (weighted):", precision)
print("Recall (weighted):", recall)
print("F1-score (weighted):", f1)
print("\nClassification Report:\n", report)


# RidgeClassifier GPA Classification Results

After training a **RidgeClassifier** on GPA classes, we evaluate the model using several metrics.

---

## 1. Confusion Matrix

```text
Confusion Matrix:
 [[  0   0   0  10]
 [  0 119  18   4]
 [  0  61  39  43]
 [  0  12  20  73]]
```

* **Rows:** True GPA classes
* **Columns:** Predicted GPA classes
* **Interpretation:**

  * The first row (`First_class`) was mostly misclassified (all 10 samples predicted incorrectly).
  * `General` class has high correct predictions (119 out of 141).
  * `Second_lower` has lower accuracy, many misclassified as `General` or `Second_upper`.
  * `Second_upper` shows moderate performance (73 correctly predicted out of 105).

---

## 2. Accuracy

```text
Accuracy: 0.579
```

* **Definition:** Overall fraction of correctly predicted samples.
* **Interpretation:** About 58% of the GPA class predictions are correct.
* **Observation:** Moderate accuracy; the model struggles with minority classes (`First_class`).

---

## 3.Weighted Precision, Recall, and F1-score

```text
Precision (weighted): 0.548
Recall (weighted): 0.579
F1-score (weighted): 0.543
```

* **Precision (weighted):** Average of precision across classes, weighted by support (number of samples per class). Measures correctness of positive predictions.
* **Recall (weighted):** Average of recall across classes, weighted by support. Measures coverage of actual positive samples.
* **F1-score (weighted):** Weighted harmonic mean of precision and recall.

**Interpretation:**

* Precision and F1-score are slightly lower than accuracy, reflecting misclassification of some classes.
* The model performs better for `General` and `Second_upper` classes due to more samples.

---

## Classification Report

```text
Classification Report:
               precision    recall  f1-score   support

 First_class       0.00      0.00      0.00        10
     General       0.62      0.84      0.71       141
Second_lower       0.51      0.27      0.35       143
Second_upper       0.56      0.70      0.62       105

    accuracy                           0.58       399
   macro avg       0.42      0.45      0.42       399
weighted avg       0.55      0.58      0.54       399
```

* **Support:** Number of samples in each class.
* **Precision:** Correct predictions / total predicted for that class.
* **Recall:** Correct predictions / total true samples of that class.
* **F1-score:** Harmonic mean of precision and recall.

**Observations:**

* `First_class` is poorly predicted due to very few samples (class imbalance).
* `General` is the most accurately predicted class.
* `Second_lower` has lower recall, meaning many samples were misclassified.
* `Weighted avg` provides an overall summary considering class imbalance.

---

## Summary

* The RidgeClassifier achieves **moderate performance (accuracy ~58%)**.
* Model struggles with **minority classes** (`First_class`) due to class imbalance.
* Weighted metrics indicate better performance for larger classes (`General`, `Second_upper`).


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Use your actual test labels and predictions
confusion_m = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6,5))
sns.heatmap(confusion_m, annot=True, fmt='d', cmap='winter',
            xticklabels=['Fail', 'General', 'Second_lower','Second_upper','First_class'],  # make sure these match your classes
            yticklabels=['Fail', 'General', 'Second_lower','Second_upper','First_class'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
# Check class distribution in the test set
print(y_test.value_counts())


## Test Set Class Distribution

```text
GPA_Class
Second_lower    143
General         141
Second_upper    105
First_class      10
Fail              0
Name: count, dtype: int64
```

* **Purpose:** Check how the samples are distributed across GPA classes in the test set.
* **Observations:**

  * `Second_lower` and `General` are the most frequent classes.
  * `First_class` is the least represented with only 10 samples.
  * `Fail` class has no samples in the test set.
* **Implication:** The model may struggle to predict minority classes accurately due to imbalance.

##


#ROC Curve (Multiclass)


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Define classes in the same order as your labels
classes = ['Fail', 'General', 'Second_lower','Second_upper','First_class']

# Binarize the classes
y_test_bin = label_binarize(y_test, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)

# Plot ROC for each class
plt.figure(figsize=(8,6))
for i, class_name in enumerate(classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_bin[:, i])
    roc_auc = auc(fpr, tpr)
    print(f"Class {class_name} - AUC: {roc_auc:.2f}")
    plt.plot(fpr, tpr, label=f"{class_name} (AUC = {roc_auc:.2f})")

# Plot diagonal line for random guessing
plt.plot([0,1], [0,1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Multi-class GPA Classification")
plt.legend()
plt.show()


## Multi-class ROC and AUC

* ROC curves are plotted for each GPA class to evaluate classifier performance.
* **Classes and AUC values:**

  * `Fail` - AUC: NaN (no samples in test set)
  * `General` - AUC: 0.78
  * `Second_lower` - AUC: 0.56
  * `Second_upper` - AUC: 0.75
  * `First_class` - AUC: 0.50

**Explanation:**

* AUC (Area Under the Curve) measures the classifier's ability to distinguish each class from others.

* Higher AUC values indicate better performance.

* `General` and `Second_upper` are classified relatively well, while `Second_lower` and `First_class` show weak performance.

* `Fail` cannot be evaluated due to no test samples.

* **Visualization:** ROC curves plot True Positive Rate (TPR) vs False Positive Rate (FPR) for each class, showing model performance across thresholds.
