# GPA Data Cleaning, Exploration, and Machine Learning Pipeline
This notebook runs a complete data science pipeline: it loads the student GPA dataset, cleans it, engineers features, explores the data, and builds a machine learning model to predict student GPA.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os
import warnings
import kagglehub

# Suppress every warning raised after this point. No category or module filter
# is given, so this also hides deprecation notices from the libraries imported
# above.
warnings.filterwarnings("ignore")


## Helper Functions

In [None]:
def drop_unnecessary_columns(df):
    """Return a copy of ``df`` without columns whose label contains 'Unnamed'.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with every column whose label contains the substring
        ``'Unnamed'`` removed. All other columns are kept in their original
        order.
    """
    # Column labels are not guaranteed to be strings (e.g. integer labels), so
    # compare against their string form instead of raising TypeError on `in`.
    unnamed_columns = [col for col in df.columns if 'Unnamed' in str(col)]
    return df.drop(columns=unnamed_columns, errors='ignore')

def clean_missing_ids(df, id_column):
    """Keep only the rows of ``df`` that have a value in ``id_column``.

    Args:
        df: The DataFrame to filter.
        id_column: Label of the identifier column to check for missing values.

    Returns:
        The rows of ``df`` whose ``id_column`` entry is not null, with their
        original index labels.
    """
    id_is_missing = df[id_column].isna()
    return df.loc[~id_is_missing]

def clean_and_validate_gpa(df):
    """Keep only the rows whose GPA is a number in the range 0 to 4.0 inclusive.

    Args:
        df: DataFrame with a ``'GPA'`` column. It is not modified.

    Returns:
        A new DataFrame containing the rows whose GPA lies in ``[0, 4.0]``.
        The ``'GPA'`` column of the result is numeric. Rows whose GPA is
        missing or cannot be parsed as a number are dropped.

    Raises:
        KeyError: If ``df`` has no ``'GPA'`` column.
    """
    # A single non-numeric cell makes pandas read the whole column as text, and
    # Series.between then raises TypeError. Coerce first so unparseable values
    # become NaN, which `between` treats as out of range.
    gpa = pd.to_numeric(df['GPA'], errors='coerce')
    in_range = gpa.between(0, 4.0)
    return df.assign(GPA=gpa)[in_range]


## Load Dataset

In [None]:
# Fetch the "mohammadalazawi/student-gpa" dataset through kagglehub. The value
# it returns is used below as the directory that holds the dataset files.
path = kagglehub.dataset_download("mohammadalazawi/student-gpa")
print("Dataset files located at:", path)
# NOTE(review): assumes the dataset ships a file named exactly 'GPA.csv' at the
# top level of that directory - confirm against the downloaded contents.
file_path = os.path.join(path, 'GPA.csv')
# Load the raw table; every later cell reads and updates this `df`.
df = pd.read_csv(file_path)


## Data Cleaning

In [None]:
# Cleaning pass: strip 'Unnamed' columns, drop rows without a student ID (only
# when the dataset has that column), keep GPAs in the valid range, and finish
# with a fresh 0..n-1 index.
df = drop_unnecessary_columns(df)
if 'student_id' in df.columns:
    df = clean_missing_ids(df, 'student_id')
df = clean_and_validate_gpa(df)
df = df.reset_index(drop=True)


## Feature Engineering

In [None]:
# Derive the model features from the cleaned table. Each feature is created
# only when the column it depends on exists in the dataset.
if 'semester' in df.columns:
    df['semester'] = df['semester'].astype(str)
    # expand=False makes str.extract return a Series for the single capture
    # group, so this is a plain column assignment rather than assigning a
    # one-column DataFrame. Labels that contain no digits become NaN.
    df['semester_num'] = df['semester'].str.extract(r'(\d+)', expand=False).astype(float)

if 'student_id' in df.columns and 'GPA' in df.columns:
    # An expanding mean accumulates in row order, so put the rows in semester
    # order first (when that is known); otherwise the "cumulative" GPA depends
    # on how the file happens to be sorted. The stable sort keeps the original
    # order for rows that share a semester.
    if 'semester_num' in df.columns:
        ordered = df.sort_values('semester_num', kind='stable')
    else:
        ordered = df
    running_mean = ordered.groupby('student_id')['GPA'].expanding().mean()
    # Drop the student_id index level so the result carries df's own row labels
    # and the assignment aligns each value with the row it was computed for.
    df['cumulative_gpa'] = running_mean.reset_index(level=0, drop=True)


## Exploratory Data Analysis

In [None]:
# Apply seaborn's white-grid look to every figure drawn below.
sns.set(style="whitegrid")

# Figure 1: heatmap of the pairwise correlations between numeric columns.
corr_matrix = df.corr(numeric_only=True)
_, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation Matrix")
plt.show()

# Figure 2: histogram of GPA with a kernel-density overlay.
hist_ax = sns.histplot(df['GPA'], kde=True, bins=20)
hist_ax.set_title('Distribution of GPA')
hist_ax.set_xlabel('GPA')
hist_ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Figure 3: mean GPA per semester, drawn only when a numeric semester exists.
if 'semester_num' in df.columns:
    semester_means = df.groupby('semester_num')['GPA'].mean().reset_index()
    trend_ax = sns.lineplot(data=semester_means, x='semester_num', y='GPA')
    trend_ax.set_title('Average GPA by Semester')
    trend_ax.set_xlabel('Semester')
    trend_ax.set_ylabel('Average GPA')
    plt.tight_layout()
    plt.show()


## Machine Learning Model: GPA Prediction

In [None]:
# Train and evaluate a random forest that predicts each student's final-semester
# GPA from their record up to, but not including, that semester. One row per
# student is used; students with a single semester have no history and are
# excluded.
if 'semester_num' in df.columns and 'cumulative_gpa' in df.columns:
    model_df = df.dropna(subset=['GPA', 'semester_num', 'cumulative_gpa'])

    # Order each student's rows chronologically so that 'last' in the
    # aggregation below is the final semester, not whichever row came last in
    # the file.
    model_df = model_df.sort_values(['student_id', 'semester_num'], kind='stable')

    # The running mean in df includes the current semester, so its last value
    # contains the very GPA being predicted (target leakage). Rebuild the
    # feature here as the mean of the *earlier* semesters only. The first
    # semester of each student has no earlier data and becomes NaN.
    gpa_by_student = model_df.groupby('student_id')['GPA']
    prior_count = gpa_by_student.cumcount()
    prior_sum = gpa_by_student.cumsum() - model_df['GPA']
    model_df = model_df.assign(
        cumulative_gpa=prior_sum / prior_count.where(prior_count > 0)
    )

    model_df = model_df.groupby('student_id').agg({
        'semester_num': 'max',
        'cumulative_gpa': 'last',
        'GPA': 'last'
    }).rename(columns={'GPA': 'final_GPA'})

    # Students with only one semester have no prior GPA to learn from.
    model_df = model_df.dropna(subset=['cumulative_gpa'])

    # With a 20% hold-out, fewer students than this leaves under two test rows,
    # for which R^2 is not defined.
    min_students = 10
    if len(model_df) < min_students:
        print(
            f"Only {len(model_df)} students have at least two semesters of data "
            f"(need {min_students}). Skipping ML model training."
        )
    else:
        X = model_df.drop(columns=['final_GPA'])
        y = model_df['final_GPA']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        pipeline = Pipeline([
            ('model', RandomForestRegressor(n_estimators=100, random_state=42))
        ])

        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        # Take the square root explicitly: the `squared=` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, where passing it raises TypeError.
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)
        print(f"Test RMSE: {rmse:.3f}")
        print(f"Test R^2 Score: {r2:.3f}")

        # Plot the residuals directly. sns.residplot fits its own regression of
        # y on x and plots *that* fit's residuals, so feeding it residuals
        # would show residuals of residuals (and lowess=True needs statsmodels).
        residuals = y_test.to_numpy() - preds
        sns.scatterplot(x=preds, y=residuals)
        plt.axhline(0, color='gray', linestyle='--', linewidth=1)
        plt.title('Residual Plot')
        plt.xlabel('Predicted GPA')
        plt.ylabel('Residuals')
        plt.tight_layout()
        plt.show()
else:
    print("Required features 'semester_num' and/or 'cumulative_gpa' are missing. Skipping ML model training.")


## Export Cleaned Data

In [None]:
# Write the cleaned table, including any engineered feature columns added
# above, to a CSV file. The path is relative, so it resolves against the
# current working directory.
output_path = 'cleaned_data.csv'
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(output_path, index=False)
# Report the absolute location so the file is easy to find.
print(f'Data saved to {os.path.abspath(output_path)}')
