# 🎓 GPA Data Cleaning and Preprocessing
This notebook cleans and explores student GPA data loaded from an Excel file. The goal is to produce a clean dataset suitable for further analysis or machine learning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## 🧩 Helper Functions

In [None]:
# Define your reusable functions here
def load_excel_file(path):
    """Read the Excel workbook at ``path`` and return it as a DataFrame.

    All ``pd.read_excel`` defaults are used.
    """
    frame = pd.read_excel(path)
    return frame

def drop_unnecessary_columns(df, columns_to_drop):
    """Return a new DataFrame without the columns named in ``columns_to_drop``.

    Names that are not present in ``df`` are skipped silently instead of
    raising, so the same list can be reused across differently shaped files.
    """
    trimmed = df.drop(labels=columns_to_drop, axis=1, errors='ignore')
    return trimmed

def clean_missing_ids(df, id_column):
    """Return only the rows of ``df`` whose ``id_column`` value is not missing.

    The original row labels are kept; callers that want a fresh 0..n-1
    index have to reset it themselves.
    """
    has_id = df[id_column].notna()
    return df[has_id]

## 📥 Load Raw Data

In [None]:
# Read the raw workbook into a DataFrame and preview the first rows
file_path = 'your_excel_file.xlsx'  # Update with your actual file path
df = load_excel_file(path=file_path)
df.head()

## 🧼 Data Cleaning and Preprocessing

In [None]:
# Remove unwanted columns, discard rows without a student ID,
# then renumber the remaining rows from zero
columns_to_drop = ['Unnamed: 0', 'ExtraColumn']  # Add/remove as needed
df = clean_missing_ids(
    drop_unnecessary_columns(df, columns_to_drop),
    'student_id',
).reset_index(drop=True)
df.head()

## 📊 Exploratory Data Analysis

In [None]:
# Missing values bar plot
# Count nulls per column and keep only the columns that actually have gaps.
missing = df.isnull().sum()
missing = missing[missing > 0]
if missing.empty:
    # A bar chart of an empty Series cannot be drawn, so report the clean
    # state instead of letting the plot call fail.
    print('No missing values found.')
else:
    # Sort ascending so the column with the most gaps ends up at the top.
    missing.sort_values().plot(kind='barh', figsize=(8, 4), title='Missing Values per Column')
    plt.tight_layout()
    plt.show()

In [None]:
# Histogram of GPA values with a KDE overlay; skipped when there is no GPA column
if 'GPA' in df.columns:
    gpa_ax = sns.histplot(df['GPA'], kde=True, bins=20)
    gpa_ax.set_title('GPA Distribution')
    gpa_ax.set_xlabel('GPA')
    gpa_ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()

In [None]:
# Bar chart of rows per semester, ordered by semester label (example)
if 'semester' in df.columns:
    semester_counts = df['semester'].value_counts().sort_index()
    semester_ax = semester_counts.plot(kind='bar')
    semester_ax.set_title('Student Count by Semester')
    semester_ax.set_xlabel('Semester')
    semester_ax.set_ylabel('Number of Students')
    plt.tight_layout()
    plt.show()

## 💾 Export Cleaned Data

In [None]:
# Write the cleaned frame to a new workbook, leaving the row index out
output_path = 'cleaned_data.xlsx'
df.to_excel(excel_writer=output_path, index=False)
print(f'Data saved to {output_path}')

## 🚀 Next Steps
- Perform deeper analysis (e.g., GPA trends, course difficulty)
- Engineer new features
- Apply ML models to predict student success