# GPA Data Cleaning and Preprocessing
This notebook demonstrates how to load, clean, and explore a student GPA dataset downloaded from Kaggle. The objective is to prepare the dataset for potential use in analysis or machine learning tasks.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed (presumably to keep the notebook's cell output uncluttered).
# NOTE(review): this also hides deprecation and chained-assignment warnings
# from the libraries imported above — consider narrowing it to specific
# warning categories.
warnings.filterwarnings("ignore")

## Helper Functions

In [None]:
def drop_unnecessary_columns(df, columns_to_drop):
    """Return a new DataFrame without the columns named in ``columns_to_drop``.

    Labels that do not exist in ``df`` are skipped silently rather than
    raising, and ``df`` itself is left unmodified.
    """
    trimmed = df.drop(columns_to_drop, axis="columns", errors="ignore")
    return trimmed

def clean_missing_ids(df, id_column):
    """Keep only the rows of ``df`` whose ``id_column`` value is present.

    Rows where the ID is NaN/None are filtered out. The surviving rows keep
    their original index labels, and ``df`` itself is not modified.
    """
    has_id = ~df[id_column].isna()
    return df[has_id]

## Load Dataset from Kaggle

In [None]:
import kagglehub

# Download the dataset
# The value returned by kagglehub.dataset_download is used below as the local
# directory that holds the dataset's files.
path = kagglehub.dataset_download("mohammadalazawi/student-gpa")
print("Dataset files located at:", path)

# Load the dataset
# NOTE(review): assumes the download contains a file named 'GPA.csv' directly
# inside `path` — confirm against the dataset's file listing.
file_path = os.path.join(path, 'GPA.csv')
df = pd.read_csv(file_path)
df.head()  # preview the first rows as the cell's output

## Data Cleaning and Preprocessing

In [None]:
# 'Unnamed: 0' is presumably a leftover index column written by an earlier
# CSV export; it is removed only if it is actually present.
columns_to_drop = ['Unnamed: 0']
df = drop_unnecessary_columns(df, columns_to_drop)

# Discard rows that have no student ID, but only when the dataset has that column.
if 'student_id' in df.columns:
    df = clean_missing_ids(df, 'student_id')

# Renumber the rows 0..n-1 so gaps left by removed rows disappear.
df = df.reset_index(drop=True)
df.head()

## Exploratory Data Analysis

In [None]:
# Visualize missing values
# Count the nulls in each column and keep only the columns that have any.
missing = df.isnull().sum()
missing = missing[missing > 0]
if missing.empty:
    # pandas raises when asked to plot an empty Series, so a dataset with no
    # missing values is reported in text instead of crashing the cell.
    print('No missing values found in any column.')
else:
    # Horizontal bars, sorted ascending so the column with the most gaps is on top.
    missing.sort_values().plot(kind='barh', figsize=(8, 4), title='Missing Values per Column')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# GPA distribution plot
# Drawn only when the dataset actually has a 'GPA' column.
if 'GPA' in df.columns:
    sns.set_theme(style="whitegrid")
    # histplot returns the Axes it drew on; decorate that object directly.
    gpa_ax = sns.histplot(df['GPA'], kde=True, bins=20)
    gpa_ax.set_title('Distribution of GPA')
    gpa_ax.set_xlabel('GPA')
    gpa_ax.set_ylabel('Frequency')
    gpa_ax.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Count of students by semester
# Drawn only when the dataset actually has a 'semester' column.
if 'semester' in df.columns:
    # Tally rows per semester value, ordered by the semester label.
    per_semester = df['semester'].value_counts().sort_index()
    semester_ax = per_semester.plot(kind='bar')
    semester_ax.set_title('Student Count per Semester')
    semester_ax.set_xlabel('Semester')
    semester_ax.set_ylabel('Number of Students')
    semester_ax.grid(True)
    plt.tight_layout()
    plt.show()

## Export Cleaned Data

In [None]:
# Write the cleaned DataFrame to a CSV file. The path is relative, so the file
# lands in the current working directory; index=False keeps the row index
# out of the output.
output_path = 'cleaned_data.csv'
df.to_csv(output_path, index=False)
# Print the absolute location so the exported file is easy to find.
print(f'Data saved to {os.path.abspath(output_path)}')