In [4]:
import pandas as pd
import numpy as np
import os
from google.colab import files

# Load the dataset
print("Loading dataset...")
df = pd.read_csv('student_performance.csv')  # Make sure the file is uploaded to Colab
print("Dataset loaded successfully.")

# Display basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
print("\nInitial Dataset Info:")
df.info()
print("\nFirst 5 rows:")
print(df.head())

# Check for missing values (per-column count of nulls)
print("\nMissing Values:")
print(df.isnull().sum())

# Handle missing values (if any): any row with at least one null is dropped
print("\nHandling missing values...")
df.dropna(inplace=True)
print("Missing values handled.")

# Check for duplicates (counted after the null rows were removed)
print("\nDuplicate Rows:")
print(df.duplicated().sum())

# Remove duplicates (if any), keeping the first occurrence of each row
print("\nRemoving duplicates...")
df.drop_duplicates(inplace=True)
print("Duplicates removed.")

# Standardize column names: trim surrounding whitespace, lowercase, and turn
# spaces into underscores. regex=False keeps the replacement a literal-string
# substitution regardless of the pandas version's default.
print("\nStandardizing column names...")
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
print("Column names standardized:")
print(df.columns)

# Feature Engineering: average of the three score columns.
# This relies on the standardized names above; if 'average_score' is already
# present in the input it is overwritten with the recomputed value.
print("\nCreating 'average_score' feature...")
df['average_score'] = (df['math_score'] + df['reading_score'] + df['writing_score']) / 3
print("'average_score' feature created.")

# Outlier Detection: Remove outliers using the IQR method
print("\nRemoving outliers...")
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive. Rows whose value is NaN never satisfy the
    comparison and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` within the fences, with their original index labels.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)
    # Series.between is inclusive on both ends by default, matching >= / <=.
    return df[values.between(q1 - margin, q3 + margin)]

# Apply the IQR filter to each score column in turn. The filters are
# sequential: each pass computes its quartiles on the rows that survived the
# previous pass.
for score_column in ('math_score', 'reading_score', 'writing_score'):
    df = remove_outliers(df, score_column)
print("Outliers removed.")

# Single source of truth for the output location, used for saving, logging
# and downloading below.
output_path = 'data/student_performance_cleaned_unique.csv'

# Create the 'data' folder if it doesn't exist. exist_ok=True makes this a
# no-op when the directory is already there, without a separate existence check.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the cleaned dataset (index=False: the row index is not written out)
print("\nSaving cleaned dataset...")
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved to '{output_path}'.")

# Download the CSV file to your local machine (Colab-only helper)
print("\nDownloading cleaned dataset...")
files.download(output_path)

Loading dataset...
Dataset loaded successfully.

Initial Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       986 non-null    object 
 1   race/ethnicity               986 non-null    object 
 2   parental_level_of_education  986 non-null    object 
 3   lunch                        986 non-null    object 
 4   test_preparation_course      986 non-null    object 
 5   math_score                   986 non-null    int64  
 6   reading_score                986 non-null    int64  
 7   writing_score                986 non-null    int64  
 8   average_score                986 non-null    float64
dtypes: float64(1), int64(3), object(5)
memory usage: 69.5+ KB
None

First 5 rows:
   gender race/ethnicity parental_level_of_education         lunch  \
0  female        group B          

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>