# Load and clean data

In [22]:
# Note: this import can change from class to class in MP3 

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Style til grafer
sns.set(style="whitegrid")

## Loading and cleaning the real dataset for student performance

In [None]:
import pandas as pd
from io import StringIO

# Define the column names as per the metadata
column_names = [
    "StudentID", "Age", "Gender", "Ethnicity", "ParentalEducation", "StudyTimeWeekly",
    "Absences", "Tutoring", "ParentalSupport", "Extracurricular", "Sports", "Music",
    "Volunteering", "GPA", "GradeClass"
]

# Open the file and read only the data portion (skip comments and metadata)
with open("../Data/student_performance_data", "r") as f:
    data_lines = [
        line.strip() for line in f
        if line.strip() and not line.startswith(("%", "@"))
    ]

# Convert the filtered lines into a CSV-like format and load into DataFrame
realData = pd.read_csv(StringIO("\n".join(data_lines)), names=column_names)

print(realData.isnull().sum())
print("Duplicate values: ", realData.duplicated().sum())

realData.columns


StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64
Duplicate values:  0


Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass'],
      dtype='object')

## Loading and cleaning the simulated dataset for student performance

In [24]:
# Load data 
simulatedData = pd.read_csv('../Data/student_habits_performance.csv')


# Clean dataset (there are no duplicates or null values, so no action needed for deleting those)
print(simulatedData.isnull().sum())
print("Duplicate values: ", simulatedData.duplicated().sum())

realData.columns

student_id                        0
age                               0
gender                            0
study_hours_per_day               0
social_media_hours                0
netflix_hours                     0
part_time_job                     0
attendance_percentage             0
sleep_hours                       0
diet_quality                      0
exercise_frequency                0
parental_education_level         91
internet_quality                  0
mental_health_rating              0
extracurricular_participation     0
exam_score                        0
dtype: int64
Duplicate values:  0


Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass'],
      dtype='object')