# Load and clean data

**We Import alle the Standard libraries we think we are going to need**

In [None]:
# Note: this import can change from class to class 

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score


# Style til grafer
sns.set(style="whitegrid")

## Loading and cleaning the real dataset for student performance

We start getting the column names and then move on to cleaning the dataset. We do that by first skipping the metadata and afterwards check if there are any dublicated values. in our case no duplicates values are found so we move on.

In [None]:
import pandas as pd
from io import StringIO

# Define the column we want 
column_names = [
    "StudentID", "Age", "Gender", "Ethnicity", "ParentalEducation", "StudyTimeWeekly",
    "Absences", "Tutoring", "ParentalSupport", "Extracurricular", "Sports", "Music",
    "Volunteering", "GPA", "GradeClass"
]

# Open the file and read only the data portion (skip comments and metadata)
with open("../Data/RawData/real_data_openML", "r") as f:
    data_lines = [
        line.strip() for line in f
        if line.strip() and not line.startswith(("%", "@"))
    ]

# Convert the filtered lines into a CSV-like format and load into DataFrame
realData = pd.read_csv(StringIO("\n".join(data_lines)), names=column_names)

print(realData.isnull().sum())
print("Duplicate values: ", realData.duplicated().sum())

realData.columns




#----------------------------------------------------------#
#----------------------------------------------------------#
#----------------------------------------------------------#




# 1. Fjern duplikater
realData = realData.drop_duplicates()

# 2. Drop rows med manglende værdier (eller evt. imputér, hvis få mangler)
realData = realData.dropna()

# 3. Fjern unødvendige kolonner
realData = realData.drop(columns=["StudentID"])



# 5. Konverter kolonner til numerisk (i tilfælde af forkerte typer)
num_cols = ["Age", "StudyTimeWeekly", "Absences", "GPA"]
for col in num_cols:
    realData[col] = pd.to_numeric(realData[col], errors="coerce")



# 6. Fjern ekstreme værdier i StudyTimeWeekly (fx > 100 timer/uge)
realData = realData[realData["StudyTimeWeekly"] <= 100]



# 7. Konverter ja/nej-variabler til 0/1
# Ensure binary/ordinal columns are numeric integers
binary_cols = ["Tutoring", "ParentalSupport", "Extracurricular", "Sports", "Music", "Volunteering"]
for col in binary_cols:
    realData[col] = pd.to_numeric(realData[col], errors="coerce").astype("Int64")





#----------------------------------------------------------#
#----------------------------------------------------------#
#----------------------------------------------------------#




# ---------- Tjek status ----------
print(realData.isnull().sum())
print("Duplicate values: ", realData.duplicated().sum())
print("Shape efter cleaning:", realData.shape)
print("Datatyper:\n", realData.dtypes)

realData.head()

StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64
Duplicate values:  0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64
Duplicate values:  0
Shape efter cleaning: (2392, 14)
Datatyper:
 Age                  float64
Gender               float64
Ethnicity            float64
ParentalEducation    float64
StudyTimeWeekly      float64
Absences             float64
Tutoring               Int64
ParentalSupport  

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,17.0,1.0,0.0,2.0,19.833723,7.0,1,2,0,0,1,0,2.929196,2.0
1,18.0,0.0,0.0,1.0,15.408756,0.0,0,1,0,0,0,0,3.042915,1.0
2,15.0,0.0,2.0,3.0,4.21057,26.0,0,2,0,0,0,0,0.112602,4.0
3,17.0,1.0,0.0,3.0,10.028829,14.0,0,3,1,0,0,0,2.054218,3.0
4,17.0,1.0,0.0,2.0,4.672495,17.0,1,3,0,0,0,0,1.288061,4.0


In [25]:
# safe the new cleaned dataset 

# Lav en kopi af det cleanede dataset
realData_cleaned = realData.copy()

# Opret mappe hvis den ikke allerede findes
output_dir = "../Data/CleanedData"
os.makedirs(output_dir, exist_ok=True)

# Gem CSV-fil
output_path = os.path.join(output_dir, "real_data_cleaned.csv")
realData_cleaned.to_csv(output_path, index=False)

print(f"Dataset gemt som: {output_path}")


Dataset gemt som: ../Data/CleanedData\real_data_cleaned.csv


## Loading and cleaning the simulated dataset for student performance

As with whe other dataset we check if there are any duplicated values.

In [26]:
# Load data 
simulatedData = pd.read_csv('../Data/RawData/simulated_data_kaggle.csv')

# ----------- Data cleaning for simulatedData -----------

# 1. Fjern duplikater
simulatedData = simulatedData.drop_duplicates()

# 2. Fjern rækker med manglende værdier
simulatedData = simulatedData.dropna()

# 3. Fjern unødvendige kolonner, hvis de findes (fx 'StudentID')
if "StudentID" in simulatedData.columns:
    simulatedData = simulatedData.drop(columns=["StudentID"])

# 4. Standardisér tekstfelter (ens formatering)
for col in simulatedData.select_dtypes(include='object').columns:
    simulatedData[col] = simulatedData[col].str.strip().str.lower()

# 5. Konverter kolonner til numerisk
num_cols = ["Age", "StudyTimeWeekly", "Absences", "GPA"]
for col in num_cols:
    if col in simulatedData.columns:
        simulatedData[col] = pd.to_numeric(simulatedData[col], errors="coerce")

# 6. Fjern outliers i StudyTimeWeekly (>100 timer/uge)
if "StudyTimeWeekly" in simulatedData.columns:
    simulatedData = simulatedData[simulatedData["StudyTimeWeekly"] <= 100]

# 7. Konverter ja/nej-kolonner til 0/1
binary_cols = ["Tutoring", "ParentalSupport", "Extracurricular", "Sports", "Music", "Volunteering"]
for col in binary_cols:
    if col in simulatedData.columns:
        simulatedData[col] = simulatedData[col].map({"yes": 1, "no": 0})

# 8. Reset index
simulatedData = simulatedData.reset_index(drop=True)

# ----------- Status -----------

print(simulatedData.isnull().sum())
print("Duplicate values: ", simulatedData.duplicated().sum())
print("Shape efter cleaning:", simulatedData.shape)
print("Datatyper:\n", simulatedData.dtypes)


student_id                       0
age                              0
gender                           0
study_hours_per_day              0
social_media_hours               0
netflix_hours                    0
part_time_job                    0
attendance_percentage            0
sleep_hours                      0
diet_quality                     0
exercise_frequency               0
parental_education_level         0
internet_quality                 0
mental_health_rating             0
extracurricular_participation    0
exam_score                       0
dtype: int64
Duplicate values:  0
Shape efter cleaning: (909, 16)
Datatyper:
 student_id                        object
age                                int64
gender                            object
study_hours_per_day              float64
social_media_hours               float64
netflix_hours                    float64
part_time_job                     object
attendance_percentage            float64
sleep_hours                      f

In [27]:


# Lav en kopi af det cleanede realData
realData_cleaned = realData.copy()

# Lav en kopi af det cleanede simulatedData
simulatedData_cleaned = simulatedData.copy()

# Opret mappen hvis den ikke findes
output_dir = "../Data/CleanedData"
os.makedirs(output_dir, exist_ok=True)

# Gem realData_cleaned
real_path = os.path.join(output_dir, "real_data_cleaned.csv")
realData_cleaned.to_csv(real_path, index=False)

# Gem simulatedData_cleaned
sim_path = os.path.join(output_dir, "simulated_data_cleaned.csv")
simulatedData_cleaned.to_csv(sim_path, index=False)

print(f"realData gemt som: {real_path}")
print(f"simulatedData gemt som: {sim_path}")


realData gemt som: ../Data/CleanedData\real_data_cleaned.csv
simulatedData gemt som: ../Data/CleanedData\simulated_data_cleaned.csv
