# Data-set 

Age: The age of the student.
Gender: The gender of the student (M for Male, F for Female).
Parental_Education: The highest level of education attained by the student's parents.
Family_Income: The family income level.
Previous_Grades: The student's previous academic performance (A, B, or C grades).
Attendance: The percentage of attendance in classes.
Class_Participation: The level of participation in class activities (Low, Medium, or High).
Study_Hours: The average number of study hours per week.
Major: The student's major or field of study.
Uni_Type: The type of University attended (Public or Private).
Financial_Status: The financial status of the student (Low, Medium, or High).
Parental_Involvement: The level of parental involvement in the student's education (Low, Medium, or High).
Educational_Resources: Availability of educational resources at home (Yes or No).
Motivation: The level of motivation towards studies (Low, Medium, or High).
Self_Esteem: The level of self-esteem (Low, Medium, or High).
Stress_Levels: The level of stress experienced by the student (Low, Medium, or High).
School_Environment: Perception of the school environment (Negative, Neutral, or Positive).
Professor_Quality: The quality of professors (Low, Medium, or High).
Class_Size: The size of the class.
Extracurricular_Activities: Participation in extracurricular activities (Yes or No).
Sleep_Patterns: Average hours of sleep per day.
Nutrition: The quality of nutrition (Unhealthy, Balanced, or Healthy).
Physical_Activity: Level of physical activity (Low, Medium, or High).
Screen_Time: Hours spent on screen-based activities per day.
Educational_Tech_Use: Use of educational technology (Yes or No).
Peer_Group: Perception of peer group influence (Negative, Neutral, or Positive).
Bullying: Experience of bullying (Yes or No).
Study_Space: Availability of a dedicated study space at home (Yes or No).
Learning_Style: Preferred learning style (Visual, Auditory, or Kinesthetic).
Tutoring: Participation in tutoring programs (Yes or No).
Mentoring: Availability of mentoring support (Yes or No).
Lack_of_Interest: Level of interest in academics (Low, Medium, or High).
Time_Wasted_on_Social_Media: Time spent on social media platforms.
Sports_Participation: Level of participation in sports activities (Low, Medium, or High).
Grades: The final grades achieved by the student (A, B, or C).

# Imports

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
from imblearn.over_sampling import RandomOverSampler

#load the raw student data-set from the working directory
DATA_FILE = 'StudentData.csv'
df = pd.read_csv(DATA_FILE)

: 

# Removing Redundancies

Remove highly correlated columns in order to simplify the decision tree for better interpretability and remove useless redundancies.

In [None]:
#select numerical columns
numerical_df = df.select_dtypes(include=['float64','int64'])

#calculate correlation matrix from numerical columns
correlation_matrix = numerical_df.corr()
abs_corr = correlation_matrix.abs()

#identify redundant columns
#every column correlates 1.0 with itself, so testing a whole row/column of the
#matrix would flag every numerical column. Each column is therefore compared
#only with the columns kept before it, and from a highly correlated pair only
#the later column is dropped.
corr_threshold = 0.8
kept_columns = []
high_corr_columns = []
for column in abs_corr.columns:
    #the target is never dropped and never used as a reference column
    if column == 'Grades':
        continue
    #comparisons against NaN correlations evaluate to False, so such pairs are kept
    if any(abs_corr.loc[column, kept] > corr_threshold for kept in kept_columns):
        high_corr_columns.append(column)
    else:
        kept_columns.append(column)

print(f"Dropped comlumns: {high_corr_columns}\n")

df = df.drop(columns=high_corr_columns)

#remove spaces and replace with _
df['Parental_Education'] = df['Parental_Education'].astype('category')
df['Parental_Education'] = df['Parental_Education'].cat.rename_categories({
    'High School':'High_School',
    'Some College':'Some_College'
})

#display data frame with first row
print(df.iloc[0])

# Converting Numerical To Categorical Data

In [None]:
#coerce the income column to numbers; entries that cannot be parsed become NaN
income_values = df['Family_Income']
df['Family_Income'] = pd.to_numeric(income_values, errors='coerce')

def categorize(column, bins, labels):
    """Bin numerical values into labelled intervals.

    column: the numerical values to bin.
    bins: the ordered bin edges passed to pd.cut.
    labels: one label per interval, i.e. one fewer than the number of edges.

    Returns the result of pd.cut with the lowest edge included in the first
    interval; values outside the edges come back as missing.
    """
    binned = pd.cut(x=column, bins=bins, labels=labels, include_lowest=True)
    return binned

#bin edges and labels used to turn the income values into string categories
income_bins = [30000.0, 39000.0, 48000.0, 57000.0, 66000.0, 75000.0]
income_labels = ["LOW", "LOWER_MIDDLE", "MIDDLE", "UPPER_MIDDLE", "HIGH"]
df['Family_Income_Category'] = categorize(df['Family_Income'], bins=income_bins, labels=income_labels)

#binning of the other numerical attributes, currently disabled
#  df['Attendance_Category'] = categorize(df['Attendance'], bins=[0.0, 50.0, 75.0, 90.0, 100.0], labels=["LOW", "AVERAGE", "GOOD", "EXCELLENT"])
#  df['Study_Hours_Category'] = categorize(df['Study_Hours'], bins=[5.0, 8.0, 11.0, 14.0, 17.0, 20.0], labels=["VERY_LOW", "LOW", "MODERATE", "HIGH", "VERY_HIGH"])
#  df['Sleep_Patterns_Category'] = categorize(df['Sleep_Patterns'], bins=[4.0, 5.0, 6.0, 7.0, 8.0, 9.0], labels=["NONE_TO_LITTLE", "LITTLE", "MEDIUM", "MUCH", "LOTS"])
#  df['Time_on_Social_Media'] = categorize(df['Time_Wasted_on_Social_Media'], bins=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0], labels=["NONE_TO_LITTLE", "LITTLE", "MEDIUM", "MUCH", "LOTS"])

#the numerical income column is replaced by its category
df = df.drop('Family_Income', axis=1)

#show the first row of the data frame
print(df.iloc[0])

# Removing/Filling Null Data

If a row has more than 7 null attributes it will be removed; otherwise its missing values will be filled with the most frequent value in the corresponding attribute column.

In [None]:
#delete row if more than 7 attributes are null
#dropna(thresh=k) KEEPS the rows that have at least k non-null values, so the
#threshold is the column count minus the number of nulls that is tolerated
max_nulls = 7
df = df.dropna(thresh=max(df.shape[1] - max_nulls, 0))

#fill missing with most frequent value in column
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

#every column that still has gaps is filled, not only the categorical ones
for col in df.columns[df.isna().any()]:
    modes = df[col].mode()
    #a column without any observed value has no mode and is left untouched
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

# Categorical To String
Data must be converted to string to be read by the c++ code.

In [None]:
#cast every column of the data frame to plain strings
df = df.astype({name: str for name in df.columns})

# Splitting Train-Test

Train will contain 70% of the data and Test will have the remaining 30%.

In [None]:
#set target values
X = df.drop(columns=['Grades'])
y = df['Grades']

#split the data (70-30), seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#combine features and target back for train
#the target column is named 'Grades', the same name it has in the data frame
train_data = X_train.copy()
train_data['Grades'] = y_train

#the saved test file holds the features only, so the target is not attached
test_data = X_test.copy()

#display first row of the train data
print(train_data.iloc[0])

#save files
os.makedirs('PrepData', exist_ok=True)
train_data.to_csv('PrepData/Stdnt_Train.csv', index=False)
test_data.to_csv('PrepData/Stdnt_Test.csv', index=False)

# Over sampling
In order to reach a running time of about 10 s for the sequential code, we need to oversample the data.

With 7k data of training we got about 1 second of run time.

Therefore, in order to achieve 10 seconds of run time we will need 700k rows of data.

Oversampled train and test data will be saved in .csv and .txt files.

The .txt file is easier for the .c id3 implementation to read: the ',' separator is replaced by a ' ' in the .txt file.

## Train

In [None]:
#desired sample size
#NOTE(review): the notes for this section mention 700k rows - confirm the intended size
desired_size = 500000
current_size = len(y_train)
#approximate multiplier to reach target value
#kept at 1 or more: a multiplier of 0 would request 0 rows per class, which is
#not a valid over-sampling target
multiplier = max(1, desired_size // current_size)

#multiplier-based strategy: every class grows by the same factor, so the
#original class proportions are preserved
class_targets = {label: multiplier * count for label, count in y_train.value_counts().items()}
#seeded like the train/test split so the generated file is reproducible
oversampler = RandomOverSampler(sampling_strategy=class_targets, random_state=42)

#apply oversampler to reach row count
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

#combine features and target
train_data_oversampled = X_train_oversampled.copy()
train_data_oversampled['Grades'] = y_train_oversampled

#save train data
os.makedirs('PrepData', exist_ok=True)
#csv
#train_data_oversampled.to_csv('PrepData/Stdnt_Oversampled_Train.csv', index=False)
#txt with space separators and no header row
train_data_oversampled.to_csv('PrepData/Stdnt_Oversampled_Train.txt', sep=' ', header=False, index=False)

#row count
print(f"Oversampled dataset size: {len(train_data_oversampled)} rows")

## Test

In [None]:
#desired sample size
desired_test_size = 500000
current_test_size = len(y_test)
#approximate multiplier to reach target value
#kept at 1 or more: a multiplier of 0 would request 0 rows per class, which is
#not a valid over-sampling target
test_multiplier = max(1, desired_test_size // current_test_size)
#multiplier-based strategy: every class grows by the same factor
test_class_targets = {label: test_multiplier * count for label, count in y_test.value_counts().items()}
#seeded like the train/test split so the generated file is reproducible
oversampler_test = RandomOverSampler(sampling_strategy=test_class_targets, random_state=42)
#apply oversampler to reach row count
X_test_oversampled, y_test_oversampled = oversampler_test.fit_resample(X_test, y_test)

#the saved test file holds the features only, so the class attribute is not attached
test_data_oversampled = X_test_oversampled.copy()
#save test data
#NOTE(review): the file below is written to PrepData/, not PrepData/Oversampled/;
#the nested directory is created but nothing is written to it - confirm which location is intended
os.makedirs('PrepData/Oversampled', exist_ok=True)
#csv
#test_data_oversampled.to_csv('PrepData/Stdnt_Oversampled_Test.csv', index=False)
#txt with space separators and no header row
test_data_oversampled.to_csv('PrepData/Stdnt_Oversampled_Test.txt', sep=' ', header=False, index=False)

print(f"Oversampled test set size: {len(test_data_oversampled)} rows")