# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Uploading the dataset and Inspecting

In [None]:

df = pd.read_csv("StudentsPerformance2.csv")
df.head()

Inspecting the first 5 rows of the data

In [None]:
df.describe(include = 'all')

Checking the more info about the data

In [None]:
df.info()

Inspecting the data types

In [None]:
df.columns.unique()

# Cleaning the data

In [None]:
df.isnull().sum()

checking missing values and it appears none are missing

In [None]:
df.duplicated().sum()

Checking for duplicates and it appears none are duplicated

# Visualise Relationships

In [None]:
sns.boxplot(x = 'gender', y = 'math score', data = df)
plt.title("Math Score by Gender")
plt.show()

We can see that males are more consistent and higher scores in math than girls. Also girls have a lot of outliers

In [None]:
sns.boxplot(x='gender', y='writing score', data = df)
plt.title("Writing Score by Gender")
plt.show()

Females scored higher with a lot of outliers while males are consistent but lower scores

In [None]:
sns.boxplot(x='gender', y='reading score', data=df)
plt.title("Reading Score by Gender")
plt.show()

Females scored higher than males in reading

In [None]:
sns.barplot(x = 'parental level of education', y = 'math score', data = df)
plt.xticks(rotation=45)
plt.title("Math Score by Parental Education")
plt.show()

Both master's degree and bachelor's degree parents have children with better scores in math. High school being the lowest

In [None]:
sns.barplot(x = 'parental level of education', y = 'writing score', data = df)
plt.xticks(rotation=45)
plt.title("Writing Score by Parental Education")
plt.show()

We can see the master's here has the highest score

In [None]:
sns.barplot(x = 'parental level of education', y = 'reading score', data = df)
plt.xticks(rotation=45)
plt.title("Reading Score by Parental Education")
plt.show()

Again master's degree scored the highest

In [None]:
numeric_features = ['math score', 'reading score', 'writing score']
df['average score'] = df[numeric_features].mean(axis = 1)
df.head()

Creating a new feature to have better insights on the total score

In [None]:
sns.boxplot(x = 'gender', y = 'average score', data = df)
plt.title("Average score by gender")
plt.show()

The average scores of females is higher than that of males but with outliers while the male score is more consistent

In [None]:
sns.barplot(x = 'parental level of education', y = 'average score', data = df)
plt.xticks(rotation = 45)
plt.title("Average score by Parental Education")
plt.show()

The average score of master's still being the highest followed by bachelor's and high school still being the lowest score

# Training a Regression Model

In [None]:
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course']

target = 'average score'
X = df[categorical_features + numeric_features]
y = df[target]

Splitting the features

In [None]:
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features)
])

pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(random_state=1))
])

Creating pipeline for automation

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_val)
mae = mean_absolute_error(y_val, predictions)
print("MAE:", mae)

Trained the model on the data and used the predictions to calculate mean absolute error. It gave us an error of 0.53 when calculating average scores which is very good considering we didn't normalise the data and the score is from 0 to 100. If we wanted to improve the model, cross-validation will lower the error furthermore

In [None]:
scores = cross_val_score(pipeline, X, y,
                         scoring='neg_mean_absolute_error',
                         cv=5)

# Convert negative scores to positive MAE
mae_scores = -scores

print("MAE scores for each fold:", mae_scores)
print("Average MAE:", mae_scores.mean())

The mean absolute error decreased by 0.03 after we used cross-validation so it didn't matter that much and because the error is so minimal, normalising the data won't affect that much as well