##**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

##**Load the Dataset**



In [None]:
# Load the student-performance CSV into `df`.
# NOTE(review): the absolute /content/ path looks like a hosted-runtime (Colab-style)
# location — confirm or adjust it before running elsewhere.
df = pd.read_csv('/content/StudentsPerformance.csv')

NameError: name 'pd' is not defined

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

Exploratory Data Analysis

In [None]:
# Report the frame's (rows, columns) shape.
print(f"Shape Of The Dataset: {df.shape}")

In [None]:
# Check data types and missing values: info() lists each column's dtype and non-null count.
df.info()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Detect outliers using the z-score method: a score is an outlier when it lies
# more than `threshold` standard deviations away from its subject's mean.
z_scores_math = np.abs((df['math score'] - df['math score'].mean()) / df['math score'].std())
z_scores_reading = np.abs((df['reading score'] - df['reading score'].mean()) / df['reading score'].std())
z_scores_writing = np.abs((df['writing score'] - df['writing score'].mean()) / df['writing score'].std())

# Define threshold for outliers
threshold = 3

# Identify outliers (kept per subject so they can be inspected)
outliers_math = df[z_scores_math > threshold]
outliers_reading = df[z_scores_reading > threshold]
outliers_writing = df[z_scores_writing > threshold]

# Handle outliers by keeping only rows that are within the threshold on all three
# subjects. The explicit .copy() makes the result an independent frame, so the
# column assignments made later in the notebook (e.g. df['Percentage']) do not
# trigger SettingWithCopyWarning.
within_threshold = (
    (z_scores_math <= threshold)
    & (z_scores_reading <= threshold)
    & (z_scores_writing <= threshold)
)
df = df[within_threshold].copy()

In [None]:
# Summary statistics of the numeric columns after the outlier filter.
df.describe()

In [None]:
# Overall percentage: mean of the three subject scores, rounded to 2 decimals.
three_subject_total = df['reading score'] + df['writing score'] + df['math score']
df['Percentage'] = (three_subject_total / 3).round(2)

##**Data Visualization**

In [None]:
# Donut chart: share of each gender.
gender = df['gender'].value_counts()

fig = px.pie(names=gender.index, values=gender.values, hole=0.8)
fig.update_traces(textfont_size=16, textinfo='label+percent')

# Caption placed in the middle of the donut hole.
center_caption = dict(text='Gender', x=0.5, y=0.5, font_size=20, showarrow=False)
fig.update_layout(font=dict(family="arial", size=20), annotations=[center_caption])
fig.show()

In [None]:
# Donut chart: share of each race/ethnicity group.
ethnicity = df['race/ethnicity'].value_counts()

fig = px.pie(names=ethnicity.index, values=ethnicity.values, hole=0.8)
fig.update_traces(textfont_size=16, textinfo='label+percent')

# Caption placed in the middle of the donut hole.
center_caption = dict(text='Race distribution', x=0.5, y=0.5, font_size=20, showarrow=False)
fig.update_layout(font=dict(family="arial", size=20), annotations=[center_caption])
fig.show()

In [None]:
# Donut chart: share of students per test-preparation-course status.
test_preparation_course = df['test preparation course'].value_counts()

fig = px.pie(names=test_preparation_course.index,
             values=test_preparation_course.values,
             hole=0.8)
fig.update_traces(textfont_size=14, textinfo='label+percent')

# Caption placed in the middle of the donut hole.
center_caption = dict(text="test preparation course", x=0.5, y=0.5, font_size=20, showarrow=False)
fig.update_layout(font=dict(family="arial", size=15), annotations=[center_caption])
fig.show()

In [None]:
# Donut chart: share of each parental education level.
parental_education = df['parental level of education'].value_counts()

fig = px.pie(names=parental_education.index,
             values=parental_education.values,
             hole=0.8)
fig.update_traces(textfont_size=14, textinfo='label+percent')

# Caption placed in the middle of the donut hole.
center_caption = dict(text="Parent's Education", x=0.5, y=0.5, font_size=20, showarrow=False)
fig.update_layout(font=dict(family="arial", size=15), annotations=[center_caption])
fig.show()

In [None]:
# Distribution of the scores in Math, Reading, and Writing:
# one histogram panel per subject, laid out side by side.
plt.figure(figsize=(12, 6))

# (score column, bar color, human-readable subject label)
histogram_panels = [
    ('math score', 'skyblue', 'Math Score'),
    ('reading score', 'salmon', 'Reading Score'),
    ('writing score', 'lightgreen', 'Writing Score'),
]

for panel_no, (score_col, bar_color, subject_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, panel_no)
    plt.hist(df[score_col], color=bar_color, edgecolor='black')
    plt.title(subject_label + ' Distribution')
    plt.xlabel(subject_label)
    plt.ylabel('Frequency')

plt.show()

In [None]:
# Cumulative KDE of Percentage, split by each categorical feature, on a 2x2 grid.
fig, ax = plt.subplots(2, 2, figsize=(25, 12))

hue_columns = ["gender", "race/ethnicity", "parental level of education", "test preparation course"]

# ax.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1].
for panel, hue_column in zip(ax.flat, hue_columns):
    sns.kdeplot(data=df, x="Percentage", hue=hue_column, palette='inferno',
                cumulative=True, common_norm=False, ax=panel)

plt.show()

- Females have a higher percentage than males.
- Students whose parents hold a master's degree have a higher percentage than the others.
- Students who completed the test preparation course have a higher percentage than those who didn't.


- Students whose parents never went to college seem to have the lowest percentage.
- Students whose parents have a master's degree performed best, followed by those whose parents have a bachelor's degree.

In [None]:
# Histogram of each subject score, colored by gender (one panel per subject).
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
for panel, score_col in zip(ax, ['writing score', 'reading score', 'math score']):
    ax1 = sns.histplot(x=df[score_col], hue=df['gender'], palette='plasma', ax=panel)

In [None]:
# Mean score per subject for each gender (subjects as rows, genders as columns).
subject_columns = ['math score', 'reading score', 'writing score']
gender_mean_score = df.groupby('gender')[subject_columns].mean().round(2).transpose()
print(gender_mean_score)

# Show the same numbers as a styled plotly table.
table_header = dict(
    values=['', '<b>Female</b>', '<b>Male</b>'],
    line_color='navy',
    fill_color='darkcyan',
    align='center',
    font_size=20,
)
table_cells = dict(
    values=[gender_mean_score.index, gender_mean_score['female'], gender_mean_score['male']],
    line_color='navy',
    fill_color='azure',
    align='center',
    height=40,
    font_size=20,
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(width=600, height=400)
fig.show()

- Females tend to do better than males in both reading and writing
- Males perform better in Maths

In [None]:
# Pairwise correlation between the three subject scores. The columns are picked
# by name so the result does not depend on where they sit in the frame.
df_corr = df[['math score', 'reading score', 'writing score']].corr()
df_corr

- Almost all of these scores are highly correlated with each other
- Maths score seems to be the least correlated among these, therefore we will try to predict maths score during modelling

In [None]:
def Grade(percentage):
    """Map a percentage to a letter grade.

    "A+" requires at least 95; every other band uses a strict lower bound
    (above 81 -> "A", above 71 -> "B", above 61 -> "C", above 51 -> "D",
    above 41 -> "E"); anything else is "F".
    """
    if percentage >= 95:
        return "A+"
    # Checked from the highest band down; the first cutoff exceeded wins.
    for cutoff, letter in ((81, "A"), (71, "B"), (61, "C"), (51, "D"), (41, "E")):
        if percentage > cutoff:
            return letter
    return "F"

# Letter grade for every student, derived from the Percentage column.
df["Grade"] = df['Percentage'].apply(Grade)

In [None]:
# Mean Percentage within each letter grade, split by gender.
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='Grade', y='Percentage', hue='gender', palette='crest');

##**Transformation pipeline**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [None]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Replace ordered categorical labels with their integer rank.

    Each of the columns "Grade", "parental level of education" and
    "race/ethnicity" is mapped to the position of its label in the matching
    ordering list (0 = lowest).

    Parameters
    ----------
    grades_ordering : list of str
        Letter grades from worst to best.
    ethnicity_ordering : list of str
        Ordering applied to the race/ethnicity groups.
        NOTE(review): these groups look nominal rather than ordered — confirm
        that a rank encoding is intended for them.
    parents_education_ordering : list of str
        Education levels from lowest to highest. 'some high school' ranks
        below 'high school', since it denotes an unfinished high school
        education.
    """

    def __init__(self, grades_ordering = ['F', 'E', 'D', 'C', 'B', 'A', 'A+'],
                 ethnicity_ordering = ['group A', 'group B', 'group C', 'group D', 'group E'],
                 parents_education_ordering = ['some high school', 'high school', 'some college', "associate's degree", "bachelor's degree", "master's degree"]):

        # Parameters are stored untouched; the default lists are only ever read.
        self.grades_ordering = grades_ordering
        self.ethnicity_ordering = ethnicity_ordering
        self.parents_education_ordering = parents_education_ordering

    def fit(self, X, y=None):
        """Nothing is learned from the data; the orderings come from the constructor."""
        return self

    def transform(self, X):
        """Return a copy of X with the three ordinal columns replaced by integer ranks.

        Raises
        ------
        ValueError
            If a value is missing from its ordering list.
        """
        # Work on a copy so the caller's frame is not modified.
        X = X.copy()
        orderings = {
            "Grade": self.grades_ordering,
            "parental level of education": self.parents_education_ordering,
            "race/ethnicity": self.ethnicity_ordering,
        }
        for column, ordering in orderings.items():
            X[column] = X[column].apply(lambda label, ordering=ordering: ordering.index(label))
        return X


In [None]:
# Column groups, named after the preprocessing each group receives.
num_cols = ['reading score', 'writing score', 'Percentage']
cat_cols = ['gender', 'lunch', 'test preparation course']
ordinal_cols = ['Grade', 'race/ethnicity', 'parental level of education']

# Scale the numeric columns, rank-encode the ordinal ones and one-hot encode the
# remaining categoricals; any column not listed is passed through unchanged.
column_steps = [
    ('std_scaler', StandardScaler(), num_cols),
    ('ord_encode', CustomOrdinalEncoder(), ordinal_cols),
    ('label_encode', OneHotEncoder(), cat_cols),
]
pipeline = ColumnTransformer(column_steps, remainder='passthrough')

In [None]:
# The math score is the prediction target, so Percentage and Grade are rebuilt
# from reading and writing only; otherwise the features would contain the target.
df['Percentage'] = round((df['reading score'] + df['writing score']) / 2, 2)
df["Grade"] = df['Percentage'].apply(Grade)
x = df.drop('math score', axis = 1)
y = df['math score']

# A fixed random_state makes the split, and every metric computed from it,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)

# Fit the preprocessing on the training split only, then apply it to the test split.
X_train_prepared = pipeline.fit_transform(X_train)
X_test_prepared = pipeline.transform(X_test)

In [None]:
# Preview the frame with the rebuilt Percentage and Grade columns.
df.head()

##**SVM**

In [None]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# SVM: one classifier per kernel, trained on the prepared features.
# NOTE(review): SVC is a classifier and the target here is the numeric math score,
# so each distinct score acts as its own class and accuracy counts exact matches
# only — confirm this is the intended setup.

# kernel = linear
model_linear = SVC(kernel="linear").fit(X_train_prepared, y_train)
pred_test_linear = model_linear.predict(X_test_prepared)
pred_train_linear = model_linear.predict(X_train_prepared)

# kernel = sigmoid
model_sigmoid = SVC(kernel="sigmoid").fit(X_train_prepared, y_train)
pred_test_sigmoid = model_sigmoid.predict(X_test_prepared)
pred_train_sigmoid = model_sigmoid.predict(X_train_prepared)

# Test accuracy of both kernels, gathered in one table.
linear_test_accuracy = accuracy_score(y_test, pred_test_linear)
sigmoid_test_accuracy = accuracy_score(y_test, pred_test_sigmoid)
data = {
    "kernel": pd.Series(["linear", "sigmoid"]),
    "Test Accuracy": pd.Series([linear_test_accuracy, sigmoid_test_accuracy]),
}
table_acc = pd.DataFrame(data)
table_acc

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bar chart comparing the test accuracy of the two SVM kernels.
kernels = ["linear", "sigmoid"]
test_accuracy = [accuracy_score(y_test, predicted) for predicted in (pred_test_linear, pred_test_sigmoid)]

bar_width = 0.35               # width of each bar
r1 = np.arange(len(kernels))   # x position of each bar

plt.figure(figsize=(8, 6))
plt.bar(r1, test_accuracy, width=bar_width, color=['blue', 'green'],
        edgecolor='grey', label='Test Accuracy')

# Axis labels, with one tick per kernel.
plt.xlabel('Kernel', fontweight='bold')
plt.ylabel('Accuracy', fontweight='bold')
plt.xticks(list(range(len(kernels))), kernels)

# Write each accuracy just above its bar.
for bar_position, accuracy_value in enumerate(test_accuracy):
    plt.text(bar_position, accuracy_value + 0.01, str(round(accuracy_value, 2)),
             ha='center', va='bottom')

plt.title('Test Accuracy Comparison of SVM Models with Different Kernels')
plt.legend()

plt.tight_layout()
plt.show()


##**Random Forest**

In [None]:
!pip install optun

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Random Forest regressor for the math score, fitted on the full training split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train_prepared, y_train)

# 5-fold cross-validated RMSE on the training split. The scorer returns the
# negated MSE, so the sign is flipped before taking the square root.
kfold = KFold(n_splits=5)
negated_mse = cross_val_score(model, X_train_prepared, y_train,
                              cv=kfold, scoring="neg_mean_squared_error")
scores = -negated_mse
rmse_scores = np.sqrt(scores)

print(f"Mean: {rmse_scores.mean()}")
print(f"Standard deviation: {rmse_scores.std()}")

In [None]:
# Evaluate the fitted forest on the held-out test split.
y_pred = model.predict(X_test_prepared)
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works
# on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_square = r2_score(y_test, y_pred)

In [None]:
# Report the test-split RMSE and R-square, rounded to 3 decimals.
print(f'Root Mean Squared error: {round(rmse, 3)}')
print(f'R-square: {round(r_square, 3)}')

In [None]:
# Actual vs. predicted math scores on the test split.
sns.set_context('notebook', font_scale=1.3)
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(x=y_test, y=y_pred, color='#005b96')
scatter_axes.set_xlabel('Actual')
scatter_axes.set_ylabel('Predicted')
plt.show()

##**K-Means**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

In [None]:
# Integer-encode the categorical features used for clustering. The encoding is
# done on a copy, so `df` keeps its original string labels.
labelencoder = LabelEncoder()
train_df = df.copy()
for encoded_column in ["parental level of education", "test preparation course", "lunch"]:
    # The encoder is refitted for every column.
    train_df[encoded_column] = labelencoder.fit_transform(train_df[encoded_column])
train_df.head()

In [None]:
# Elbow method: fit K-Means for k = 2..24 and record the inertia (within-cluster
# sum of squared distances) for each k.
# Features are every column from the third one up to, but excluding, the last.
kmeans_dis = list()
for idx in range(2, 25):
    # random_state fixes the centroid initialisation so the curve is reproducible.
    kmeans = KMeans(init = "k-means++", n_clusters = idx, n_init = 20, random_state = 42)
    kmeans.fit(train_df.iloc[:, 2:-1])
    kmeans_dis.append(kmeans.inertia_)
plt.plot(list(range(2,25)), kmeans_dis, marker = "o")
plt.xlabel("Number of clusters")
plt.ylabel("Summation of distance")
plt.show()

We choose 8 as the elbow point and then assign every student to one of the 8 clusters.

In [None]:
# KMeans with the chosen k = 8.
# random_state makes the cluster assignment reproducible, so the rank-based
# discussion in the following cells refers to the same clusters on every run.
# n_init matches the elbow search instead of relying on the library default.
kmeans = KMeans(init = "k-means++", n_clusters = 8, n_init = 20, random_state = 42)
kmeans.fit(train_df.iloc[:, 2:-1])
kmeans_label = kmeans.labels_

# Attach the cluster id of every student to the original frame.
df["clusters"] = kmeans_label
df.head(10)

In [None]:
# Mean score per subject within each cluster. The subject columns are selected
# by name so the result does not depend on column positions.
class_df = df.groupby("clusters")[['math score', 'reading score', 'writing score']].mean()
class_df

In [None]:
# Grouped bar chart: mean math / reading / writing score of every cluster.
ind = np.arange(len(class_df))  # one group of bars per cluster
width = 0.25                    # three bars per group fill 0.75 of each unit slot
fig, ax = plt.subplots()

# Bars are shifted by one full bar width so the three subjects sit next to each
# other instead of overlapping.
rects1 = ax.bar(ind - width, class_df['math score'], width, label='Math')
rects2 = ax.bar(ind, class_df['reading score'], width, label='Reading')
rects3 = ax.bar(ind + width, class_df['writing score'], width, label='Writing')

ax.set_xlabel('Classiffication')
ax.set_ylabel('Scores')
ax.set_xticks(ind)
ax.legend()
plt.show()

It is clear that all subjects follow the same trend across the clusters, so we use the average of all subjects to rank the clusters.

In [None]:
# Rank the clusters by the mean of their three subject averages, best first.
cluster_subject_sum = class_df['math score'] + class_df['reading score'] + class_df['writing score']
class_df["total_ave_score"] = cluster_subject_sum / 3
rank = class_df["total_ave_score"].sort_values(ascending=False)
rank

For the top 5 ranks, the average score is a passing one. Rank 0 is the best cluster, Rank 1 is the second best, and so on.

From here on, we look for relationships between student performance and the features. Let's plot pie charts to see whether the parents' education level affects performance.

In [None]:
def plot_pie_chart(column):
    """Draw one pie per cluster, best rank first, showing the share of each `column` value.

    Reads the notebook-level `df` (which must contain a "clusters" column) and
    `rank` (a Series indexed by cluster id, sorted best cluster first).

    Parameters
    ----------
    column : str
        Categorical column of `df` whose distribution is plotted.
    """
    color = ["orange","lightblue","green","yellow","red","pink","brown","gray"]
    # Assign one color per category up front so that a category keeps the same
    # color in every pie, even when it is missing from some cluster.
    categories = sorted(df[column].dropna().unique())
    color_of = {category: color[i % len(color)] for i, category in enumerate(categories)}

    n_cols = 3
    n_rows = max(1, -(-len(rank.index) // n_cols))  # ceiling division

    # plt.figure rather than plt.subplots: the pies get their own axes from
    # plt.subplot below, so no extra full-figure axes is created.
    plt.figure(figsize=(20,16))
    for idx, cluster_id in enumerate(rank.index):
        plt.subplot(n_rows, n_cols, idx+1)
        members = df[df["clusters"]==cluster_id]
        # Sort by category so the wedges appear in the same order in every pie.
        shares = members[column].value_counts().sort_index()
        label = shares.index
        value = shares.values
        plt.pie(value, labels = label, autopct = "%1.1f%%",
                startangle=90, radius = 4, colors = [color_of[name] for name in label])
        plt.axis("equal")
        plt.title("Rank "+str(idx))
    plt.show()
plot_pie_chart("parental level of education")

Let's define a high level of education: parents holding a bachelor's or master's degree are considered highly educated, so we focus on these two categories.

As the pie charts above show, the ratio of highly educated parents is easy to read off. For rank 0, the ratio is around 32%. There is no real difference between rank 1 and rank 3, where the ratio is around 15–17%. Finally, the ratio is only 8% in rank 7.

We calculated the average score of each rank earlier, so we can say that the parents' education affects the score, but not strongly, because 70–80% of the parents still do not have a high level of education.

In [None]:
def plot_bar_chart(column, normalize=False):
    """Grouped bar chart of how the `column` values are distributed within each ranked cluster.

    Reads the notebook-level `df` (which must contain a "clusters" column) and
    `rank` (a Series indexed by cluster id, sorted best cluster first).

    Parameters
    ----------
    column : str
        Categorical column of `df` to break down.
    normalize : bool, default False
        When False the bars show the number of students; when True they show
        each category's percentage share within its cluster.
    """
    # Rows: clusters in rank order; columns: category values in sorted order.
    # crosstab fills absent (cluster, category) pairs with 0, so every category
    # has exactly one bar per rank.
    counts = pd.crosstab(df["clusters"], df[column]).reindex(rank.index, fill_value=0)
    if normalize:
        # Empty clusters would divide by zero; dividing by 1 keeps their bars at 0.
        cluster_sizes = counts.sum(axis=1).replace(0, 1)
        counts = counts.div(cluster_sizes, axis=0) * 100

    ind = np.arange(len(counts))
    n_categories = max(len(counts.columns), 1)
    width = 0.8 / n_categories  # the bars of one rank share 80% of its slot

    fig, ax = plt.subplots(figsize=(8,6))
    for i, category in enumerate(counts.columns):
        # Offsets are symmetric around the tick, so the group is centered and
        # bars never overlap.
        offset = (i - (n_categories - 1) / 2) * width
        ax.bar(ind + offset, counts[category].values, width, label=category)

    ax.set_xlabel('Rank')
    ax.set_ylabel('% of students' if normalize else '# of students')
    ax.set_title(("Percentage of " if normalize else "Number of students by ") + column)
    ax.set_xticks(ind)
    ax.legend()
    plt.show()

plot_bar_chart("test preparation course")

Over 50% of the students in rank 0 completed the test preparation course, whereas about 70–80% of the students in rank 7 had not finished it. This suggests that the preparation course can help students get better scores.

In [None]:
# Breakdown of the `lunch` column within each ranked cluster.
plot_bar_chart("lunch")

Students who had lunch before the test got better scores. That is, it is hard to perform well without eating.



In [None]:
# Breakdown of the `gender` column within each ranked cluster.
plot_bar_chart("gender")

It is hard to say that male students perform better than female students.

A few conclusions follow:

- Parents' education level may affect student performance, but it is not the most important factor.
- Finishing the test preparation course is beneficial.
- Having lunch is important to students, and it is also the most significant factor.
- Gender has no correlation with the score.

In summary, if students want to perform well, they should get enough nutrition and make an effort to prepare for the test.

How effective is the test preparation course?
It is very effective: the students who took the course got higher scores on all the tests than those who did not take it.

Which major factors contribute to test outcomes?
The biggest factor contributing to test outcomes is the test preparation course, followed by lunch and lastly gender. Race/ethnicity also has some influence on the test scores, but it is very small.

What would be the best way to improve student scores on each test?
Take the test preparation course. Even though other factors have a major effect on the test outcomes, they are not choices that students can change, for example lunch, gender, and race/ethnicity.

##**Test Accuracy**

In [None]:
# K-Means is not scored here: it is an unsupervised model whose cluster ids are
# arbitrary labels, so comparing them with the math score is not meaningful.

# Model 2: RandomForestRegressor (`model`, fitted on the prepared features).
# It predicts a continuous math score, so it is evaluated with regression
# metrics; accuracy_score is meant for discrete class labels.
y_pred_2 = model.predict(X_test_prepared)
# `accuracy_2` keeps its name for compatibility; for a regressor, score()
# returns the R-square on the given data.
accuracy_2 = model.score(X_test_prepared, y_test)
rmse_2 = np.sqrt(np.mean((np.asarray(y_test) - y_pred_2) ** 2))
print(f'R-square: {round(accuracy_2, 3)}')
print(f'Root Mean Squared error: {round(rmse_2, 3)}')


# Plot the metrics for Model 2
metrics = ['R-square']
values = [accuracy_2]

plt.bar(metrics, values)
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.title('Model 2 - RandomForestRegressor')
plt.show()

# TODO: add a "Model 3" evaluation once a third fitted model exists. predict()
# must be called on the fitted instance and with X_test_prepared, not on the
# RandomForestRegressor class with the raw X_test.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Summary of both model families. SVM accuracy (a 0-1 fraction) and Random
# Forest RMSE (in score points) are different metrics on different scales, so
# each one gets its own panel instead of sharing a single y-axis.

# SVM test accuracies come from the `table_acc` DataFrame.
svm_kernels = table_acc["kernel"].tolist()
svm_accuracy = table_acc["Test Accuracy"].tolist()

# Random Forest: mean and spread of the cross-validated RMSE.
# The variable names are kept for compatibility although they hold RMSE values.
mean_rf_accuracy = rmse_scores.mean()
std_dev_rf_accuracy = rmse_scores.std()

fig, (ax_svm, ax_rf) = plt.subplots(1, 2, figsize=(12, 6))

ax_svm.bar(svm_kernels, svm_accuracy, color=['blue', 'green'])
ax_svm.set_xlabel("SVM kernel")
ax_svm.set_ylabel("Test Accuracy")
ax_svm.set_title("SVM: test accuracy")

# The error bar shows one standard deviation across the CV folds.
ax_rf.bar(['Random Forest'], [mean_rf_accuracy], yerr=[std_dev_rf_accuracy],
          color='red', capsize=7)
ax_rf.set_xlabel("Model")
ax_rf.set_ylabel("Cross-validated RMSE")
ax_rf.set_title("Random Forest: RMSE (mean +/- std over folds)")

fig.suptitle("Comparison of Test Accuracy/RMSE")
fig.tight_layout()
plt.show()