In [None]:
#set up modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Load the student performance dataset and show a quick preview.
# NOTE(review): this is an absolute path on one specific machine; anyone else
# running the notebook has to point it at their own copy of the CSV.
csv_path = r"C:\Users\ivan3\Documents\Visual Studio 2017\Projects\python\basic\practice project\student_performance\StudentsPerformance.csv"
df = pd.read_csv(csv_path)
print(df.head())
n_rows, n_cols = df.shape
print(f"Shape of DataFrame: total rows={n_rows} and total columns={n_cols}.")

In [None]:
#The distribution of ["math score", "reading score", "writing score"]
plt.rcParams['figure.facecolor'] = "#e6ecff"
plt.rcParams['axes.facecolor'] = "#e6ecff"
# Row-wise mean of the three subject scores; later cells read this column.
df["average score"]=df[["math score", "reading score", "writing score"]].mean(axis=1)
fig,axes = plt.subplots(2,2,figsize=(10,8))
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(data=df["math score"], color="lightcoral",fill=True,ax=axes[0,0]).set(title="math score",xlabel=None)
sns.kdeplot(data=df["reading score"], color="lightgreen",fill=True, ax=axes[0,1]).set(title="reading score",xlabel=None)
sns.kdeplot(data=df["writing score"], color="orange",fill=True, ax=axes[1,0]).set(title="writing score",xlabel=None)
# Wide-form input: seaborn draws one curve per column and builds the legend itself.
sns.kdeplot(data=df[["math score","reading score","writing score","average score"]], ax=axes[1,1])
# Only reposition seaborn's legend. Rebuilding it with plt.legend(labels=...)
# pairs the labels with the lines in draw order, which seaborn reverses for
# hue levels, so the entries could end up on the wrong curves.
sns.move_legend(axes[1,1], "upper left")
plt.show()

Observation: All score distributions are approximately normal.

In [None]:
#The relationship between different score
#["math score","reading score","writing score","average score"]
fig,axes = plt.subplots(3,1,figsize=(10,10))
# (x column, y column, line colour) for each stacked panel, top to bottom.
score_pairs = [
    ("math score", "reading score", "g"),
    ("math score", "writing score", "b"),
    ("writing score", "reading score", "purple"),
]
for panel, (x_col, y_col, colour) in zip(axes, score_pairs):
    sns.regplot(x=x_col, y=y_col, data=df, color=colour, ax=panel)
plt.show()

Observation: All scores have a positive correlation with each other.

In [None]:
#The relationship between gender and score
# Figure 1: mean score per gender, one bar chart per score column.
plt.figure(figsize=(15,8))
bar_scores = ["math score", "writing score", "reading score", "average score"]
for position, score in enumerate(bar_scores, start=1):
    ax = plt.subplot(1, 4, position)
    sns.barplot(x="gender", y=score, data=df, palette="seismic", ci=None, ax=ax)
    ax.set_title(f'{score}')
    # Write the bar height on top of each bar.
    ax.bar_label(ax.containers[0], fmt='%.2f')
plt.show()


# Figure 2: score spread per gender as a 2x2 grid of boxplots.
fig,axes = plt.subplots(2,2,figsize=(10,8))
ax = axes.flatten()
box_scores = ["math score", "reading score", "writing score", "average score"]
for panel, score in zip(ax, box_scores):
    sns.boxplot(x="gender", y=score, data=df, ax=panel)


Observation: Boys perform better than girls in math but worse than girls in both the reading and writing tests. Overall, girls get a slightly higher average score.

In [None]:
#The relationship between race/ethnicity and score
# Mean of every score column per race/ethnicity group.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is no longer supported by current pandas.
race=df.groupby("race/ethnicity")[['math score', 'reading score', 'writing score',"average score"]].mean()

sns.set_palette("coolwarm")
ax=race.plot.barh()
# Anchor the legend just outside the right edge so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

Observation: On average, group E gets the best results and group A gets the worst results on all three tests.

In [None]:
#The relationship between parental level of education and score
# Mean of every score column per parental education level.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is no longer supported by current pandas.
education=df.groupby("parental level of education")[['math score', 'reading score', 'writing score',"average score"]].mean()

sns.set_palette("coolwarm")
ax=education.plot.barh()
# Anchor the legend just outside the right edge so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

Observation: Students whose parents have a bachelor's or master's degree seem to have better average results.

In [None]:
#The relationship between having lunch and score
# Mean of every score column per lunch type.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is no longer supported by current pandas.
lunch=df.groupby("lunch")[['math score', 'reading score', 'writing score',"average score"]].mean()
print(lunch)
# set_palette() returns None, so its result is not bound to `ax`.
sns.set_palette("coolwarm")
ax=lunch.plot.barh()
# Anchor the legend just outside the right edge so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2)

# Score spread per lunch type as a 2x2 grid of boxplots.
fig,axes = plt.subplots(2,2,figsize=(10,8))
ax=axes.flatten()
for i,score in enumerate(["math score","reading score","writing score","average score"]):
    sns.boxplot(x="lunch", y=score, data=df, ax=ax[i])

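Observation: Students' scores clearly differ between the lunch groups. Note that this is only a correlation: lunch type may reflect other factors (for example, household income), so the data alone does not show that keeping a balanced diet will improve your academic results.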

In [None]:
#The relationship between test preparation course and score
# Figure 1: mean score per course status, one bar chart per score column.
plt.figure(figsize=(15,8))
bar_scores = ["math score", "writing score", "reading score", "average score"]
for position, score in enumerate(bar_scores, start=1):
    ax = plt.subplot(1, 4, position)
    sns.barplot(x="test preparation course", y=score, data=df, palette="seismic", ci=None, ax=ax)
    ax.set_title(f'{score}')
    # Write the bar height on top of each bar.
    ax.bar_label(ax.containers[0], fmt='%.2f')
plt.show()

# Figure 2: score spread per course status as a 2x2 grid of boxplots.
fig, grid = plt.subplots(2,2,figsize=(10,8))
ax = grid.flatten()
sns.set_palette("vlag")
box_scores = ['math score', 'reading score', 'writing score', "average score"]
for panel, score in zip(ax, box_scores):
    sns.boxplot(x="test preparation course", y=score, data=df, ax=panel)
plt.show()



Observation: On average, students who completed the test preparation course performed better on the test.


In [None]:
#finding outlier
# One boxplot per score column; points beyond the whiskers are outlier candidates.
fig, grid = plt.subplots(2,2,figsize=(10,8))
ax = grid.flatten()
score_columns = ['math score', 'reading score', 'writing score', "average score"]
for panel, score in zip(ax, score_columns):
    sns.boxplot(y=score, data=df, ax=panel)
plt.show()

In [None]:
#remove outlier
def outlier(df, column):
    """Drop rows whose ``column`` value lies outside the 1.5 * IQR fences.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Name of a numeric column of ``df``.

    Returns:
        A new DataFrame with only the rows where
        ``lower <= df[column] <= upper``. A value that sits exactly on a
        fence is not an outlier and is kept.

    Side effects:
        Prints the IQR, both fences, and the shapes of the below-fence and
        above-fence subsets. Also publishes the fences as the module-level
        names ``lower`` and ``upper``.
    """
    # Kept so that any code reading the module-level fences keeps working.
    global lower, upper
    q1, q3 = np.quantile(df[column], [0.25, 0.75])
    iqr = q3 - q1
    cut_off = iqr * 1.5
    lower, upper = q1 - cut_off, q3 + cut_off
    print("iqr :", iqr,"\n","lower fence :", lower ,"\n", "upper fence :", upper)

    df_lower = df[df[column] < lower]
    df_upper = df[df[column] > upper]
    print (df_lower.shape,df_upper.shape)

    # Inclusive bounds, so the kept rows are exactly the complement of the two
    # reported outlier subsets (the previous strict comparison silently dropped
    # values equal to a fence). Missing values compare False and are dropped.
    return df[df[column].between(lower, upper)]

# Filter the frame once per score column; each pass works on the rows that
# survived the previous one.
score_columns = ['math score', 'reading score', 'writing score', "average score"]
for score in score_columns:
    df = outlier(df, score)

print(df.shape)



Observation: The number of rows in the DataFrame decreases from 1054 to 1037, which means we removed 17 outliers.

In [None]:
#finding all columns which contain categorical data
# Boolean Series indexed by column name: True where the dtype is `object`.
is_object = df.dtypes == "object"
object_col = [name for name, flagged in is_object.items() if flagged]
print("object_columns :", object_col)

In [None]:
#convert categorical variable to numeric variable
#using label encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Encode every object column from the original frame. The single encoder is
# re-fitted for each column, so afterwards it only holds the last column's classes.
encoded_columns = {name: label_encoder.fit_transform(df[name]) for name in object_col}

#create a new DataFrame
# assign() works on a copy, so `df` keeps its original string values.
df_new = df.assign(**encoded_columns)


In [None]:
#the correlation between each columns
# Pairwise correlation of every column in the label-encoded frame.
correlation = df_new.corr()
corr_fig, corr_ax = plt.subplots(figsize=(12,10))
corr_ax.set_title("correlation")
sns.heatmap(data=correlation, annot=True, cmap="Blues", ax=corr_ax)
plt.show()

In [None]:
#student performance prediction
# Target: the average of the three subject scores.
y=df_new["average score"]
# info() writes its summary to stdout and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line.
y.info()

# Features: everything except the target and the scores it is computed from,
# which would leak the answer into the model.
X=df_new.drop(["average score", "math score", "writing score", "reading score"], axis=1)
X.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Hold out 20% of the rows for validation; the split is seeded.
train_X, val_X, train_y, val_y= train_test_split(X,y,train_size=0.8, test_size=0.2, random_state=42)

# Seed the forest too: without random_state the fitted trees, the reported
# error and the feature importances change on every run.
model=RandomForestRegressor(n_estimators=500, max_depth=6, random_state=42)
model.fit(train_X, train_y)
predict=model.predict(val_X)

result=pd.DataFrame({"actual":val_y, "predict":predict})
print("The mean_absolute_error of prediction :", mean_absolute_error(val_y,predict))
print(result.sort_index(axis=0))

# x/y are passed by keyword: newer seaborn releases reject positional data vectors.
scatter_ax = sns.scatterplot(x=val_y, y=predict)
scatter_ax.set(xlabel="actual average score", ylabel="predicted average score")
plt.show()

In [None]:
#feature importance
# Pair each training column with the importance the fitted forest gave it,
# most important first.
feature_df = (
    pd.DataFrame({
        "feature_importance": np.array(model.feature_importances_),
        "feature_name": np.array(train_X.columns),
    })
    .sort_values(by=["feature_importance"], ascending=False)
)
sns.barplot(data=feature_df, x="feature_importance", y="feature_name")
plt.show()

Observation: From the graph, we can see that having a balanced diet and attending the test preparation course are the two biggest factors in improving our academic results.

Further study:
1. Add more features, such as study hours per week, past test scores, interest in the subjects, teaching method, ...
2. Increase the sample size to get better predictions.