In [11]:
import pandas as pd
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the raw student score table. The file carries an .xls extension but is
# read here as delimited text through pandas' CSV reader.
source_path = "StudentScore.xls"
data = pd.read_csv(source_path)
# Trailing expression: lets the notebook render the frame as the cell output.
data

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [5]:
# Build an exploratory profiling report over the full frame and write it next
# to the notebook as a standalone HTML file.
profile = ProfileReport(
    data,
    title="Student Score report",
    explorative=True,
)
report_path = "student_score_statistics.html"
profile.to_file(report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:00<?, ?it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Separate the regression target from the remaining columns:
# `y` is the target column, `x` is every other column of `data`.
target = "writing score"
y = data[target]
x = data.drop(columns=target)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Numeric features: fill missing values with the column median, then
# standardise.
num_tran = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

# Nominal feature: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" makes a category that was not present
# when the encoder was fitted encode as an all-zero row instead of raising an
# error at predict time.
nom_tran = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Category lists for the ordinal encoder. Missing values are dropped before
# collecting the unique labels: the imputer step below fills gaps before the
# encoder runs, so NaN must not appear as a category of its own.
gender_values = data["gender"].dropna().unique()
# Education levels are listed explicitly so the encoded integers follow the
# order written here (lowest level first).
edu_values = ['some high school', 'high school', 'some college', "associate's degree", "bachelor's degree", "master's degree"]
lunch_values = data["lunch"].dropna().unique()
test_values = data["test preparation course"].dropna().unique()


# Ordinal features: fill missing values with the most frequent category, then
# map each label to its position in the corresponding list above. The list
# order must match the column order passed to the ColumnTransformer below.
ord_tran = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder(categories=[gender_values, edu_values, lunch_values, test_values]))
])

# Route each group of columns through its transformer. Columns not named here
# are not passed on (ColumnTransformer's default `remainder` is "drop").
preprocessor = ColumnTransformer(transformers=[
    ("num", num_tran, ["reading score", "math score"]),
    ("nom", nom_tran, ["race/ethnicity"]),
    ("ord", ord_tran, ["gender", "parental level of education", "lunch", "test preparation course"])
])

In [15]:
# End-to-end regressor: preprocessing followed by ordinary least squares.
# NOTE(review): the name `re` is the same as Python's standard-library regex
# module and would shadow it if that module were imported in this notebook.
re = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

# Fit on the training split, then predict the held-out rows
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
y_predict = re.fit(x_train, y_train).predict(x_test)

In [16]:
# Score the held-out predictions with three regression metrics.
MAE, MSE, R2 = (
    metric(y_test, y_predict)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print one "<name>: <value>" line per metric, in the order MAE, MSE, R2.
for metric_name, metric_value in (("MAE:", MAE), ("MSE:", MSE), ("R2:", R2)):
    print(metric_name, metric_value)

MAE: 3.2039447691582152
MSE: 14.980822041816763
R2: 0.9378432907399291
