In [40]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [41]:
# load data
data = pd.read_csv('StudentScore.xls')

In [42]:
# Split data
x = data.drop('math score', axis = 1)
y = data[['math score']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 5)

In [43]:
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])

education_order = ['some high school', 'high school', 'some college',  "associate's degree",
                    "bachelor's degree", "master's degree" ]
gender_order = data['gender'].unique()
lunch_order = data['lunch'].unique()
test_order = data['test preparation course'].unique()


odr_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OrdinalEncoder(categories=[education_order, gender_order,lunch_order, test_order ]))
])

one_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(transformers=[

    ('num', num_transform, ['reading score', 'writing score']),
    ('ord', odr_transform, ['parental level of education', 'gender', 'lunch', 'test preparation course']),
    ('one', one_transform, ['race/ethnicity'])
])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)


In [44]:
LR = LinearRegression()
LR.fit(x_train, y_train)

In [45]:
y_pred = LR.predict(x_test)

In [46]:
print(f'r2_score: {r2_score(y_test, y_pred)}')
print(f'mean_squared_error: {mean_squared_error(y_test, y_pred)}')
print(f'mean_absolute_error: {mean_absolute_error(y_test, y_pred)}')

r2_score: 0.8754464316990902
mean_squared_error: 29.799080010644595
mean_absolute_error: 4.2976805706587085


In [47]:
y_test.shape, y_pred.shape

((200, 1), (200, 1))

In [50]:
y_pred = y_pred.ravel()  # Or use y_pred = y_pred.flatten()
y_test = y_pred.ravel()
# Create the DataFrame
data_output = {'True Value': y_test, 'Predicted Value': y_pred}
df = pd.DataFrame(data_output)

# Display the DataFrame
print(df)

     True Value  Predicted Value
0     56.930568        56.930568
1     75.090116        75.090116
2     64.743399        64.743399
3     52.190083        52.190083
4     59.683042        59.683042
..          ...              ...
195   48.638293        48.638293
196   68.011804        68.011804
197   32.848283        32.848283
198   58.340271        58.340271
199   87.366989        87.366989

[200 rows x 2 columns]


In [None]:
y_pred

In [None]:
y_test