In [32]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the exam results. read_csv already returns a DataFrame, so it does not
# need to be wrapped in pd.DataFrame again; both names are kept so that code
# referring to either `data` or `df` keeps working.
data = pd.read_csv('exams.csv')
df = data

# identify X and y
# Author's note: the first attempt used every column except the target as a
# feature. The author judged that model to be overfitting, so the 'lunch'
# column is dropped as well, which they report performed better.
X = df.drop(columns=['writing score', 'lunch'])
y = df['writing score']

# split numeric and non numeric columns
non_numeric = X.select_dtypes(include=['object']).columns.tolist()
numeric = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# encode data
# get_dummies(columns=...) one-hot encodes only the listed columns and passes
# every other column through unchanged, so the numeric features are already
# part of the result. Concatenating df[numeric] on top of it would add each
# numeric column a second time (duplicate, perfectly collinear features).
df_encoded = pd.get_dummies(X, columns=non_numeric)
X = df_encoded

# split data into train and test
# The split happens BEFORE scaling so that the held-out rows never influence
# the scaler's mean/variance (fitting the scaler on the full data set leaks
# test-set information into training).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# scale data
# Fit on the training rows only, then apply the same transform to the test
# rows. X itself is left unscaled; call scaler.transform(X) if a scaled copy
# of the full feature matrix is needed.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# define a model and fit
model = LinearRegression()
model.fit(X_train, y_train)

# get predict values for train and test sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# define cross validation
# A pipeline is cross-validated on the unscaled features so the scaler is
# re-fit inside every training fold instead of seeing the validation fold.
# The scorer returns negated MSE, hence the leading minus sign.
cv_model = make_pipeline(StandardScaler(), LinearRegression())
cv = -cross_val_score(cv_model, X, y, scoring='neg_mean_squared_error', cv=5)

# calculate mean squared errors
mse_train = mean_squared_error(y_train, y_train_pred)
mse_cv = np.mean(cv)  # average over the cross-validation folds
mse_test = mean_squared_error(y_test, y_test_pred)

# print the results
# Labels fixed: two of them were misspelled 'mes' instead of 'mse'.
print('mse train= ', mse_train)
print('mse cv= ', mse_cv)
print('mse test= ', mse_test)

mes train=  13.263733207582323
mes cv=  13.323925108045097
mse test=  11.849731567308305
