In [62]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

data = pd.read_csv('Housing.csv')
df = pd.DataFrame(data)

#separate X and y
X = df.drop(columns = ['price'])
y = df['price']

#get dummies
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64','float64']).columns.tolist()

df_encoded = pd.get_dummies(X,columns=categorical_columns)

X = pd.concat([df_encoded,df[numerical_columns]],axis=1)

#scale
scaler = StandardScaler()

X = scaler.fit_transform(X)


#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#define a model
model  = LinearRegression()

model.fit(X_train,y_train)

#get mean squared error
y_train_pred = model.predict(X_train)

y_test_pred = model.predict(X_test)

cv = -cross_val_score(model,X,y,scoring='neg_mean_squared_error', cv=5)

mse_train = mean_squared_error(y_train,y_train_pred)

mse_cv = np.mean(cv)

mse_test = mean_squared_error(y_test_pred,y_test)


#calculate R squared

test_r_squared = model.score(X_test,y_test)
train_r_squared = model.score(X_train,y_train)

#define a baseline function
baseline = DummyRegressor()

baseline.fit(X_train,y_train)

baseline_train_pred = baseline.predict(X_train)

baseline_test_pred = baseline.predict(X_test)

baseline_cv = -cross_val_score(baseline,X,y,scoring='neg_mean_squared_error', cv=5)

baseline_mse_train = mean_squared_error(y_train,baseline_train_pred)

baseline_mse_cv = np.mean(baseline_cv)

baseline_mse_test = mean_squared_error(baseline_test_pred,y_test)



print('mse test = ',mse_test)
print('mse train = ',mse_train)
print('mse cv = ',mse_cv)
print('---------------------')
print('baseline_mse_test = ',baseline_mse_test)
print('baseline_mse_train = ',baseline_mse_train)
print('baseline_mse_cv = ',baseline_mse_cv)
print('---------------------')
print('Ratio mse test = ',baseline_mse_test / mse_test)
print('Ratio mse train = ',baseline_mse_train / mse_train)
print('Ratio mse cv = ',baseline_mse_cv / mse_cv)
print('---------------------')
print('R Squared Train = ',train_r_squared)
print('R Squared Test = ',test_r_squared)

mse test =  1754318687330.6677
mse train =  968358188440.7242
mse cv =  2156684121886.472
---------------------
baseline_mse_test =  5145176852226.263
baseline_mse_train =  3083392377710.1357
baseline_mse_cv =  5159851341291.697
---------------------
Ratio mse test =  2.9328632758595585
Ratio mse train =  3.184144477236357
Ratio mse cv =  2.39249284998599
---------------------
R Squared Train =  0.6859438988560158
R Squared Test =  0.6529242642153177
