In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
# Load the California housing dataset that ships with scikit-learn.
california = fetch_california_housing()
columns = california.feature_names
X, y = california.data, california.target

# Assemble a single DataFrame: the feature columns plus the target stored as 'Price'.
df = pd.DataFrame(X, columns=columns)
df['Price'] = y
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# Count the missing entries in every column of df.
missing_per_column = df.isnull().sum()
missing_per_column


MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Price         0
dtype: int64

In [4]:
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_rows = df.duplicated()
duplicate_rows.sum()



0

In [5]:
# Separate the target column ('Price') from the predictor columns.
y = df['Price']
X = df.drop(columns='Price')

# Show the shape of the feature matrix first, then of the target vector.
for part in (X, y):
    print(part.shape)

(20640, 8)
(20640,)


In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the shape of each of the four pieces, in the order they were returned.
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)


(16512, 8)
(4128, 8)
(16512,)
(4128,)


### Regression Models

Linear Regression

In [7]:
# Linear regression baseline: fit on the training split, score on the test split,
# then plot predicted against actual prices.

#instantiate and fit the model
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)

#make predictions
y_pred = LinReg.predict(X_test)

#evaluate the model
mse_linear = mean_squared_error(y_test, y_pred)
r2_linear = r2_score(y_test, y_pred)
# RMSE is the square root of the MSE computed above; no need to score the predictions twice.
rmse_linear = np.sqrt(mse_linear)

print('Mean Squared Error:', mse_linear)
print('R2 Score:', r2_linear)
print('Root Mean Squared Error:', rmse_linear)

#plot the predictions
fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Prices', 'y': 'Predicted Prices'},
                 title='Actual Prices vs Predicted Prices', color_discrete_sequence=['blue', 'red'])

# Reference diagonal (y = x): points on this line are predicted exactly.
# Two endpoints are enough to draw it; passing every unsorted test point would
# trace the same segment thousands of times. The trace is named so the legend
# entry is readable.
price_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=price_range, y=price_range, mode='lines',
                         name='Perfect prediction (y = x)',
                         line=dict(color='red', dash='dash')))

fig.update_layout(showlegend=True)
fig.show()



Mean Squared Error: 0.5558915986952441
R2 Score: 0.575787706032451
Root Mean Squared Error: 0.7455813830127762


Lasso Model

In [8]:
# Lasso (L1-regularised linear regression, alpha=0.1): fit on the training split,
# score on the test split, then plot predicted against actual prices.

#instantiate and fit the lasso model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

#make predictions
y_pred_lasso = lasso.predict(X_test)

#evaluate the model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
# RMSE is the square root of the MSE computed above; no need to score the predictions twice.
rmse_lasso = np.sqrt(mse_lasso)

print('Mean Squared Error:', mse_lasso)
print('R2 Score:', r2_lasso)
print('Root Mean Squared Error:', rmse_lasso)

#plot the predictions
fig = px.scatter(x=y_test, y=y_pred_lasso, labels={'x': 'Actual Prices', 'y': 'Predicted Prices'},
                 title='Actual Prices vs Predicted Prices', color_discrete_sequence=['blue', 'red'])

# Reference diagonal (y = x): points on this line are predicted exactly.
# Without it the scatter has no visual baseline to judge the predictions against.
price_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=price_range, y=price_range, mode='lines',
                         name='Perfect prediction (y = x)',
                         line=dict(color='red', dash='dash')))

fig.update_layout(showlegend=True)
fig.show()




Mean Squared Error: 0.6135115198058131
R2 Score: 0.5318167610318159
Root Mean Squared Error: 0.7832697618354822


Ridge Model

In [9]:
# Ridge (L2-regularised linear regression, alpha=0.1): fit on the training split,
# score on the test split, then plot predicted against actual prices.
# Ridge is already imported in the notebook's import cell, so it is not re-imported here.

#instantiate and fit the ridge model
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)

#make predictions
y_pred_ridge = ridge.predict(X_test)

#evaluate the model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
# RMSE is the square root of the MSE computed above; no need to score the predictions twice.
rmse_ridge = np.sqrt(mse_ridge)

print('Mean Squared Error:', mse_ridge)
print('R2 Score:', r2_ridge)
print('Root Mean Squared Error:', rmse_ridge)

#plot the predictions
fig = px.scatter(x=y_test, y=y_pred_ridge, labels={'x': 'Actual Prices', 'y': 'Predicted Prices'},
                 title='Actual Prices vs Predicted Prices', color_discrete_sequence=['blue', 'red'])

# Reference diagonal (y = x): points on this line are predicted exactly.
# Without it the scatter has no visual baseline to judge the predictions against.
price_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=price_range, y=price_range, mode='lines',
                         name='Perfect prediction (y = x)',
                         line=dict(color='red', dash='dash')))

fig.update_layout(showlegend=True)
fig.show()


Mean Squared Error: 0.55588275431138
R2 Score: 0.5757944553633934
Root Mean Squared Error: 0.7455754517896763


Decision Tree Regression

In [10]:
# Decision tree regressor (seeded for repeatability): fit on the training split,
# score on the test split, then plot predicted against actual prices.

#instantiate and fit the decision tree model
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

#make predictions
y_pred_dt = dt.predict(X_test)

#evaluate the model
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)
# RMSE is the square root of the MSE computed above; no need to score the predictions twice.
rmse_dt = np.sqrt(mse_dt)

print('Mean Squared Error:', mse_dt)
print('R2 Score:', r2_dt)
print('Root Mean Squared Error:', rmse_dt)

#plot the predictions
fig = px.scatter(x=y_test, y=y_pred_dt, labels={'x': 'Actual Prices', 'y': 'Predicted Prices'},
                 title='Actual Prices vs Predicted Prices', color_discrete_sequence=['blue', 'red'])

# Reference diagonal (y = x): points on this line are predicted exactly.
# Without it the scatter has no visual baseline to judge the predictions against.
price_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=price_range, y=price_range, mode='lines',
                         name='Perfect prediction (y = x)',
                         line=dict(color='red', dash='dash')))

fig.update_layout(showlegend=True)
fig.show()


Mean Squared Error: 0.495235205629094
R2 Score: 0.622075845135081
Root Mean Squared Error: 0.7037294974840077


Random Forest

In [11]:
# Random forest regressor (100 trees, seeded for repeatability): fit on the training
# split, score on the test split, then plot predicted against actual prices.
# RandomForestRegressor and plotly.graph_objects are already imported in the
# notebook's import cell, so they are not re-imported here.

#instantiate and fit the random forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

#make predictions
y_pred_rf = rf.predict(X_test)

#evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
# RMSE is the square root of the MSE computed above; no need to score the predictions twice.
rmse_rf = np.sqrt(mse_rf)

print('Mean Squared Error:', mse_rf)
print('R2 Score:', r2_rf)
print('Root Mean Squared Error:', rmse_rf)

#plot the predictions
fig = px.scatter(x=y_test, y=y_pred_rf, labels={'x': 'Actual Prices', 'y': 'Predicted Prices'},
                 title='Actual Prices vs Predicted Prices', color_discrete_sequence=['blue', 'red'])
fig.update_layout(showlegend=True)

# Reference diagonal (y = x): points on this line are predicted exactly.
# Two endpoints are enough to draw it; passing every unsorted test point would
# trace the same segment thousands of times. The trace is named so the legend
# entry is readable.
price_range = [y_test.min(), y_test.max()]
fig.add_trace(go.Scatter(x=price_range, y=price_range, mode='lines',
                         name='Perfect prediction (y = x)',
                         line=dict(color='red', dash='dash')))

fig.show()







Mean Squared Error: 0.2553684927247781
R2 Score: 0.8051230593157366
Root Mean Squared Error: 0.5053399773665033


If the points in the plot are close to a diagonal line from the bottom-left to the top-right of the plot, it suggests that the predicted values are close to the actual values, indicating a good model performance. If the points are scattered far from this line, it indicates that the model’s predictions are often far from the actual values.

**Mean Squared Error (MSE)**

This is the average of the squared differences between the actual and predicted values. Your MSE is 0.2553684927247781 (the Random Forest result above), which means that the average squared difference between your model’s predictions and the actual values is approximately 0.255. The closer this value is to 0, the better.

**R2 Score**

Also known as the coefficient of determination, this metric quantifies the proportion of the variance in the dependent variable that is predictable from the independent variables. Your R2 score is 0.8051230593157366, which means approximately 80.51% of the variance in your target variable can be explained by your features. The best possible score is 1.0, indicating a perfect fit.

**Root Mean Squared Error (RMSE)**

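This is the square root of the MSE. It can be interpreted as the standard deviation of the residuals (prediction errors). Your RMSE is 0.5053399773665033, which means a typical prediction is off by roughly 0.505 units of the target (about $50,500, since this dataset expresses house values in units of $100,000). Because the errors are squared before averaging, large misses weigh more heavily than they would in a plain average error. The closer this value is to 0, the better.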

Measuring the error

In [12]:
# Collect each model's test-set metrics into one table, one row per model.
metric_columns = ['Model', 'MSE', 'R2 Score', 'RMSE']
models = [
    ('Linear Regression', mse_linear, r2_linear, rmse_linear),
    ('Lasso', mse_lasso, r2_lasso, rmse_lasso),
    ('Ridge', mse_ridge, r2_ridge, rmse_ridge),
    ('Decision Tree', mse_dt, r2_dt, rmse_dt),
    ('Random Forest', mse_rf, r2_rf, rmse_rf),
]

model_comparison = pd.DataFrame.from_records(models, columns=metric_columns)
model_comparison

Unnamed: 0,Model,MSE,R2 Score,RMSE
0,Linear Regression,0.555892,0.575788,0.745581
1,Lasso,0.613512,0.531817,0.78327
2,Ridge,0.555883,0.575794,0.745575
3,Decision Tree,0.495235,0.622076,0.703729
4,Random Forest,0.255368,0.805123,0.50534


In [13]:
# Grouped bar chart of the comparison table: for every model, one bar per metric.
metric_names = ['MSE', 'R2 Score', 'RMSE']
fig = go.Figure(data=[
    go.Bar(name=metric, x=model_comparison['Model'], y=model_comparison[metric])
    for metric in metric_names
])

fig.update_layout(title='Model Performance Comparison', barmode='group')
fig.show()
