In [17]:

# author: Nekruz Ashrapov
# CS2023 Python Programming

# From my results, Feature 0 (MedInc) was the best single predictor:
# it had the highest R2 score (0.4630810035698605) and the lowest
# MSE score (0.7197656965919479) of the eight features.
# A likely reason is that median income reflects the economic
# conditions of an area and how much housing its residents can afford.
# Every other feature had an R2 score close to zero and a higher MSE than MedInc.


# Multiple Linear Regression using All features
# R2 score: 0.6008983115964328
# MSE score: 0.5350149774449123

# Feature 0 (MedInc)
# R2 score: 0.4630810035698605
# MSE score: 0.7197656965919479

# Feature 1 (HouseAge)
# R2 score: 0.013185632224592903
# MSE score: 1.3228720450408296

# Feature 2 (AveRooms)
# R2 score: 0.024105074271276283
# MSE score: 1.3082340086454287

# Feature 3 (AveBedrms)
# R2 score: -0.0011266270315772875
# MSE score: 1.3420583158224824

# Feature 4 (Population)
# R2 score: 8.471986797708997e-05
# MSE score: 1.3404344471369465

# Feature 5 (AveOccup)
# R2 score: -0.00018326453581640756
# MSE score: 1.340793693098357

# Feature 6 (Latitude)
# R2 score: 0.020368890210145207
# MSE score: 1.3132425427841639

# Feature 7 (Longitude)
# R2 score: 0.0014837207852688161
# MSE score: 1.3385590192298278




# Simple linear regression models are easier to
# interpret than multiple linear regression models.
# However, a simple linear regression model
# may have high bias (it underfits)
# if its single independent variable does not capture the
# relationship with the dependent variable well.
# On the other hand, multiple linear regression models may have
# lower bias because they use several variables at once.
# Overall, multiple linear regression is the better choice here because it
# can capture more information when several features are available
# (R2 of about 0.60 with all features vs. about 0.46 for the best single feature).

import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Seed for the train/test split so the reported scores are reproducible.
RANDOM_STATE = 11


def fit_and_score(X_fit, X_eval, y_fit, y_eval):
    """Fit a LinearRegression model and score it on held-out data.

    Parameters
    ----------
    X_fit, y_fit
        Training features (2-D) and training targets.
    X_eval, y_eval
        Test features (2-D) and test targets.

    Returns
    -------
    tuple
        ``(model, predicted, r2, mse)``: the fitted estimator, its
        predictions for ``X_eval``, and the R2 score and mean squared
        error of those predictions against ``y_eval``.
    """
    model = LinearRegression()
    model.fit(X_fit, y_fit)
    predicted = model.predict(X_eval)
    r2 = metrics.r2_score(y_eval, predicted)
    mse = metrics.mean_squared_error(y_eval, predicted)
    return model, predicted, r2, mse


def print_scores(title, r2, mse):
    """Print a model's title, R2 score and MSE, followed by a blank line."""
    print(title)
    print("R2 score:", r2)
    print("MSE score:", mse)
    print()


cali = fetch_california_housing()
cali_df = pd.DataFrame(cali.data, columns=cali.feature_names)
# The target array can be assigned directly; wrapping it in a Series is unnecessary.
cali_df['MedHouseValue'] = cali.target

# splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    cali.data, cali.target, random_state=RANDOM_STATE
)

# multiple linear regression: one model fitted on all features together
mu_regress, predicted_mu, r2_mu, mse_mu = fit_and_score(
    X_train, X_test, y_train, y_test
)
print_scores("Multiple Linear Regression using All features", r2_mu, mse_mu)

# simple linear regression: one model per individual feature
for i, feature in enumerate(cali.feature_names):
    # Slice with i:i+1 (not i) so the single column stays 2-D, which is the
    # shape the estimator's fit/predict methods are given elsewhere in this cell.
    X_train_simple = X_train[:, i:i+1]
    X_test_simple = X_test[:, i:i+1]

    simple_regress, predicted_simple, r2_simple, mse_simple = fit_and_score(
        X_train_simple, X_test_simple, y_train, y_test
    )
    print_scores(f"Feature {i} ({feature})", r2_simple, mse_simple)


Multiple Linear Regression using All features
R2 score: 0.6008983115964328
MSE score: 0.5350149774449123

Feature 0 (MedInc)
R2 score: 0.4630810035698605
MSE score: 0.7197656965919479

Feature 1 (HouseAge)
R2 score: 0.013185632224592903
MSE score: 1.3228720450408296

Feature 2 (AveRooms)
R2 score: 0.024105074271276283
MSE score: 1.3082340086454287

Feature 3 (AveBedrms)
R2 score: -0.0011266270315772875
MSE score: 1.3420583158224824

Feature 4 (Population)
R2 score: 8.471986797708997e-05
MSE score: 1.3404344471369465

Feature 5 (AveOccup)
R2 score: -0.00018326453581640756
MSE score: 1.340793693098357

Feature 6 (Latitude)
R2 score: 0.020368890210145207
MSE score: 1.3132425427841639

Feature 7 (Longitude)
R2 score: 0.0014837207852688161
MSE score: 1.3385590192298278

