# Assignment: Multiple Linear Regression on housing.csv Using Selected Features



## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset & checking the data


In [2]:
# Read the housing data from a CSV file located in the working directory.
dataset = pd.read_csv('housing.csv')

# Features: a hand-picked subset of columns, selected by name
# ('ocean_proximity' is categorical text; the others are numeric).
X = dataset.loc[:, ['median_income', 'ocean_proximity', 'latitude', 'longitude', 'housing_median_age']]

# Target: the median house value, extracted as a NumPy array.
y = dataset['median_house_value'].to_numpy()

In [3]:
# Inspect the selected feature columns; pandas abbreviates the middle rows of a long frame when printing.
print(X)

       median_income ocean_proximity  latitude  longitude  housing_median_age
0             8.3252        NEAR BAY     37.88    -122.23                  41
1             8.3014        NEAR BAY     37.86    -122.22                  21
2             7.2574        NEAR BAY     37.85    -122.24                  52
3             5.6431        NEAR BAY     37.85    -122.25                  52
4             3.8462        NEAR BAY     37.85    -122.25                  52
...              ...             ...       ...        ...                 ...
20635         1.5603          INLAND     39.48    -121.09                  25
20636         2.5568          INLAND     39.49    -121.21                  18
20637         1.7000          INLAND     39.43    -121.22                  17
20638         1.8672          INLAND     39.43    -121.32                  18
20639         2.3886          INLAND     39.37    -121.24                  16

[20640 rows x 5 columns]


In [4]:
# Inspect the target array; NumPy abbreviates long arrays with '...' when printing.
print(y)


[452600 358500 352100 ...  92300  84700  89400]


## Encoding categorical data

In [5]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder 

# One-hot encode the categorical 'ocean_proximity' column and pass every other
# selected column through unchanged. In the result, the encoded indicator
# columns come first, followed by the passthrough columns in their original order.
#
# The column is referenced by name rather than by position: this cell rebinds X
# from a DataFrame to a NumPy array, so with a positional index an accidental
# re-run would silently one-hot encode the wrong column of the already-encoded
# matrix. With a name, a re-run on the array raises an error instead.
#
# sparse_threshold=0 forces a dense result, so np.array(...) below always
# receives a regular 2-D array and never a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['ocean_proximity'])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))


In [6]:
# Inspect the encoded feature matrix: one-hot indicator columns first, passthrough numeric columns last.
print(X)

[[   0.      0.      0.   ...   37.88 -122.23   41.  ]
 [   0.      0.      0.   ...   37.86 -122.22   21.  ]
 [   0.      0.      0.   ...   37.85 -122.24   52.  ]
 ...
 [   0.      1.      0.   ...   39.43 -121.22   17.  ]
 [   0.      1.      0.   ...   39.43 -121.32   18.  ]
 [   0.      1.      0.   ...   39.37 -121.24   16.  ]]


## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore every result below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Inspect the training portion of the feature matrix produced by the split.
print(X_train)

[[   0.      1.      0.   ...   33.83 -117.55    6.  ]
 [   0.      0.      0.   ...   37.73 -122.44   52.  ]
 [   1.      0.      0.   ...   33.83 -118.     26.  ]
 ...
 [   0.      0.      0.   ...   36.58 -121.9    31.  ]
 [   1.      0.      0.   ...   33.62 -117.93   34.  ]
 [   0.      1.      0.   ...   32.8  -115.56   15.  ]]


## Training the Multiple Linear Regression model on the Training set

In [9]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares linear model on the training split only;
# the test split is kept aside for the evaluation cells further down.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

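# Note: scikit-learn's LinearRegression only fits the coefficients and intercept; it does not compute p-values, so the statistical significance of the features is not checked here (a package such as statsmodels would be needed for that).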

## Predicting the Test set results

In [10]:
# Print NumPy floats with two decimal places from here on (global print setting).
np.set_printoptions(precision=2)

# Predict house values for the held-out test features.
y_pred = regressor.predict(X_test)

print(y_pred)  # predicted values
print(y_test)  # actual values

[242838.13 293231.9  179809.13 ...  93610.03 257121.85 219281.63]
[136900 241300 200700 ... 128600 259500 167600]


In [11]:
# Side-by-side comparison: column 0 holds the predictions, column 1 the actual values.
print(np.column_stack((y_pred, y_test)))

[[242838.13 136900.  ]
 [293231.9  241300.  ]
 [179809.13 200700.  ]
 ...
 [ 93610.03 128600.  ]
 [257121.85 259500.  ]
 [219281.63 167600.  ]]


In [12]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the predictions against the held-out targets.
# R2 is the fraction of target variance explained by the model; MSE is in
# squared target units, so its magnitude is large for house prices.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R2 Score:", r2)
print("MSE:", mse)


R2 Score: 0.5940248861876689
MSE: 5293732398.733887
