# Multiple Linear Regression

### Importing the libraries

In [85]:
import numpy as np
import pandas as pd

### Importing the dataset

In [86]:
# Read the housing data from a CSV file in the working directory,
# then preview the first five rows.
csv_path = 'housing.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [87]:
# Summarise column dtypes and non-null counts; this is where missing values
# (fewer non-null entries than rows) and non-numeric columns show up.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [88]:
# Target vector: the median house value.
# Select the column by name rather than by position: a positional index such
# as -2 silently picks a different column if `df` has had columns removed
# (e.g. when cells are re-run out of order), whereas a name lookup fails loudly.
y = df["median_house_value"].values
y[:2]

array([452600., 358500.])

In [89]:
# Feature matrix: every column except the target.
# The result of drop() is bound to a new name instead of overwriting `df`, so
# this cell can be re-run safely and `df` keeps matching what df.head() /
# df.info() displayed earlier.
features_df = df.drop("median_house_value", axis=1)
# The array is dtype=object because the ocean_proximity column holds strings.
X = features_df.iloc[:, :9].values
X[:2]

array([[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252,
        'NEAR BAY'],
       [-122.22, 37.86, 21.0, 7099.0, 1106.0, 2401.0, 1138.0, 8.3014,
        'NEAR BAY']], dtype=object)

## Handle missing values

In [90]:
from sklearn.impute import SimpleImputer
# Column 4 of X is total_bedrooms, the only column df.info() reports with
# fewer non-null values than rows. Missing entries are replaced with the
# column's most frequent value.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test rows influence the fill value — consider fitting
# on the training rows only.
bedrooms_col = slice(4, 5)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, bedrooms_col] = imputer.fit_transform(X[:, bedrooms_col])
# Spot-check a few rows after imputation (presumably this range contains a
# row that was originally missing — verify against the raw data).
X[289:294, bedrooms_col]

array([[128.0],
       [280.0],
       [194.0],
       [397.0],
       [349.0]], dtype=object)

## Handle categorical variables

In [91]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 8 (ocean_proximity) and pass
# every other column through unchanged. The encoded dummy columns are placed
# first in the output, followed by the passthrough columns.
ocean_encoder = ('encoder', OneHotEncoder(), [8])
ct = ColumnTransformer(transformers=[ocean_encoder], remainder='passthrough')
encoded = ct.fit_transform(X)
X = np.array(encoded)
X[:9]

array([[0.0, 0.0, 0.0, 1.0, 0.0, -122.23, 37.88, 41.0, 880.0, 129.0,
        322.0, 126.0, 8.3252],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.22, 37.86, 21.0, 7099.0, 1106.0,
        2401.0, 1138.0, 8.3014],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.24, 37.85, 52.0, 1467.0, 190.0,
        496.0, 177.0, 7.2574],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.25, 37.85, 52.0, 1274.0, 235.0,
        558.0, 219.0, 5.6431],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.25, 37.85, 52.0, 1627.0, 280.0,
        565.0, 259.0, 3.8462],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.25, 37.85, 52.0, 919.0, 213.0,
        413.0, 193.0, 4.0368],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.25, 37.84, 52.0, 2535.0, 489.0,
        1094.0, 514.0, 3.6591],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.25, 37.84, 52.0, 3104.0, 687.0,
        1157.0, 647.0, 3.12],
       [0.0, 0.0, 0.0, 1.0, 0.0, -122.26, 37.84, 42.0, 2555.0, 665.0,
        1206.0, 595.0, 2.0804]], dtype=object)

## Split the dataset into training and test sets

In [92]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
test_fraction = 0.2
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Feature scaling: standardise only the numeric passthrough columns.
# After one-hot encoding, the dummy columns come first in X, so the numeric
# features start right after them. The offset is read from the fitted encoder
# rather than hard-coded: a slice starting one column too early would also
# standardise the last dummy column.
n_encoded = len(ct.named_transformers_['encoder'].categories_[0])
sc = StandardScaler()
# Fit on the training rows only, then reuse the same mean/std for the test
# rows so no information from the test set leaks into the scaler.
X_train[:, n_encoded:] = sc.fit_transform(X_train[:, n_encoded:])
X_test[:, n_encoded:] = sc.transform(X_test[:, n_encoded:])

## Train the multiple linear regression model on the training set


In [93]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator as the cell output.
lr

## Prediction using the test set

In [94]:
# Predict house values for the held-out rows.
y_pred = lr.predict(X_test)
# Print floats with two decimals (this changes NumPy's global print options).
np.set_printoptions(precision=2)
# Two-column view: predicted value on the left, actual value on the right.
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

[[ 38123.09  47700.  ]
 [108292.98  45800.  ]
 [239557.02 500001.  ]
 ...
 [439180.98 500001.  ]
 [120797.55  72300.  ]
 [183386.05 151500.  ]]


## Evaluating model performance

In [95]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.6233392359648222