# Asgmt: Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
dataset = pd.read_csv('housing.csv')



X = dataset.iloc[:, :-1].values 
y = dataset.iloc[:, -1].values  

In [3]:
print(X)

[[-1.2223e+02  3.7880e+01  4.1000e+01 ...  1.2600e+02  8.3252e+00
   4.5260e+05]
 [-1.2222e+02  3.7860e+01  2.1000e+01 ...  1.1380e+03  8.3014e+00
   3.5850e+05]
 [-1.2224e+02  3.7850e+01  5.2000e+01 ...  1.7700e+02  7.2574e+00
   3.5210e+05]
 ...
 [-1.2122e+02  3.9430e+01  1.7000e+01 ...  4.3300e+02  1.7000e+00
   9.2300e+04]
 [-1.2132e+02  3.9430e+01  1.8000e+01 ...  3.4900e+02  1.8672e+00
   8.4700e+04]
 [-1.2124e+02  3.9370e+01  1.6000e+01 ...  5.3000e+02  2.3886e+00
   8.9400e+04]]


In [3]:
print(y)

[192261.83 191792.06 191050.39 182901.99 166187.94 156991.12 156122.51
 155752.6  152211.77 149759.96 146121.95 144259.4  141585.52 134307.35
 132602.65 129917.04 126992.93 125370.37 124266.9  122776.86 118474.03
 111313.02 110352.25 108733.99 108552.04 107404.34 105733.54 105008.31
 103282.38 101004.64  99937.59  97483.56  97427.84  96778.92  96712.8
  96479.51  90708.19  89949.14  81229.06  81005.76  78239.91  77798.83
  71498.49  69758.98  65200.33  64926.08  49490.75  42559.73  35673.41
  14681.4 ]


## Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder 

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough') # 3 is the index of the column to be encoded = State
X = np.array(ct.fit_transform(X))

In [5]:
print(X) # location of the encoded column is changed 

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 185760 stored elements and shape (20640, 5934)>
  Coords	Values
  (0, 733)	1.0
  (0, 5926)	-122.23
  (0, 5927)	37.88
  (0, 5928)	41.0
  (0, 5929)	129.0
  (0, 5930)	322.0
  (0, 5931)	126.0
  (0, 5932)	8.3252
  (0, 5933)	452600.0
  (1, 5255)	1.0
  (1, 5926)	-122.22
  (1, 5927)	37.86
  (1, 5928)	21.0
  (1, 5929)	1106.0
  (1, 5930)	2401.0
  (1, 5931)	1138.0
  (1, 5932)	8.3014
  (1, 5933)	358500.0
  (2, 1316)	1.0
  (2, 5926)	-122.24
  (2, 5927)	37.85
  (2, 5928)	52.0
  (2, 5929)	190.0
  (2, 5930)	496.0
  (2, 5931)	177.0
  :	:
  (20637, 5927)	39.43
  (20637, 5928)	17.0
  (20637, 5929)	485.0
  (20637, 5930)	1007.0
  (20637, 5931)	433.0
  (20637, 5932)	1.7
  (20637, 5933)	92300.0
  (20638, 1709)	1.0
  (20638, 5926)	-121.32
  (20638, 5927)	39.43
  (20638, 5928)	18.0
  (20638, 5929)	409.0
  (20638, 5930)	741.0
  (20638, 5931)	349.0
  (20638, 5932)	1.8672
  (20638, 5933)	84700.0
  (20639, 2633)	1.0
  (20639, 5926)	-121.24
  (20639, 592

## Splitting the dataset into the Training set and Test set

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Here it check for the statically significant features and check for the p-values, since it small it is statically significant

## Predicting the Test set results

In [12]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2) # set the decimal places to 2

print(y_pred) # Horizontal vector of predicted values
print(y_test) # Horizontal vector of real values

[103015.2  132582.28 132447.74  71976.1  178537.48 116161.24  67851.69
  98791.73 113969.44 167921.07]
[103282.38 144259.4  146121.95  77798.83 191050.39 105008.31  81229.06
  97483.56 110352.25 166187.94]


In [None]:
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)) # compare the predicted and actual values

# .reshape() = convert the horizontal vector to vertical vector
# (len(y_pred),1) = number of rows and columns (10 rows and 1 column)

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


Dummy Variable Trap 

-Occur during one hot encoding when we want to transfer categorical data to binary number
-when there is 2 or more highly correlated variable where one variable is corelate as an example in Pressure = Force / Area ... Force is related to the pressure this can cause dummy variable

This all only happen if you are not using library
