## Multiple Linear regression
### This a straightforward All-In model training practise.


In [1]:
import numpy as np
import matplotlib.pyplot as plt     # for plotting
import pandas as pd                # for data handling

In [2]:
# Importing the dataset
df = pd.read_csv('./50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Splitting the dataset into the independent and dependent variables
x = df.iloc[:, :-1].values
y = df.iloc[:, 4].values


In [4]:
x_test = x
x_test

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

#### Now I have 3 choice to encode the data
1. using pandas before spliting
2. using pandas after spliting
3. using scikit-learn to encode

#### Encode using scikit-learn

In [5]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

### Label Encoding 
###### Label Encoding is a technique that is used to convert categorical columns into numerical ones so that they can be fitted by machine learning models which only take numerical data. It is an important pre-processing step in a machine-learning project

In [6]:
# LabelEncoder_x = LabelEncoder()
# x[:, 3] = LabelEncoder_x.fit_transform(x[:, 3]) 


In [7]:
# x

#### One Hot Encoding 
###### One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model.

In [8]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

In [9]:
# Extract the last column for encoding
last_column = x[:, -1].reshape(-1, 1)
last_column

array([['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['California'],
       ['Florida'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['Florida'],
       ['New York'],
       ['California'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['California'],
       ['California'],
       ['Florida'],
       ['California'],
       ['New York'],
       ['California'],
       ['New York'],
       ['Florida'],

In [10]:
# Encode the last column
encoded_city = encoder.fit_transform(last_column)
encoded_city

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0

In [11]:
# x
encoded_x = np.hstack((x[:, :-1], encoded_city))
encoded_x

array([[165349.2, 136897.8, 471784.1, 0.0, 0.0, 1.0],
       [162597.7, 151377.59, 443898.53, 1.0, 0.0, 0.0],
       [153441.51, 101145.55, 407934.54, 0.0, 1.0, 0.0],
       [144372.41, 118671.85, 383199.62, 0.0, 0.0, 1.0],
       [142107.34, 91391.77, 366168.42, 0.0, 1.0, 0.0],
       [131876.9, 99814.71, 362861.36, 0.0, 0.0, 1.0],
       [134615.46, 147198.87, 127716.82, 1.0, 0.0, 0.0],
       [130298.13, 145530.06, 323876.68, 0.0, 1.0, 0.0],
       [120542.52, 148718.95, 311613.29, 0.0, 0.0, 1.0],
       [123334.88, 108679.17, 304981.62, 1.0, 0.0, 0.0],
       [101913.08, 110594.11, 229160.95, 0.0, 1.0, 0.0],
       [100671.96, 91790.61, 249744.55, 1.0, 0.0, 0.0],
       [93863.75, 127320.38, 249839.44, 0.0, 1.0, 0.0],
       [91992.39, 135495.07, 252664.93, 1.0, 0.0, 0.0],
       [119943.24, 156547.42, 256512.92, 0.0, 1.0, 0.0],
       [114523.61, 122616.84, 261776.23, 0.0, 0.0, 1.0],
       [78013.11, 121597.55, 264346.06, 1.0, 0.0, 0.0],
       [94657.16, 145077.58, 282574.31, 0.

### Model Train

In [12]:
# Avoiding the Dummy Variable Trap
final_x = encoded_x[:, :-1]
final_x

array([[165349.2, 136897.8, 471784.1, 0.0, 0.0],
       [162597.7, 151377.59, 443898.53, 1.0, 0.0],
       [153441.51, 101145.55, 407934.54, 0.0, 1.0],
       [144372.41, 118671.85, 383199.62, 0.0, 0.0],
       [142107.34, 91391.77, 366168.42, 0.0, 1.0],
       [131876.9, 99814.71, 362861.36, 0.0, 0.0],
       [134615.46, 147198.87, 127716.82, 1.0, 0.0],
       [130298.13, 145530.06, 323876.68, 0.0, 1.0],
       [120542.52, 148718.95, 311613.29, 0.0, 0.0],
       [123334.88, 108679.17, 304981.62, 1.0, 0.0],
       [101913.08, 110594.11, 229160.95, 0.0, 1.0],
       [100671.96, 91790.61, 249744.55, 1.0, 0.0],
       [93863.75, 127320.38, 249839.44, 0.0, 1.0],
       [91992.39, 135495.07, 252664.93, 1.0, 0.0],
       [119943.24, 156547.42, 256512.92, 0.0, 1.0],
       [114523.61, 122616.84, 261776.23, 0.0, 0.0],
       [78013.11, 121597.55, 264346.06, 1.0, 0.0],
       [94657.16, 145077.58, 282574.31, 0.0, 0.0],
       [91749.16, 114175.79, 294919.57, 0.0, 1.0],
       [86419.7, 153514.1

In [13]:
# split the dataset into the training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(final_x, y, test_size = 0.2, random_state = 0)

###### Fitting Model

In [14]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [15]:
# Predicting the test set results
y_pred = regressor.predict(x_test)

In [16]:
y_pred

array([103015.20159776, 132582.27760831, 132447.73845184,  71976.09851266,
       178537.4822107 , 116161.24230157,  67851.69209689,  98791.73374679,
       113969.43533008, 167921.06569569])