## Multiple Linear regression
### This a straightforward All-In model training practise.


In [3]:
import numpy as np
import matplotlib.pyplot as plt     # for plotting
import pandas as pd                # for data handling

In [4]:
# Importing the dataset
df = pd.read_csv('./50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [32]:
# Splitting the dataset into the independent and dependent variables
x = df.iloc[:, :-1].values
y = df.iloc[:, 4].values


In [36]:
x_test = x
x_test

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [None]:
## Encoding categorical data using pandas
# df_enc_pandas = pd.get_dummies(x_test, columns=[3])

In [17]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

### Label Encoding 
###### Label Encoding is a technique that is used to convert categorical columns into numerical ones so that they can be fitted by machine learning models which only take numerical data. It is an important pre-processing step in a machine-learning project

In [None]:
LabelEncoder_x = LabelEncoder()
x[:, 3] = LabelEncoder_x.fit_transform(x[:, 3]) 


In [39]:
x

array([[165349.2, 136897.8, 471784.1, 2],
       [162597.7, 151377.59, 443898.53, 0],
       [153441.51, 101145.55, 407934.54, 1],
       [144372.41, 118671.85, 383199.62, 2],
       [142107.34, 91391.77, 366168.42, 1],
       [131876.9, 99814.71, 362861.36, 2],
       [134615.46, 147198.87, 127716.82, 0],
       [130298.13, 145530.06, 323876.68, 1],
       [120542.52, 148718.95, 311613.29, 2],
       [123334.88, 108679.17, 304981.62, 0],
       [101913.08, 110594.11, 229160.95, 1],
       [100671.96, 91790.61, 249744.55, 0],
       [93863.75, 127320.38, 249839.44, 1],
       [91992.39, 135495.07, 252664.93, 0],
       [119943.24, 156547.42, 256512.92, 1],
       [114523.61, 122616.84, 261776.23, 2],
       [78013.11, 121597.55, 264346.06, 0],
       [94657.16, 145077.58, 282574.31, 2],
       [91749.16, 114175.79, 294919.57, 1],
       [86419.7, 153514.11, 0.0, 2],
       [76253.86, 113867.3, 298664.47, 0],
       [78389.47, 153773.43, 299737.29, 2],
       [73994.56, 122782.75, 30331

#### One Hot Encoding 
###### One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model.

In [45]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

In [46]:
# Extract the last column for encoding
last_column = x[:, -1].reshape(-1, 1)
# last_column

In [47]:
# Encode the last column
one_hot_encoded = encoder.fit_transform(last_column)

In [49]:
# x
new_x = np.hstack((x[:, :-1], one_hot_encoded))

In [50]:
new_x

array([[165349.2, 136897.8, 471784.1, 0.0, 0.0, 1.0],
       [162597.7, 151377.59, 443898.53, 1.0, 0.0, 0.0],
       [153441.51, 101145.55, 407934.54, 0.0, 1.0, 0.0],
       [144372.41, 118671.85, 383199.62, 0.0, 0.0, 1.0],
       [142107.34, 91391.77, 366168.42, 0.0, 1.0, 0.0],
       [131876.9, 99814.71, 362861.36, 0.0, 0.0, 1.0],
       [134615.46, 147198.87, 127716.82, 1.0, 0.0, 0.0],
       [130298.13, 145530.06, 323876.68, 0.0, 1.0, 0.0],
       [120542.52, 148718.95, 311613.29, 0.0, 0.0, 1.0],
       [123334.88, 108679.17, 304981.62, 1.0, 0.0, 0.0],
       [101913.08, 110594.11, 229160.95, 0.0, 1.0, 0.0],
       [100671.96, 91790.61, 249744.55, 1.0, 0.0, 0.0],
       [93863.75, 127320.38, 249839.44, 0.0, 1.0, 0.0],
       [91992.39, 135495.07, 252664.93, 1.0, 0.0, 0.0],
       [119943.24, 156547.42, 256512.92, 0.0, 1.0, 0.0],
       [114523.61, 122616.84, 261776.23, 0.0, 0.0, 1.0],
       [78013.11, 121597.55, 264346.06, 1.0, 0.0, 0.0],
       [94657.16, 145077.58, 282574.31, 0.