# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
# Provides arrays (data structures holding a group of elements) and the functions to work with them
import matplotlib.pyplot as plt
# Provides functions for plotting graphs and charts
import pandas as pd
# Used to import the dataset and to create the matrix of features and the dependent variable vector

## Importing the dataset

In [2]:
dataset = pd.read_csv('Company revenue dataset.csv')
# Read the CSV file into a pandas DataFrame
last_col = dataset.shape[1] - 1
# Position of the last column, which holds the dependent variable
X = dataset.iloc[:, :last_col].values
# Matrix of features: every row, every column before the last one (independent variables)
y = dataset.iloc[:, last_col].values
# Dependent variable vector: every row, last column only
# Principle = features vs dependent variable (last column)

In [3]:
# Show the feature matrix extracted from the dataset
print(X)

[[82742 135589 110792 'Gauteng']
 [86506 133953 121829 'Gauteng']
 [83011 149459 116213 'Gauteng']
 [80657 134052 120794 'Gauteng']
 [91762 130462 124515 'Gauteng']
 [91049 130544 124330 'Gauteng']
 [91254 138846 125180 'Gauteng']
 [80118 132733 115963 'Gauteng']
 [88983 131321 128832 'Gauteng']
 [90981 146427 124641 'Gauteng']
 [93422 136868 111058 'Gauteng']
 [86657 145656 112863 'Gauteng']
 [91911 145556 116367 'Gauteng']
 [88877 143728 122857 'Gauteng']
 [67038 122647 102130 'Western Cape']
 [72781 128229 92779 'Western Cape']
 [79530 114404 95046 'Western Cape']
 [71146 125531 103028 'Western Cape']
 [72764 114766 103828 'Western Cape']
 [69222 111056 109595 'Western Cape']
 [66247 126476 95921 'Western Cape']
 [79755 124049 104247 'Western Cape']
 [79115 119506 97056 'Western Cape']
 [71474 123847 90021 'Western Cape']
 [71811 129218 106519 'Western Cape']
 [77115 110580 106858 'Western Cape']
 [55695 101768 82659 'Kwa-Zulu Natal']
 [62297 104232 77526 'Kwa-Zulu Natal']
 [52678 1

## Encoding categorical data

In [4]:
from sklearn.compose import ColumnTransformer
# ColumnTransformer applies transformers to selected columns of an array or pandas DataFrame
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder replaces a categorical column with one 0/1 indicator column per category
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
# transformers: list of (name, transformer, columns) tuples; here column index 3 is one-hot encoded
# remainder='passthrough': columns not listed in transformers are kept unchanged and placed after the encoded ones
# sparse_threshold=0: always return a dense array, so np.array() below never wraps a sparse matrix
X = np.array(ct.fit_transform(X))
# Fit the transformer on X and replace X with the encoded array
# NOTE: X is overwritten, so re-running only this cell would encode column 3 of the already-encoded matrix;
# re-run from the dataset-loading cell instead

In [5]:
# Show the feature matrix again, now holding the output of the encoding step
print(X)

[[0.0 1.0 0.0 0.0 82742 135589 110792]
 [0.0 1.0 0.0 0.0 86506 133953 121829]
 [0.0 1.0 0.0 0.0 83011 149459 116213]
 [0.0 1.0 0.0 0.0 80657 134052 120794]
 [0.0 1.0 0.0 0.0 91762 130462 124515]
 [0.0 1.0 0.0 0.0 91049 130544 124330]
 [0.0 1.0 0.0 0.0 91254 138846 125180]
 [0.0 1.0 0.0 0.0 80118 132733 115963]
 [0.0 1.0 0.0 0.0 88983 131321 128832]
 [0.0 1.0 0.0 0.0 90981 146427 124641]
 [0.0 1.0 0.0 0.0 93422 136868 111058]
 [0.0 1.0 0.0 0.0 86657 145656 112863]
 [0.0 1.0 0.0 0.0 91911 145556 116367]
 [0.0 1.0 0.0 0.0 88877 143728 122857]
 [0.0 0.0 0.0 1.0 67038 122647 102130]
 [0.0 0.0 0.0 1.0 72781 128229 92779]
 [0.0 0.0 0.0 1.0 79530 114404 95046]
 [0.0 0.0 0.0 1.0 71146 125531 103028]
 [0.0 0.0 0.0 1.0 72764 114766 103828]
 [0.0 0.0 0.0 1.0 69222 111056 109595]
 [0.0 0.0 0.0 1.0 66247 126476 95921]
 [0.0 0.0 0.0 1.0 79755 124049 104247]
 [0.0 0.0 0.0 1.0 79115 119506 97056]
 [0.0 0.0 0.0 1.0 71474 123847 90021]
 [0.0 0.0 0.0 1.0 71811 129218 106519]
 [0.0 0.0 0.0 1.0 77115 110580

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
# Splitter that divides arrays into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
# X and y are the arrays to split
# test_size=0.2 puts 20% of the rows in the test set and the rest in the training set
# random_state=0 fixes the shuffling applied before the split, so the split is reproducible

## Training the Multiple Linear Regression model on the Training set

In [7]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares linear regression model from scikit-learn
regressor = LinearRegression()
# Create the model that relates the independent variables to the dependent variable
regressor.fit(X=X_train, y=y_train)
# Train the model on the training set (left as the last expression so the fitted estimator is displayed)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the Test set results

In [8]:
y_pred = regressor.predict(X_test)
# Predict the dependent variable for the test set with the trained model
np.set_printoptions(precision=2)
# Print floating point values with 2 digits of precision
predicted_col = y_pred.reshape(len(y_pred), 1)
actual_col = y_test.reshape(len(y_test), 1)
# Turn both vectors into single-column arrays so they can be placed side by side
print(np.concatenate((predicted_col, actual_col), axis=1))
# Left column = predicted values, right column = actual test values

[[638618.4  674649.  ]
 [836232.56 826167.  ]
 [824852.41 844558.  ]
 [541111.38 504590.  ]
 [843819.58 896700.  ]
 [633470.6  630424.  ]
 [541561.23 564618.  ]
 [636915.95 601107.  ]
 [746181.76 794589.  ]
 [834570.68 820906.  ]]
