# Detecting Multicollinearity

In [48]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats

In [49]:
# List the example datasets that seaborn can load by name; "mpg" is the one used below
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [50]:
# Load seaborn's "mpg" example dataset into `df` and preview its first rows
df = sns.load_dataset("mpg")
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [51]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(398, 9)

In [52]:
# Frequency of each `origin` category before it is converted to integer codes
df['origin'].value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [53]:
from sklearn.preprocessing import LabelEncoder
# Replace the string labels in `origin` with integer codes, overwriting the column in `df`.
# Per the value counts before and after encoding: europe -> 0, japan -> 1, usa -> 2.
# NOTE(review): `origin` is a nominal category, so these codes carry no real order or
# distance, yet the correlation and VIF cells further down treat the column as numeric.
# Consider one-hot encoding (e.g. pd.get_dummies) instead — confirm this is intended.
le = LabelEncoder()
df["origin"] = le.fit_transform(df["origin"])
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,2,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,2,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,2,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,2,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,2,ford torino


In [54]:
# Frequency of each `origin` code after encoding; the counts should match the
# per-category counts seen before encoding
df['origin'].value_counts()

2    249
1     79
0     70
Name: origin, dtype: int64

In [55]:
# Select the candidate features by position: columns 1-7 (cylinders ... origin),
# leaving out `mpg` (column 0) and the free-text `name` (column 8).
# .copy() makes df_fe an independent frame rather than a slice of `df`, so cleaning
# it afterwards (including in-place operations) cannot trigger pandas'
# SettingWithCopyWarning or depend on the state of `df`.
df_fe = df.iloc[:, 1:8].copy()
df_fe

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,2
1,8,350.0,165.0,3693,11.5,70,2
2,8,318.0,150.0,3436,11.0,70,2
3,8,304.0,150.0,3433,12.0,70,2
4,8,302.0,140.0,3449,10.5,70,2
...,...,...,...,...,...,...,...
393,4,140.0,86.0,2790,15.6,82,2
394,4,97.0,52.0,2130,24.6,82,0
395,4,135.0,84.0,2295,11.6,82,2
396,4,120.0,79.0,2625,18.6,82,2


In [56]:
# Count missing values in each feature column
df_fe.isna().sum()

cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [57]:
# Drop every row that has a missing value in any feature column.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that was
# derived from a slice of `df`, which is what makes pandas emit
# SettingWithCopyWarning; it also keeps this cell safe to re-run.
df_fe = df_fe.dropna()

In [58]:
# Re-count missing values to confirm none remain after the drop
df_fe.isna().sum()

cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [59]:
# Pairwise correlation between all features (DataFrame.corr with its default method).
# Off-diagonal entries with large magnitude point to features that carry overlapping
# information.
corr = df_fe.corr()
corr

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
cylinders,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647,0.556302
displacement,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855,0.594314
horsepower,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361,0.44733
weight,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912,0.521609
acceleration,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316,-0.264409
model_year,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0,-0.066892
origin,0.556302,0.594314,0.44733,0.521609,-0.264409,-0.066892,1.0


# Variance Inflation Factor (VIF)

In [60]:
# Invert the correlation matrix and label the result with the feature names.
# The VIF of each feature is the entry on the diagonal of this inverse.
pd.DataFrame(
    data=np.linalg.inv(corr.to_numpy()),
    index=corr.index,
    columns=corr.columns,
)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
cylinders,10.656203,-9.935918,1.099434,-1.230775,0.265777,0.021888,0.1989
displacement,-9.935918,22.395948,-4.020491,-6.834961,0.76949,0.693628,-2.169325
horsepower,1.099434,-4.020491,9.626519,-4.359309,3.153797,0.679775,0.624802
weight,-1.230775,-6.834961,-4.359309,10.800432,-2.556822,-0.664567,0.342738
acceleration,0.265777,0.76949,3.153797,-2.556822,2.625845,0.138317,0.021251
model_year,0.021888,0.693628,0.679775,-0.664567,0.138317,1.284203,-0.259374
origin,0.1989,-2.169325,0.624802,0.342738,0.021251,-0.259374,1.708612


In [61]:
# `displacement` has the largest diagonal entry (VIF) in the inverse correlation
# matrix, so build a reduced feature set without it. drop() returns a new frame
# and leaves df_fe unchanged.
df_fe2 = df_fe.drop(columns="displacement")

df_fe2.head()

Unnamed: 0,cylinders,horsepower,weight,acceleration,model_year,origin
0,8,130.0,3504,12.0,70,2
1,8,165.0,3693,11.5,70,2
2,8,150.0,3436,11.0,70,2
3,8,150.0,3433,12.0,70,2
4,8,140.0,3449,10.5,70,2


In [63]:
# Recompute the correlation matrix on the reduced feature set (without `displacement`).
# Note: this rebinds `corr`, replacing the matrix previously computed from df_fe.
corr = df_fe2.corr()
corr

Unnamed: 0,cylinders,horsepower,weight,acceleration,model_year,origin
cylinders,1.0,0.842983,0.897527,-0.504683,-0.345647,0.556302
horsepower,0.842983,1.0,0.864538,-0.689196,-0.416361,0.44733
weight,0.897527,0.864538,1.0,-0.416839,-0.30912,0.521609
acceleration,-0.504683,-0.689196,-0.416839,1.0,0.290316,-0.264409
model_year,-0.345647,-0.416361,-0.30912,0.290316,1.0,-0.066892
origin,0.556302,0.44733,0.521609,-0.264409,-0.066892,1.0


In [64]:
# Invert the reduced-feature correlation matrix and label it with the feature names.
# The diagonal holds the VIF of each remaining feature.
pd.DataFrame(
    data=np.linalg.inv(corr.to_numpy()),
    index=corr.index,
    columns=corr.columns,
)

Unnamed: 0,cylinders,horsepower,weight,acceleration,model_year,origin
cylinders,6.248153,-0.684249,-4.263092,0.60716,0.329615,-0.763517
horsepower,-0.684249,8.904766,-5.586312,3.291935,0.804295,0.235367
weight,-4.263092,-5.586312,8.714488,-2.321984,-0.452881,-0.319312
acceleration,0.60716,3.291935,-2.321984,2.599406,0.114485,0.095786
model_year,0.329615,0.804295,-0.452881,0.114485,1.262721,-0.192188
origin,-0.763517,0.235367,-0.319312,0.095786,-0.192188,1.498486


In [73]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

# Build the design matrix for the statsmodels VIF check: df_fe (the full feature
# set, `displacement` included) with an intercept column added, minus any rows
# that contain missing values.
x = add_constant(df_fe).dropna()

In [74]:
# Tabulate one VIF per column of the design matrix `x` (intercept column included),
# next to the corresponding column name.
vif_data = pd.DataFrame(
    {
        "feature_name": x.columns,
        "VIF": [
            variance_inflation_factor(x.values, column_position)
            for column_position in range(len(x.columns))
        ],
    }
)

print(vif_data)

   feature_name         VIF
0         const  757.568654
1     cylinders   10.656203
2  displacement   22.395948
3    horsepower    9.626519
4        weight   10.800432
5  acceleration    2.625845
6    model_year    1.284203
7        origin    1.708612
