## Multicollinearity handiling using VIF

Variance Inflation Factor 

In [1]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
df = pd.read_csv('./Concrete_Data.csv')
df

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cement            1030 non-null   float64
 1   slag              1030 non-null   float64
 2   flyash            1030 non-null   float64
 3   water             1030 non-null   float64
 4   superplasticizer  1030 non-null   float64
 5   coarseaggregate   1030 non-null   float64
 6   fineaggregate     1030 non-null   float64
 7   age               1030 non-null   int64  
 8   csMPa             1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [4]:
df.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


#### Checking null values

In [5]:
df.isnull().mean()*100

cement              0.0
slag                0.0
flyash              0.0
water               0.0
superplasticizer    0.0
coarseaggregate     0.0
fineaggregate       0.0
age                 0.0
csMPa               0.0
dtype: float64

#### Checking duplicated values

In [6]:
df.duplicated().sum()

25

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.columns

Index(['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age', 'csMPa'],
      dtype='object')

### Shape

In [9]:
df.shape

(1005, 9)

In [10]:
x = df[['cement', 'slag', 'flyash', 'water', 'superplasticizer',
       'coarseaggregate', 'fineaggregate', 'age']]

In [11]:
x = sm.add_constant(x)

In [12]:
x

Unnamed: 0,const,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,1.0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,1.0,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,1.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,1.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,1.0,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360
...,...,...,...,...,...,...,...,...,...
1025,1.0,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28
1026,1.0,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28
1027,1.0,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28
1028,1.0,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28


In [13]:
vif = pd.DataFrame()

In [14]:
vif["variable"] = x.columns

In [15]:
vif

Unnamed: 0,variable
0,const
1,cement
2,slag
3,flyash
4,water
5,superplasticizer
6,coarseaggregate
7,fineaggregate
8,age


In [16]:
vif["VIF"] = [sm.OLS(x[col],x.drop(col,axis=1)).fit().rsquared for col in x.columns]

"sm.OLS(x[col], x.drop(col, axis=1)).fit()":

This part of the code is using the statsmodels library to perform Ordinary Least Squares (OLS) regression. It's fitting a linear regression model to predict the values of the column "col" using all other columns except "col". This is essentially estimating the relationship between the selected column and all other columns, treating them as independent variables.


"rsquared": After fitting the OLS regression model, the rsquared attribute is used to retrieve the coefficient of determination (R-squared) value.

###### VIF = 1 / (1 - R^2)

In [17]:
vif.sort_values(by='VIF',ascending=False)

Unnamed: 0,variable,VIF
0,const,0.999849
1,cement,0.865754
2,slag,0.862321
7,fineaggregate,0.856575
4,water,0.853769
3,flyash,0.835649
6,coarseaggregate,0.798233
5,superplasticizer,0.651244
8,age,0.107233


#### Dropping some columns having higher VIF value

In [18]:
x1 = df[[ 'flyash','superplasticizer','fineaggregate',
       'coarseaggregate', 'age']]

In [19]:
x1 = sm.add_constant(x1)

In [20]:
vif1 = pd.DataFrame()

In [21]:
vif1['Variables'] = x1.columns

In [22]:
vif1

Unnamed: 0,Variables
0,const
1,flyash
2,superplasticizer
3,fineaggregate
4,coarseaggregate
5,age


In [23]:
vif1['VIF'] = [sm.OLS(x1[col],x1.drop(col,axis=1)).fit().rsquared for col in x1.columns]

In [24]:
vif1

Unnamed: 0,Variables,VIF
0,const,0.996807
1,flyash,0.183126
2,superplasticizer,0.256177
3,fineaggregate,0.072193
4,coarseaggregate,0.081826
5,age,0.062883


This shows that we can only take the variables ['fineaggregate','coarseaggregate','age'] . since VIF values of these columns are less than 10%(it's not good to take columns having value gretter than 10%)

In [25]:
x = df[['fineaggregate','coarseaggregate','age']].values
y = df[['csMPa']].values

In [26]:
rc = RobustScaler()

In [27]:
rc.fit_transform(x)

array([[-1.06230848,  0.72727273,  0.        ],
       [-1.06230848,  0.87878788,  0.        ],
       [-1.89989785, -0.36363636,  4.93877551],
       ...,
       [ 0.        , -0.76363636,  0.        ],
       [ 0.09090909,  0.21818182,  0.        ],
       [-0.18896834, -1.04545455,  0.        ]])

In [28]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=100,shuffle=True)

In [29]:
model = RandomForestRegressor(random_state=10)

### fitting to the model

In [30]:
model.fit(x_train,y_train)

In [31]:
y_pred = model.predict(x_test)

### Training accuracy

In [32]:
model.score(x_train,y_train)

0.9120300682408092

### Testing accuracy

In [33]:
model.score(x_test,y_test)

0.6773505968053766

### r2_score

In [34]:
r2_score(y_test,y_pred)

0.6773505968053766