## Multicollinearity In Linear Regression

[Multicollinearity article on Analytics Vidhya](https://www.analyticsvidhya.com/blog/2020/03/what-is-multicollinearity/)

In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

In [3]:
# Load the advertising data; the first CSV column is used as the row index.
df_advertising = pd.read_csv('Advertising.csv', index_col=0)
# Predictors are the TV, radio and newspaper columns; the response is sales.
X = df_advertising[['TV', 'radio', 'newspaper']]
y = df_advertising['sales']
df_advertising.head()


Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# Add a `const` column of ones to the predictors so the OLS fit includes an intercept.
X = sm.add_constant(X)

In [5]:
# Display the design matrix to confirm the `const` column is present.
X

Unnamed: 0,const,TV,radio,newspaper
1,1.0,230.1,37.8,69.2
2,1.0,44.5,39.3,45.1
3,1.0,17.2,45.9,69.3
4,1.0,151.5,41.3,58.5
5,1.0,180.8,10.8,58.4
...,...,...,...,...
196,1.0,38.2,3.7,13.8
197,1.0,94.2,4.9,8.1
198,1.0,177.0,9.3,6.4
199,1.0,283.6,42.0,66.2


- [Ordinary Least Squares](https://www.statsmodels.org/stable/examples/notebooks/generated/ols.html)
- [statsmodels package](https://pypi.org/project/statsmodels/)

In [6]:
## Fit an OLS (ordinary least squares) model with an intercept on TV, radio and newspaper

model= sm.OLS(y, X).fit()


In [7]:
# Show the regression report: coefficients, standard errors, p-values and R-squared.
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,1.58e-96
Time:,20:41:35,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [10]:

# Pairwise correlations between the predictors; the intercept column is left out.
X.drop(columns='const').corr()


Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


In [11]:
# Load the salary data set (columns shown below: YearsExperience, Age, Salary).
df_salary = pd.read_csv('Salary_Data.csv')
df_salary.head()

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [12]:
# Split the salary frame into the response (Salary) and the predictors (YearsExperience, Age).
y = df_salary.loc[:, 'Salary']
X = df_salary.loc[:, ['YearsExperience', 'Age']]

In [13]:
## Add the intercept column before fitting OLS on YearsExperience and Age
X = sm.add_constant(X)
X

Unnamed: 0,const,YearsExperience,Age
0,1.0,1.1,21.0
1,1.0,1.3,21.5
2,1.0,1.5,21.7
3,1.0,2.0,22.0
4,1.0,2.2,22.2
5,1.0,2.9,23.0
6,1.0,3.0,23.0
7,1.0,3.2,23.3
8,1.0,3.2,23.3
9,1.0,3.7,23.6


In [14]:
# Fit ordinary least squares: Salary (endog) regressed on const, YearsExperience and Age (exog).
model = sm.OLS(endog=y, exog=X).fit()

In [15]:
# Show the regression report: coefficients, standard errors, p-values and R-squared.
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,323.9
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,1.35e-19
Time:,20:44:44,Log-Likelihood:,-300.35
No. Observations:,30,AIC:,606.7
Df Residuals:,27,BIC:,610.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6661.9872,2.28e+04,-0.292,0.773,-5.35e+04,4.02e+04
YearsExperience,6153.3533,2337.092,2.633,0.014,1358.037,1.09e+04
Age,1836.0136,1285.034,1.429,0.165,-800.659,4472.686

0,1,2,3
Omnibus:,2.695,Durbin-Watson:,1.711
Prob(Omnibus):,0.26,Jarque-Bera (JB):,1.975
Skew:,0.456,Prob(JB):,0.372
Kurtosis:,2.135,Cond. No.,626.0


In [15]:
# Correlation between YearsExperience and Age; the intercept column is left out.
X.drop(columns='const').corr()

Unnamed: 0,YearsExperience,Age
YearsExperience,1.0,0.987258
Age,0.987258,1.0


In [16]:
# Load a second salary data set (columns shown below: Gender, Age, Years of service,
# Education level, Salary).
# NOTE(review): this rebinds `df_salary`, which previously held Salary_Data.csv.
df_salary = pd.read_csv('Salary.csv')
df_salary.head()

Unnamed: 0,Gender,Age,Years of service,Education level,Salary
0,0.0,27.0,1.7,0.0,39343.0
1,1.0,26.0,1.1,1.0,43205.0
2,1.0,26.0,1.2,0.0,47731.0
3,0.0,27.0,1.6,1.0,46525.0
4,0.0,26.0,1.5,1.0,40891.0


In [19]:
# The response is the Salary column; every remaining column is used as a predictor.
y = df_salary['Salary']
X = df_salary.drop(columns='Salary')

In [24]:
# Keep only the first 23 rows of the predictors, then display them.
# NOTE(review): the reason for the 23-row cut-off is not shown here — confirm before reuse.
X = X.head(23)
X


Unnamed: 0,Gender,Age,Years of service,Education level
0,0.0,27.0,1.7,0.0
1,1.0,26.0,1.1,1.0
2,1.0,26.0,1.2,0.0
3,0.0,27.0,1.6,1.0
4,0.0,26.0,1.5,1.0
5,1.0,28.0,2.3,0.0
6,0.0,32.0,2.8,1.0
7,1.0,26.0,1.3,2.0
8,0.0,31.0,3.0,1.0
9,1.0,26.0,1.5,1.0


In [26]:
# Keep only the first 23 responses so that y lines up with the truncated X, then display them.
y = y.head(23)
y


0     39343.0
1     43205.0
2     47731.0
3     46525.0
4     40891.0
5     56642.0
6     60150.0
7     54445.0
8     64445.0
9     57189.0
10    55124.0
11    56452.0
12    59332.0
13    45989.0
14    49243.0
15    52787.0
16    53654.0
17    58423.0
18    56887.0
19    67189.0
20    63218.0
21    55794.0
22    56957.0
Name: Salary, dtype: float64

In [27]:
# Add a `const` column of ones so the OLS fit includes an intercept, then display the result.
X = sm.add_constant(X)
X

Unnamed: 0,const,Gender,Age,Years of service,Education level
0,1.0,0.0,27.0,1.7,0.0
1,1.0,1.0,26.0,1.1,1.0
2,1.0,1.0,26.0,1.2,0.0
3,1.0,0.0,27.0,1.6,1.0
4,1.0,0.0,26.0,1.5,1.0
5,1.0,1.0,28.0,2.3,0.0
6,1.0,0.0,32.0,2.8,1.0
7,1.0,1.0,26.0,1.3,2.0
8,1.0,0.0,31.0,3.0,1.0
9,1.0,1.0,26.0,1.5,1.0


In [28]:
# Fit ordinary least squares: Salary (endog) regressed on const, Gender, Age,
# Years of service and Education level (exog).
model = sm.OLS(endog=y, exog=X).fit()

In [29]:
# Show the regression report: coefficients, standard errors, p-values and R-squared.
model.summary()

0,1,2,3
Dep. Variable:,Salary,R-squared:,0.576
Model:,OLS,Adj. R-squared:,0.482
Method:,Least Squares,F-statistic:,6.124
Date:,"Thu, 24 Jun 2021",Prob (F-statistic):,0.00272
Time:,20:53:54,Log-Likelihood:,-226.97
No. Observations:,23,AIC:,463.9
Df Residuals:,18,BIC:,469.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4259.3593,1.67e+04,0.255,0.802,-3.09e+04,3.94e+04
Gender,4131.8177,2227.532,1.855,0.080,-548.054,8811.689
Age,1639.5192,726.269,2.257,0.037,113.684,3165.354
Years of service,-372.5818,2386.475,-0.156,0.878,-5386.380,4641.216
Education level,1271.7183,1581.921,0.804,0.432,-2051.775,4595.211

0,1,2,3
Omnibus:,0.142,Durbin-Watson:,0.913
Prob(Omnibus):,0.932,Jarque-Bera (JB):,0.345
Skew:,-0.113,Prob(JB):,0.842
Kurtosis:,2.444,Cond. No.,444.0


In [30]:
# Pairwise correlations between the predictors; the intercept column is left out.
X.drop(columns='const').corr()

Unnamed: 0,Gender,Age,Years of service,Education level
Gender,1.0,0.055125,0.027458,0.015954
Age,0.055125,1.0,0.868708,0.066057
Years of service,0.027458,0.868708,1.0,0.170843
Education level,0.015954,0.066057,0.170843,1.0



#### 1. `Age` and `Years of service` are highly correlated (r ≈ 0.87), so we can drop one of the two features
#### 2. Or keep both features as they are
