# Multi Collinearity

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tabulate
import statsmodels.api as sm

In [2]:
y=np.array([4,7,10,12,18,23])
x1=np.array([2,4,6,8,10,12])
x2=np.array([10,10,12,12,14,14])
x3=np.array([1,3,7,5,11,9])
x4=np.array([10,8,8,4,4,2])
indep=np.column_stack((x1,x2,x3,x4))

In [3]:
indep

array([[ 2, 10,  1, 10],
       [ 4, 10,  3,  8],
       [ 6, 12,  7,  8],
       [ 8, 12,  5,  4],
       [10, 14, 11,  4],
       [12, 14,  9,  2]])

### $\bar{x_{i}}$

In [4]:
x1m=np.mean(x1)
x2m=np.mean(x2)
x3m=np.mean(x3)
x4m=np.mean(x4)
means=np.column_stack((x1m,x2m,x3m,x4m))

### $x_{i}-\bar{x_{i}}$

In [5]:
x1s=x1-x1m
x2s=x2-x2m
x3s=x3-x3m
x4s=x4-x4m
A=np.column_stack((x1s,x2s,x3s,x4s))

In [6]:
A

array([[-5., -2., -5.,  4.],
       [-3., -2., -3.,  2.],
       [-1.,  0.,  1.,  2.],
       [ 1.,  0., -1., -2.],
       [ 3.,  2.,  5., -2.],
       [ 5.,  2.,  3., -4.]])

### $W=A^{'}A$

In [7]:
W=A.T.dot(A)

In [8]:
W

array([[ 70.,  32.,  62., -56.],
       [ 32.,  16.,  32., -24.],
       [ 62.,  32.,  70., -44.],
       [-56., -24., -44.,  48.]])

### $Q=\begin{bmatrix}1/\sqrt{W_{11}}&0&...&0\cr0&1/\sqrt{W_{22}}&...&0\cr\vdots&\vdots&\vdots&\vdots\cr0&0&0&1/\sqrt{W_{kk}}\end{bmatrix}$

In [9]:
def Q():
    a=1/np.sqrt(np.diag(W))
    b=np.diagflat(a)
    return b

In [10]:
q=Q()

In [11]:
q

array([[0.11952286, 0.        , 0.        , 0.        ],
       [0.        , 0.25      , 0.        , 0.        ],
       [0.        , 0.        , 0.11952286, 0.        ],
       [0.        , 0.        , 0.        , 0.14433757]])

### $ R=QWQ$

R is represent to correlation matrix. It is symmetric and its diagonal elements are one. The elements which except from diagonal give the correlation coefficients between independent variables.

In [12]:
R=((q.dot(W)).dot(q))

In [13]:
R

array([[ 1.        ,  0.95618289,  0.88571429, -0.96609178],
       [ 0.95618289,  1.        ,  0.95618289, -0.8660254 ],
       [ 0.88571429,  0.95618289,  1.        , -0.75907212],
       [-0.96609178, -0.8660254 , -0.75907212,  1.        ]])

In [14]:
np.linalg.det(R)

0.00013605442176870867

### To Calculate Multi Collinearity With Scipy

In [15]:
from scipy import stats as st

In [16]:
res=[]
def mult_cor(a):
    for i in range(0,a.shape[1],1):
        for j in range(0,a.shape[1],1):
            results=st.pearsonr(a[:,i],a[:,j])
            res.append(results)

In [17]:
mult_cor(indep)

In [18]:
def tabulate_r(res,a):
    sarr=np.array_split(res,a.shape[1])
    for i in range(0,a.shape[1],1):
        print(sarr[i][:,0])

#### Correlation Matrix

In [19]:
tabulate_r(res,indep)

[ 1.          0.95618289  0.88571429 -0.96609178]
[ 0.95618289  1.          0.95618289 -0.8660254 ]
[ 0.88571429  0.95618289  1.         -0.75907212]
[-0.96609178 -0.8660254  -0.75907212  1.        ]


In [20]:
def tabulate_p(res,a):
    sarr=np.array_split(res,a.shape[1])
    for i in range(0,a.shape[1],1):
        print(sarr[i][:,1])

#### P Values Of Correlation Coefficients

In [21]:
tabulate_p(res,indep)

[1.84889275e-32 2.83784593e-03 1.88454810e-02 1.70515748e-03]
[0.00283785 0.         0.00283785 0.02572142]
[1.88454810e-02 2.83784593e-03 1.84889275e-32 8.00768889e-02]
[0.00170516 0.02572142 0.08007689 0.        ]


### Some Indicaders of Multi Collinearity

In [22]:
y=np.array([10,15,16,19,20,24,26,30,35,38,40,45,48,50,55])
x1=np.arange(2,32,2)
x2=np.array([25,24,23,22,20,19,18,17,16,15,10,9,8,6,4])
x3=np.array([8,1,15,16,18,20,22,25,29,30,35,36,39,40,45])

In [23]:
X=np.column_stack((x1,x2,x3))

In [24]:
Xx=X.T.dot(X)

$det(X^{'}X)=0 ~~\to$ full multi collinearity <br>
$det(X^{'}X) \cong{0} ~~\to$ strong multi collinearity

In [25]:
np.linalg.det(Xx)

1147979384.0000157

### To Calculate Multi Collinearity With Eigenvalues

$\sum^{k+1}_{j=1}\frac{1}{\lambda_{j}}>30~~\to$ Multi collinearity

In [26]:
X

array([[ 2, 25,  8],
       [ 4, 24,  1],
       [ 6, 23, 15],
       [ 8, 22, 16],
       [10, 20, 18],
       [12, 19, 20],
       [14, 18, 22],
       [16, 17, 25],
       [18, 16, 29],
       [20, 15, 30],
       [22, 10, 35],
       [24,  9, 36],
       [26,  8, 39],
       [28,  6, 40],
       [30,  4, 45]])

In [27]:
def R(x):
    means = np.mean(x,axis=0)
    X= np.subtract(x,means)
    W=X.T.dot(X)
    a=1/np.sqrt(np.diag(W))
    Q=np.diagflat(a)
    return ((Q.dot(W)).dot(Q))

In [28]:
cor_mat=R(X)

In [29]:
cor_mat

array([[ 1.        , -0.98695878,  0.97991304],
       [-0.98695878,  1.        , -0.96624006],
       [ 0.97991304, -0.96624006,  1.        ]])

In [30]:
eig=np.linalg.eig(cor_mat)[0]

In [31]:
eig

array([2.95543309, 0.0101127 , 0.03445421])

In [32]:
print(f'{np.sum(1/eig)} -----> >30')

128.24794309638176 -----> >30


## $K=\frac{\lambda_{max}}{\lambda_{min}}$ <br>
$K<=100;~~$ There is no multi collinearity<br>
$100<K<1000;~~ $ Moderately multi collinearity <br>
$1000<=K;~~$ Strong multi collinearity

In [33]:
maxV=np.max(eig)
minV=np.min(eig)

In [34]:
K=maxV/minV

In [35]:
print(f'292,24 ----> moderately multi collinearity')

292,24 ----> moderately multi collinearity


### $K=\sqrt{\frac{\lambda_{max}}{\lambda_{i}}}$<br>
if K > 10, that indicates strong multi collinearity

In [36]:
print(f'K index values{np.max(eig)/eig}')

K index values[  1.         292.24963929  85.77857533]
