<a href="https://colab.research.google.com/github/Lee-Minsoo-97/Templates-for-Descrptv-Predctv_Anlyt/blob/main/Multicolinearity_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multicollinearity

This Python notebook examines collinearity among the explanatory variables in a regression analysis.





In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.anova import anova_lm
import matplotlib.pyplot as plt
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [None]:
# Read the salary demonstration workbook; the path is relative to the
# notebook's working directory.
SALARY_WORKBOOK = 'Salary Multicollinearity Demonstration.xlsx'
data = pd.read_excel(SALARY_WORKBOOK)
data

Unnamed: 0,Person,Gender,Age,Experience,Seniority,Salary
0,1,Male,56,29,28,126400
1,2,Male,30,5,5,62300
2,3,Male,34,8,4,48500
3,4,Male,43,15,14,87500
4,5,Female,42,15,7,68700
...,...,...,...,...,...,...
295,296,Male,44,18,8,58000
296,297,Male,38,12,9,72400
297,298,Male,47,16,11,70200
298,299,Male,45,18,18,102100


In [None]:
# Pairwise correlations between the numeric variables, rounded to 3 decimals.
corr_columns = ['Age', 'Experience', 'Seniority', 'Salary']
data.loc[:, corr_columns].corr().round(3)

Unnamed: 0,Age,Experience,Seniority,Salary
Age,1.0,0.984,0.884,0.857
Experience,0.984,1.0,0.905,0.882
Seniority,0.884,0.905,1.0,0.973
Salary,0.857,0.882,0.973,1.0


In [None]:
# Pairwise covariances between the numeric variables, rounded to 3 decimals.
cov_columns = ['Age', 'Experience', 'Seniority', 'Salary']
data.loc[:, cov_columns].cov().round(3)

Unnamed: 0,Age,Experience,Seniority,Salary
Age,118.344,99.599,68.853,208895.2
Experience,99.599,86.594,60.312,183957.0
Seniority,68.853,60.312,51.242,156062.8
Salary,208895.166,183956.977,156062.758,501868400.0


In [None]:
# Full specification: Salary regressed on Gender (Male as the reference
# level) plus all three numeric predictors.
full_formula = 'Salary ~ C(Gender, Treatment(reference = "Male")) + Age + Experience + Seniority'
fullmodel = smf.ols(formula=full_formula, data=data).fit()
print(fullmodel.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.952
Model:                            OLS   Adj. R-squared:                  0.951
Method:                 Least Squares   F-statistic:                     1449.
Date:                Mon, 23 Sep 2024   Prob (F-statistic):          1.68e-192
Time:                        08:21:19   Log-Likelihood:                -2976.2
No. Observations:                 300   AIC:                             5962.
Df Residuals:                     295   BIC:                             5981.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [None]:
# Design matrix for the VIF calculation on the full predictor set.
# `.assign` returns a new frame, so the column slice of `data` is never
# written to (the original `X['Gender'] = ...` assigned into a slice, which is
# what triggers SettingWithCopyWarning).
X = sm.add_constant(
    data[['Gender', 'Age', 'Experience', 'Seniority']].assign(
        # Dummy-code Gender as Male = 0, Female = 1. `map` leaves any other
        # label as NaN and `astype(int)` then raises, so an unexpected value
        # fails loudly instead of silently producing an object column.
        Gender=lambda frame: frame['Gender'].map({'Male': 0, 'Female': 1}).astype(int)
    )
)

# One VIF per design-matrix column, including the added 'const' column.
vif_data = pd.DataFrame({
    "feature": list(X.columns),
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})

In [None]:
# Display the VIF table built in the previous cell.
vif_data

Unnamed: 0,feature,VIF
0,const,177.25412
1,Gender,1.001383
2,Age,31.503655
3,Experience,38.15309
4,Seniority,5.594549


In [None]:
def calculate_r_squared_for_multicollinearity(df):
    """Return the auxiliary R-squared of every column in ``df``.

    Each column is regressed by OLS on all the other columns of ``df``
    (with a constant added via ``sm.add_constant``), and the R-squared of
    that fit is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the predictors to check against each other.

    Returns
    -------
    dict
        Maps each column label of ``df`` to the R-squared of its auxiliary
        regression.
    """
    def _auxiliary_r_squared(target):
        # Regress `target` on the remaining columns plus a constant.
        regressors = sm.add_constant(df.drop(columns=[target]))
        return sm.OLS(df[target], regressors).fit().rsquared

    return {label: _auxiliary_r_squared(label) for label in df.columns}

In [None]:
# Reduced specification: Age and Experience are dropped, leaving Gender
# (Male as the reference level) and Seniority.
reduced_formula = 'Salary ~ C(Gender, Treatment(reference = "Male")) + Seniority'
model_no_coll = smf.ols(formula=reduced_formula, data=data).fit()
print(model_no_coll.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.950
Method:                 Least Squares   F-statistic:                     2871.
Date:                Mon, 23 Sep 2024   Prob (F-statistic):          5.33e-195
Time:                        08:21:19   Log-Likelihood:                -2978.4
No. Observations:                 300   AIC:                             5963.
Df Residuals:                     297   BIC:                             5974.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                                       coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------

In [None]:
# Design matrix for the VIF calculation on the reduced predictor set.
# `.assign` returns a new frame, so the column slice of `data` is never
# written to (the original `X['Gender'] = ...` assigned into a slice, which is
# what triggers SettingWithCopyWarning).
X = sm.add_constant(
    data[['Gender', 'Seniority']].assign(
        # Dummy-code Gender as Male = 0, Female = 1. `map` leaves any other
        # label as NaN and `astype(int)` then raises, so an unexpected value
        # fails loudly instead of silently producing an object column.
        Gender=lambda frame: frame['Gender'].map({'Male': 0, 'Female': 1}).astype(int)
    )
)

# One VIF per design-matrix column, including the added 'const' column.
vif_data = pd.DataFrame({
    "feature": list(X.columns),
    "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
})
vif_data

Unnamed: 0,feature,VIF
0,const,3.433183
1,Gender,1.001097
2,Seniority,1.001097


In [None]:
# Auxiliary R-squared for the two predictors only; the 'const' column of X
# is left out of the selection.
predictors_only = X.loc[:, ['Gender', 'Seniority']]
calculate_r_squared_for_multicollinearity(predictors_only)

{'Gender': 0.0010960033515244882, 'Seniority': 0.0010960033515249323}