In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [None]:
# Load the dataset from 'capped_data.csv' (path is relative to the working
# directory) and preview its first rows.
capped_data = pd.read_csv('capped_data.csv')
capped_data.head()

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,...,state_ South Dakota,state_ Tennessee,state_ Texas,state_ Utah,state_ Vermont,state_ Virginia,state_ Washington,state_ West Virginia,state_ Wisconsin,state_ Wyoming
0,1397.0,330.5,164.9,489.8,61898.0,154151.5,11.2,209.126941,39.3,36.9,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,173.0,70.0,161.3,411.6,48127.0,43269.0,18.6,23.111234,33.0,32.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,102.0,50.0,174.7,349.7,49348.0,21026.0,14.6,47.560164,45.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,427.0,202.0,194.8,430.4,44243.0,75882.0,17.1,209.126941,42.8,42.2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,57.0,26.0,144.4,350.1,49955.0,10321.0,12.5,0.0,48.3,47.8,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# (number of rows, number of columns) of the loaded frame.
capped_data.shape

(3047, 1904)

In [None]:
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# including any that pandas or statsmodels may raise in the cells below.
warnings.filterwarnings("ignore")

# This function computes the pairwise correlations between the given columns and returns every column whose absolute correlation with an earlier column is greater than 0.8

In [None]:
def correlation_among_numeric_features(df, cols, threshold=0.8):
  """Return the columns of ``df[cols]`` that are highly correlated with an earlier column.

  The pairwise correlation matrix of ``df[cols]`` is computed with
  ``DataFrame.corr()``. A column is flagged when the absolute value of its
  correlation with at least one column that precedes it exceeds ``threshold``.
  Only the later column of each correlated pair is flagged, so the first
  member of every pair is kept.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the data. The frame passed in is the one that is used;
      the function no longer reads the notebook-level ``capped_data``.
  cols : list-like of column labels
      Columns of ``df`` to compare.
  threshold : float, default 0.8
      Absolute-correlation cutoff; the comparison is strict (``>``).

  Returns
  -------
  set
      Labels of the flagged columns.
  """
  corr = df[cols].corr()

  # Strict lower triangle (k=-1): every pair is looked at once and the
  # diagonal (self-correlation of 1.0) is excluded. NaN correlations compare
  # as False, so they are never flagged.
  is_high = np.tril(corr.abs().to_numpy() > threshold, k=-1)

  # Row i is flagged when it is highly correlated with any column j < i.
  return set(corr.columns[is_high.any(axis=1)])

In [None]:
# Flag highly correlated columns, comparing every column of the frame.
# The result is a set, so the order in which it prints is arbitrary.
corr_features = correlation_among_numeric_features(capped_data, capped_data.columns)
print(corr_features)

{'lower_bound', 'povertyPercent', 'PctPrivateCoverage', 'PctBlack', 'upper_bound', 'MedianAgeFemale', 'state_ District of Columbia', 'median', 'popEst2015', 'PctPrivateCoverageAlone', 'PctPublicCoverageAlone', 'PctEmpPrivCoverage', 'PctMarriedHouseholds', 'MedianAgeMale'}


In [None]:
# Drop exactly the columns flagged in `corr_features` rather than a
# hand-copied list of names, so this cell stays correct when the data or the
# correlation threshold changes. Sorted to give a stable, readable order.
# NOTE: the (misspelt) name `highy_corr_cols` is kept in case later cells use it.
highy_corr_cols = sorted(corr_features)

# Preserve the original column order of the frame; set membership is O(1).
cols = [col for col in capped_data.columns if col not in corr_features]
len(cols)

1890

In [None]:
from sklearn.model_selection import train_test_split

def split_data(dataframe, target_column, test_size = 0.2, random_state = 0):
  """Split a frame into train/test features and target.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing both the feature columns and the target column.
  target_column : str
      Name of the column to predict; it is removed from the features.
  test_size : float, default 0.2
      Forwarded to ``train_test_split`` (previously hard-coded).
  random_state : int, default 0
      Forwarded to ``train_test_split`` so the split is reproducible
      (previously hard-coded).

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)``.
  """
  X = dataframe.drop(target_column, axis = 1)
  y = dataframe[target_column]
  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = test_size, random_state = random_state
  )
  return X_train, X_test, y_train, y_test

In [None]:
# Keep only the retained columns, then hold out a test split with
# "TARGET_deathRate" as the response.
X_train, X_test, y_train, y_test = split_data(capped_data[cols], "TARGET_deathRate")

In [None]:
import statsmodels.api as sm

# Now I create a linear model with an intercept: `add_constant` adds a column of ones to the features so that the model can fit a bias term

In [None]:
def lr_model(X_train, y_train):
  """Fit an ordinary-least-squares model with an intercept term.

  A constant column is added to ``X_train`` via ``sm.add_constant`` and the
  fitted ``sm.OLS`` results object is returned.
  """
  design_matrix = sm.add_constant(X_train)
  return sm.OLS(y_train, design_matrix).fit()

# `summary()` gives an extensive description of the regression results

In [None]:
# Fit OLS on the training split and print the text summary of the fit.
lr = lr_model(X_train, y_train)
summary = lr.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:       TARGET_deathRate   R-squared:                       0.863
Model:                            OLS   Adj. R-squared:                  0.599
Method:                 Least Squares   F-statistic:                     3.272
Date:                Thu, 29 Jun 2023   Prob (F-statistic):           2.50e-73
Time:                        22:07:06   Log-Likelihood:                -9126.6
No. Observations:                2437   AIC:                         2.146e+04
Df Residuals:                     833   BIC:                         3.076e+04
Df Model:                        1603                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

# I need to extract some information from the fitted linear model rather than from the printed summary: the p-values, the R-squared value, and the adjusted R-squared value

# Then I identify the significant variables using a p-value threshold of 0.05: a variable whose p-value is below the threshold is significant, because we reject the null hypothesis that its coefficient is zero

In [None]:
def identify_significant_vars(lr, p_value_threshold = 0.05):
  """Print fit diagnostics and return the statistically significant variables.

  Prints ``lr.pvalues``, ``lr.rsquared`` and ``lr.rsquared_adj`` (in that
  order), then returns a list with the name of every variable whose p-value
  is strictly below ``p_value_threshold``, in the order they appear in
  ``lr.pvalues``.
  """
  for diagnostic in (lr.pvalues, lr.rsquared, lr.rsquared_adj):
    print(diagnostic)

  selected = []
  for name, p_value in lr.pvalues.items():
    if p_value < p_value_threshold:
      selected.append(name)
  return selected

# After filtering this messy data, 136 significant variables are left (the intercept `const` is one of them)

In [None]:
# Select the variables whose p-value is below the default 0.05 threshold and
# count them. The call also prints the p-values, R-squared and adjusted
# R-squared of `lr`.
significant_vars = identify_significant_vars(lr)
print(len(significant_vars))

const                   9.731519e-06
avgAnnCount             6.717864e-01
avgDeathsPerYear        1.137497e-01
incidenceRate           9.942575e-28
medIncome               7.807641e-01
                            ...     
state_ Virginia         4.837561e-06
state_ Washington       6.300737e-02
state_ West Virginia    5.907110e-04
state_ Wisconsin        5.167114e-01
state_ Wyoming          7.853907e-01
Length: 1890, dtype: float64
0.8629461361880499
0.5992038268356419
136


# `significant_vars` contains `const`, so I must add the constant column to `X_train` before selecting those columns

In [None]:
# `significant_vars` includes the label 'const', so the constant column has to
# exist in X_train before the frame can be indexed with that list.
# NOTE(review): this rebinds X_train, lr and summary, so the full-model fit from
# the earlier cell is no longer reachable under `lr` after this cell runs.
X_train = sm.add_constant(X_train)
# lr_model calls sm.add_constant again on the selected columns; presumably
# statsmodels skips adding a second constant when one is already present —
# TODO confirm (has_constant default).
lr = lr_model(X_train[significant_vars], y_train)
summary = lr.summary()
summary

0,1,2,3
Dep. Variable:,TARGET_deathRate,R-squared:,0.666
Model:,OLS,Adj. R-squared:,0.649
Method:,Least Squares,F-statistic:,38.23
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,0.0
Time:,22:24:31,Log-Likelihood:,-10210.0
No. Observations:,2437,AIC:,20660.0
Df Residuals:,2315,BIC:,21370.0
Df Model:,121,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,97.2965,5.223,18.629,0.000,87.054,107.539
incidenceRate,0.2037,0.008,26.754,0.000,0.189,0.219
MedianAge,-0.6902,0.077,-8.982,0.000,-0.841,-0.540
PctBachDeg25_Over,-1.3772,0.096,-14.397,0.000,-1.565,-1.190
PctPublicCoverage,0.8595,0.067,12.754,0.000,0.727,0.992
country_Aleutians West Census Area,69.5165,17.267,4.026,0.000,35.657,103.376
country_Ashland County,26.4934,11.628,2.278,0.023,3.690,49.296
country_Atkinson County,33.7752,16.423,2.057,0.040,1.569,65.981
country_Atoka County,-52.7315,16.523,-3.191,0.001,-85.133,-20.330

0,1,2,3
Omnibus:,100.471,Durbin-Watson:,2.045
Prob(Omnibus):,0.0,Jarque-Bera (JB):,261.85
Skew:,0.182,Prob(JB):,1.38e-57
Kurtosis:,4.564,Cond. No.,3.06e+22
