In [42]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import chi2_contingency

class ChiSquare:
    """Chi-square test of independence between two columns of a DataFrame.

    Each call to ``TestIndependence`` overwrites the result attributes with
    the outcome of the most recent test:

    * ``p``          -- p-value
    * ``chi2``       -- chi-square test statistic
    * ``dof``        -- degrees of freedom
    * ``dfObserved`` -- observed contingency table (rows: colY, columns: colX)
    * ``dfExpected`` -- expected frequencies, labelled like ``dfObserved``
    """

    def __init__(self, dataframe):
        """Keep a reference to ``dataframe``; results start out as ``None``."""
        self.df = dataframe

        # Outputs of the latest test: p-value, statistic, degrees of freedom.
        self.p = self.chi2 = self.dof = None

        # Observed / expected contingency tables of the latest test.
        self.dfObserved = self.dfExpected = None

    def _print_chisquare_result(self, colX, alpha):
        """Print a verdict for ``colX``: significant when ``self.p < alpha``."""
        if self.p < alpha:
            template = "{0} is IMPORTANT for Prediction"
        else:
            template = "{0} is NOT an important predictor. (Discard {0} from model)"

        print(template.format(colX))

    def TestIndependence(self, colX, colY, alpha=0.05):
        """Run a chi-square independence test of ``colX`` against ``colY``.

        Both columns are cast to ``str`` so every distinct value becomes one
        category of the contingency table. The test results are stored on the
        instance and a one-line verdict for ``colX`` is printed.

        Args:
            colX: name of the candidate predictor column.
            colY: name of the target column.
            alpha: significance level the p-value is compared against.
        """
        predictor = self.df[colX].astype(str)
        target = self.df[colY].astype(str)

        # Rows are the target categories, columns the predictor categories.
        self.dfObserved = pd.crosstab(target, predictor)

        statistic, p_value, degrees, expected_counts = stats.chi2_contingency(
            self.dfObserved.values
        )
        self.p = p_value
        self.chi2 = statistic
        self.dof = degrees

        # Re-attach the observed table's labels to the raw expected array.
        self.dfExpected = pd.DataFrame(
            expected_counts,
            columns=self.dfObserved.columns,
            index=self.dfObserved.index,
        )

        self._print_chisquare_result(colX, alpha)

# Load the data set; column names are supplied explicitly via `names`.
df = pd.read_csv(
    'C:\\Users\\ASUS\\Anaconda\\Heart Disease\\cleveland.csv',
    sep=',',
    names=[
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
    ],
)
# Drop every row that is missing a value in 'ca' or 'thal'.
df.dropna(subset=["ca", "thal"], inplace=True)

# Tester bound to the cleaned frame.
cT = ChiSquare(df)

# Feature selection: test every column except the target 'num' against it.
testColumns = [column for column in df.columns if column != 'num']
for var in testColumns:
    cT.TestIndependence(colX=var, colY="num")

age is NOT an important predictor. (Discard age from model)
sex is IMPORTANT for Prediction
cp is IMPORTANT for Prediction
trestbps is NOT an important predictor. (Discard trestbps from model)
chol is NOT an important predictor. (Discard chol from model)
fbs is NOT an important predictor. (Discard fbs from model)
restecg is IMPORTANT for Prediction
thalach is IMPORTANT for Prediction
exang is IMPORTANT for Prediction
oldpeak is IMPORTANT for Prediction
slope is IMPORTANT for Prediction
ca is IMPORTANT for Prediction
thal is IMPORTANT for Prediction
