In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer 
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import scipy.stats as stats
from scipy.stats import chi2_contingency

In [17]:
class ChiSquare:
    def __init__(self, dataframe):
        self.df = dataframe
        self.p = None #P-Value
        self.chi2 = None #Chi Test Statistic
        self.dof = None
        
        self.dfObserved = None
        self.dfExpected = None
        
    def _print_chisquare_result(self, colX, alpha):
        result = ""
        if self.p<alpha:
            result="{0} is IMPORTANT for Prediction".format(colX)
        else:
            result="{0} is NOT an important predictor. (Discard {0} from model)".format(colX)

        print(result)
        
    def TestIndependence(self,colX,colY, alpha=0.05):
        X = self.df[colX].astype(str)
        Y = self.df[colY].astype(str)
        
        self.dfObserved = pd.crosstab(Y,X) 
        chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
        self.p = p
        self.chi2 = chi2
        self.dof = dof 
        
        self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns, index = self.dfObserved.index)
        
        self._print_chisquare_result(colX,alpha)

In [18]:
dataset = pd.read_csv('C:\\Users\\ASUS\\Anaconda\\Heart Disease\\cleveland.csv', sep = ',', names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'])

In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB


In [20]:
dataset.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [21]:
#Mengganti Mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
idf=pd.DataFrame(imputer.fit_transform(dataset))
idf.columns=dataset.columns
idf.index=dataset.index
x = idf.drop('num',axis=1)
y = idf['num']

#imputer = imputer.fit(x[:,0:13])   
#x[:, 0:13] = imputer.transform(x[:, 0:13])

#Menghapus NA
dataset.dropna(subset = ["ca"], inplace=True)
dataset.dropna(subset = ["thal"], inplace=True)
X = dataset.drop('num',axis=1)
Y = dataset['num']

In [22]:
print(idf)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope        ca  thal  num  
0      3.0  0.000

In [23]:
#Encoding Categorical Data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
#cp
#ct = ColumnTransformer([("cp",OneHotEncoder(),[2])], remainder = 'passthrough')
#X = ct.fit_transform(X)

#restecg
#ct = ColumnTransformer([("restecg",OneHotEncoder(),[8])], remainder = 'passthrough')
#X = ct.fit_transform(X)

#slope
#ct = ColumnTransformer([("slope",OneHotEncoder(),[13])], remainder = 'passthrough')
#X = ct.fit_transform(X)

#ca
#ct = ColumnTransformer([("ca",OneHotEncoder(),[15])], remainder = 'passthrough')
#X = ct.fit_transform(X)

#thal
#ct = ColumnTransformer([("thal",OneHotEncoder(),[19])], remainder = 'passthrough')
#X = ct.fit_transform(X)

#from sklearn.preprocessing import StandardScaler
#scalerX = StandardScaler()
#X = scalerX.fit_transform(X)

In [24]:
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
297   57    0   4       140   241    0        0      123      1      0.2   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   

     slope   ca  thal  
0        3  0.0   6.0  
1        2  3.0   3.0  
2        2  2.0

In [25]:
#Initialize ChiSquare Class Mean dataset
cTMean = ChiSquare(idf)

#Feature Selection
testColumns = [ 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',]
for var in testColumns:
    cTMean.TestIndependence(colX=var,colY="num" )



age is NOT an important predictor. (Discard age from model)
sex is IMPORTANT for Prediction
cp is IMPORTANT for Prediction
trestbps is NOT an important predictor. (Discard trestbps from model)
chol is NOT an important predictor. (Discard chol from model)
fbs is NOT an important predictor. (Discard fbs from model)
restecg is IMPORTANT for Prediction
thalach is IMPORTANT for Prediction
exang is IMPORTANT for Prediction
oldpeak is IMPORTANT for Prediction
slope is IMPORTANT for Prediction
ca is IMPORTANT for Prediction
thal is IMPORTANT for Prediction


In [26]:
#Initialize ChiSquare Class Delete dataset
cTDelete = ChiSquare(dataset)

#Feature Selection
testColumns = [ 'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',]
for var in testColumns:
    cTDelete.TestIndependence(colX=var,colY="num" )

age is NOT an important predictor. (Discard age from model)
sex is IMPORTANT for Prediction
cp is IMPORTANT for Prediction
trestbps is NOT an important predictor. (Discard trestbps from model)
chol is NOT an important predictor. (Discard chol from model)
fbs is NOT an important predictor. (Discard fbs from model)
restecg is IMPORTANT for Prediction
thalach is IMPORTANT for Prediction
exang is IMPORTANT for Prediction
oldpeak is IMPORTANT for Prediction
slope is IMPORTANT for Prediction
ca is IMPORTANT for Prediction
thal is IMPORTANT for Prediction


In [27]:
from sklearn.model_selection import train_test_split
XTrain, XTest, yTrain, yTest = train_test_split(X, Y, test_size=0.3, random_state=0)

In [28]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
classifier.fit(XTrain,yTrain)
yPred = classifier.predict(XTest)
mse = mean_squared_error(yTest,yPred)
r = r2_score(yTest,yPred)
mae = mean_absolute_error(yTest,yPred)
accuracy = accuracy_score(yTest,yPred)
print("K Nearest Neighbors :")
print("Accuracy = ", accuracy)
print("Mean Squared Error:",mse)
print("R score:",r)
print("Mean Absolute Error:",mae)

K Nearest Neighbors :
Accuracy =  0.4888888888888889
Mean Squared Error: 2.2777777777777777
R score: -0.43456962911126684
Mean Absolute Error: 0.9666666666666667
