# DATA EXPLORATION

In [1]:
#libraries required for data exploration
import numpy as np
import pandas as pd
from time import time

In [2]:
#reading data through pandas 
data = pd.read_csv("combined_dataset.csv")
orgdata=data.copy()
data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,num
0,67,1,4,160.0,286.0,0.0,2.0,108.0,1.0,2
1,67,1,4,120.0,229.0,0.0,2.0,129.0,1.0,1
2,37,1,3,130.0,250.0,0.0,0.0,187.0,0.0,0
3,41,0,2,130.0,204.0,0.0,2.0,172.0,0.0,0
4,56,1,2,120.0,236.0,0.0,0.0,178.0,0.0,0


In [3]:
data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,num
911,54,0,4,127.0,333.0,1.0,1.0,154.0,0.0,1
912,62,1,1,130.0,139.0,0.0,1.0,140.0,0.0,0
913,55,1,4,122.0,223.0,1.0,1.0,100.0,0.0,2
914,58,1,4,130.0,385.0,1.0,2.0,140.0,0.0,0
915,62,1,2,120.0,254.0,0.0,2.0,93.0,1.0,1


In [4]:
#checking all column names
cols=data.columns.tolist()
cols

['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'num']

In [5]:
#information regarding data in the table
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       916 non-null    int64  
 1   sex       916 non-null    int64  
 2   cp        916 non-null    int64  
 3   trestbps  916 non-null    float64
 4   chol      916 non-null    float64
 5   fbs       916 non-null    float64
 6   restecg   916 non-null    float64
 7   thalach   916 non-null    float64
 8   exang     916 non-null    float64
 9   num       916 non-null    int64  
dtypes: float64(6), int64(4)
memory usage: 71.7 KB


In [6]:
#To check number of null values for each column
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
num         0
dtype: int64

# DATA PREPROCESSING

# 1.Conversion of categeorical variables to numerical

In [7]:
#class with function to encode categeorical variable containing columns with Null values
class label_encoder_contain_missing_values :

        def __init__ (self) :    
            pass  

        def categorical_to_numeric (self,dataset):
            import numpy as np
            import pandas as pd
            
            self.dataset = dataset
            self.summary = None
            self.table_encoder= {}

            for index in self.dataset.columns :
                if self.dataset[index].dtypes == 'object' :               
                   column_data_frame = pd.Series(self.dataset[index],name='column').to_frame()
                   unique_values = pd.Series(self.dataset[index].unique())
                   i = 0
                   label_encoder = pd.DataFrame({'value_name':[],'Encode':[]})
                   while i <= len(unique_values)-1:
                         if unique_values.isnull()[i] == True : 
                           label_encoder = label_encoder.append({'value_name': unique_values[i],'Encode':np.nan}, ignore_index=True) #np.nan = -1
                         else:
                           label_encoder = label_encoder.append({'value_name': unique_values[i],'Encode':i}, ignore_index=True)
                         i+=1 

                   output = pd.merge(left=column_data_frame,right = label_encoder, how='left',left_on='column',right_on='value_name')
                   self.summary = output[['column','Encode']].drop_duplicates().reset_index(drop=True)
                   self.dataset[index] = output.Encode 
                   self.table_encoder.update({index:self.summary})
                    
                else :
                     pass
                     
            # ---- Show Encode Table ----- #               
            print('''\nLabel Encoding completed in Successfully.\n
                       Next steps: \n
                       1.  To view table_encoder, Execute the follow: \n
                           for index in table_encoder :
                           print(f'\\n{index} \\n',table_encoder[index])
                           
                       2. For inverse, execute the follow : \n
                          df = label_encoder_contain_missing_values().
                               inverse_numeric_to_categorical(table_encoder, df) ''') 
                        
            return self.table_encoder  ,self.dataset 
        

        def inverse_numeric_to_categorical (self,table_encoder, df):
            dataset = df.copy()
            for column in table_encoder.keys():
                df_column = df[column].to_frame()
                output = pd.merge(left=df_column,right = table_encoder[column], how='left',left_on= column,right_on='Encode')#.rename(columns={'column_x' :'encode','column_y':'category'})
                df[column]= output.column
            print('\nInverse Label Encoding, from categorical to numerical completed in Successfully.\n')
            return df

In [8]:
#using if condition to check if there are null values in the data
if(data.isnull().sum().tolist()!=([0]*len(cols))):
    #encoding our data with categeorical columns with null values using our bespoke class
    table_encoder,data  = label_encoder_contain_missing_values().categorical_to_numeric(data)
else:
    #To  convert categeorical data to numerical data when there are no null values in categeorical data using inbuilt method
    from sklearn import preprocessing
    label_encoder = preprocessing.LabelEncoder()
    for i in cols:
        if (data[i].dtype in [np.str_,np.byte,np.ubyte]):
            data[i]=label_encoder.fit_transform(data[i])
    print("Successfully used Label Encoder")        
    
    
    


Successfully used Label Encoder


In [9]:
#View of data will all numerical variables
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,num
0,67,1,4,160.0,286.0,0.0,2.0,108.0,1.0,2
1,67,1,4,120.0,229.0,0.0,2.0,129.0,1.0,1
2,37,1,3,130.0,250.0,0.0,0.0,187.0,0.0,0
3,41,0,2,130.0,204.0,0.0,2.0,172.0,0.0,0
4,56,1,2,120.0,236.0,0.0,0.0,178.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...
911,54,0,4,127.0,333.0,1.0,1.0,154.0,0.0,1
912,62,1,1,130.0,139.0,0.0,1.0,140.0,0.0,0
913,55,1,4,122.0,223.0,1.0,1.0,100.0,0.0,2
914,58,1,4,130.0,385.0,1.0,2.0,140.0,0.0,0


# 2.To Perform Imputation

In [13]:
#To ask the user wht type o imputaion is to be done and imputation is done accordingly
while True:
    imputation=str(input())
    if(imputation=="median"):
        try:
            for i in cols:
                if(data[i].isnull().sum().tolist()!=[0]):
                    imp=data[i].median()
                    data[i].fillna(imp)
            print("successful")        
            break        
        except:
            print("Choose another imputaion")
            continue
    elif(imputation=="mode"):
        try:
            for i in cols:
                if(data[i].isnull().sum().tolist()!=[0]):
                    imp=data[i].mode()
                    data[i].fillna(imp)
            print("successful")           
            break        
        except:
            print("choose another imputaion")
            continue
    elif(imputation=="mean"):
        try:
            for i in cols:
                if(data[i].isnull().sum().tolist()!=[0]):
                    imp=data[i].mean()
                    data[i].fillna(imp)
            print("successful")           
            break        
        except:
            print("choose another imputaion")
            continue
    elif(imputation=="simpleimputer") :
        try:
            while True:
                strat=str(input("enter the strategy"))
                try:
                    from sklearn.impute import SimpleImputer
                    imp = SimpleImputer(missing_values=np.nan, strategy=strat)
                    data=pd.DataFrame(imp.fit_transform(data))
                    data.columns=cols
                    print("successful")   
                    break
                except:
                    print("enter proper strategy")
                    continue
            break        
        except:
            print("enter another imputation")
            continue
    elif(imputation=="knnimputer") :
        try:
            while True:
                neighbors=int(input("enter the no of neighbours"))
                try:
                    from sklearn.impute import KNNImputer
                    imp =KNNImputer(n_neighbors=neighbors)
                    data=pd.DataFrame(imp.fit_transform(data))
                    data.columns=cols
                    print("successful")   
                    break
                except:
                    print("enter proper no of neighbours")
                    continue
            break
        except:
            print("enter proper imputation")
            continue
    else:
        print("Enter proper imputation")
        pass
            
    
        
        
            

knnimputer
enter the no of neighbours3
successful


In [14]:
#To view data once imputation is done 
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,num
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,2.0
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,1.0
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,0.0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,0.0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
911,54.0,0.0,4.0,127.0,333.0,1.0,1.0,154.0,0.0,1.0
912,62.0,1.0,1.0,130.0,139.0,0.0,1.0,140.0,0.0,0.0
913,55.0,1.0,4.0,122.0,223.0,1.0,1.0,100.0,0.0,2.0
914,58.0,1.0,4.0,130.0,385.0,1.0,2.0,140.0,0.0,0.0


# 3.Split dependent and independent variables form data

In [15]:
#To seperate independent and dependent elements of the data
while True:
    #By observing the data user gives column name of which is to be considered as target
    target=str(input("enter target column name "))
    X=data.iloc[:,:]
    if(target=="auto"):
        X=data.iloc[:,:-1]
        Y=data.iloc[:,-1]
        target=cols[-1]
        break
    elif(target in cols):
        X=X.drop([target],axis=1)
        Y=data.iloc[:,cols.index(target)]
        break
    else:
        print("enter the correct target variable")
        pass
    

enter target column name auto


In [16]:
#independent variables
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0
...,...,...,...,...,...,...,...,...,...
911,54.0,0.0,4.0,127.0,333.0,1.0,1.0,154.0,0.0
912,62.0,1.0,1.0,130.0,139.0,0.0,1.0,140.0,0.0
913,55.0,1.0,4.0,122.0,223.0,1.0,1.0,100.0,0.0
914,58.0,1.0,4.0,130.0,385.0,1.0,2.0,140.0,0.0


In [17]:
#dependent variables
Y

0      2.0
1      1.0
2      0.0
3      0.0
4      0.0
      ... 
911    1.0
912    0.0
913    2.0
914    0.0
915    1.0
Name: num, Length: 916, dtype: float64

# 4.To select columns with significant correlation 

In [20]:
#To find out corelation of each column on the output

Corelationlist=data.corr()[target].abs().tolist()
data.corr()[target].abs().sort_values(ascending=False)

#To selecet elements with co-relation value greater that the specifie values
Corr_Threshold=float(input("Enter the minimum corelation to be considered"))

for i in range(len(Corelationlist)):
    if(Corelationlist[i]<Corr_Threshold):
        print(cols[i],"Removed")
        data.drop(cols[i],axis='columns', inplace=True)
print(data)

Enter the minimum corelation to be considered0.01
      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  num
0    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0  2.0
1    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0  1.0
2    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0  0.0
3    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0  0.0
4    56.0  1.0  2.0     120.0  236.0  0.0      0.0    178.0    0.0  0.0
..    ...  ...  ...       ...    ...  ...      ...      ...    ...  ...
911  54.0  0.0  4.0     127.0  333.0  1.0      1.0    154.0    0.0  1.0
912  62.0  1.0  1.0     130.0  139.0  0.0      1.0    140.0    0.0  0.0
913  55.0  1.0  4.0     122.0  223.0  1.0      1.0    100.0    0.0  2.0
914  58.0  1.0  4.0     130.0  385.0  1.0      2.0    140.0    0.0  0.0
915  62.0  1.0  2.0     120.0  254.0  0.0      2.0     93.0    1.0  1.0

[916 rows x 10 columns]


# 5.Splitting Data into Training and testing sets

In [22]:
#splitting into training an dtesting data
Test_size=float(input("Enter the size of test data"))
from sklearn.model_selection import train_test_split
X_train,X_test, Y_train, Y_test = train_test_split(X, Y, test_size =Test_size, random_state = 0)

Enter the size of test data0.7


# 6.Scaling the independent variables

In [23]:
#Scaling the data to a smaller value range using standard scaler
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

# FUNCTIONS TO CHECK DIFFERENT METRICS OF EVALUATION

In [25]:
def acc(pred):
    from sklearn.metrics import accuracy_score
    acc=accuracy_score(pred,Y_test)
    return acc*100

# IMPLEMENTING DIFFERENT ML ALGORITHMS

In [27]:
#importing linear regression
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train,Y_train)
pred=LR.predict(X_test)
LRA=acc(pred)
print("logistic regression accuracy=",acc(pred))


from sklearn.neighbors import KNeighborsClassifier
KNN= KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train,Y_train)
pred=KNN.predict(X_test) 
KNNA=acc(pred)
print("KNN accuracy=",acc(pred))


from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=0)
DT.fit(X_train,Y_train)
pred=DT.predict(X_test)
DTA=acc(pred)
print("DT accuracy=",acc(pred))


from sklearn import svm
SVM=svm.SVC(probability=True)
SVM.fit(X_train,Y_train)
pred=SVM.predict(X_test)
SVMA=acc(pred)
print("SVM accuracy=",acc(pred))


from sklearn.naive_bayes import GaussianNB
GNB= GaussianNB()
pred= GNB.fit(X_train, Y_train).predict(X_test)
GNBA=acc(pred)
print("GNB accuracy=",acc(pred))


from sklearn.ensemble import RandomForestClassifier
RFC= RandomForestClassifier(max_depth=2, random_state=0)
pred= RFC.fit(X_train, Y_train).predict(X_test)
RFCA=acc(pred)
print("RFC accuracy=",acc(pred))


from sklearn.ensemble import GradientBoostingClassifier
GBC= GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)
pred= GBC.fit(X_train, Y_train).predict(X_test)
GBCA=acc(pred)
print("GBC accuracy=",acc(pred))


from sklearn.ensemble import AdaBoostClassifier
ABC= AdaBoostClassifier(n_estimators=100, random_state=0)
pred= ABC.fit(X_train, Y_train).predict(X_test)
ABCA=acc(pred)
print("ABC accuracy=",acc(pred))


from sklearn.neural_network import MLPClassifier
MLP= MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
pred= MLP.fit(X_train, Y_train).predict(X_test)
MLPA=acc(pred)
print("MLP accuracy=",acc(pred))


# hard voting classifier
from sklearn.ensemble import VotingClassifier
VCH= VotingClassifier(estimators=[('10',LR),('11',KNN),('12',DT),('13',SVM),('14',GNB),('15',RFC),('16',GBC),('17',ABC),('18',MLP)], voting='hard')
pred= VCH.fit(X_train, Y_train).predict(X_test)
VCHA=acc(pred)
print("VCH accuracy=",acc(pred))


# soft voting classifier
from sklearn.ensemble import VotingClassifier
VCS= VotingClassifier(estimators=[('1',LR),('2',KNN),('3',DT),('4',SVM),('5',GNB),('6',RFC),('7',GBC),('8',ABC),('9',MLP)], voting='soft')
pred= VCS.fit(X_train, Y_train).predict(X_test)
VCSA=acc(pred)
print("VCS accuracy=",acc(pred))


#xgboost algorithm
import xgboost as xgb
from sklearn.metrics import r2_score
XGB = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
pred= XGB.fit(X_train, Y_train).predict(X_test)
XGBA=r2_score(Y_test,pred)*100
print("XGBA accuracy=",XGBA)
#Grid search cv algorithm
#To be written after discussion





logistic regression accuracy= 54.361370716510905
KNN accuracy= 51.71339563862928
DT accuracy= 46.41744548286604
SVM accuracy= 53.58255451713395
GNB accuracy= 35.51401869158878
RFC accuracy= 55.45171339563863
GBC accuracy= 49.68847352024922
ABC accuracy= 32.55451713395638


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLP accuracy= 53.58255451713395


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


VCH accuracy= 54.67289719626168


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


VCS accuracy= 53.11526479750779
XGBA accuracy= 6.895176591119911


# Finding the One with Best Accuracy

In [28]:
Accuracies=[LRA,KNNA,DTA,SVMA,GNBA,RFCA,GBCA,ABCA,MLPA,VCHA,VCSA,XGBA]
Accuracy_labels=['LR','KNN','DT','SVM','GNB','RFC','GBC','ABC','MLP','VCH','VCS','XGB']
Models=[LR,KNN,DT,SVM,GNB,RFC,GBC,ABC,MLP,VCH,VCS,XGB]
Max_accuracy=Accuracy_labels[Accuracies.index(max(Accuracies))]
Model=Models[Accuracies.index(max(Accuracies))]
print(Max_accuracy,"\nwith acc =",max(Accuracies),'%')

RFC 
with acc = 55.45171339563863 %


# IMPLEMENTING ANN ALGORITHMS BASED ON USERS HYPER PARAMETERS

# 1.simple Ann

In [29]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LeakyReLU
from keras.layers import Dropout


In [32]:
Ann=Sequential()
#No of layers
n=int(input("Enter the number of layers"))
for i in range(n):
    print("layer",i+1)    
    #no of units in each layer
    u=int(input("enter the no of units"))
    #kernel initializer
    init=str(input("enter the weight initialization function"))
    #activation function
    act=str(input("enter the activation function"))
    if(i==0):
        dim=len(X.columns.tolist())
        Ann.add(Dense(units=u,kernel_initializer=init,activation=act,input_dim=dim))
    else:
        Ann.add(Dense(units=u,kernel_initializer=init,activation=act))
    print("\n")    
    
print("ANN GENERATED")        

Enter the number of layers3
layer 1
enter the no of units10
enter the weight initialization functionhe_uniform
enter the activation functionrelu


layer 2
enter the no of units10
enter the weight initialization functionhe_uniform
enter the activation functionrelu


layer 3
enter the no of units9
enter the weight initialization functionhe_uniform
enter the activation functionrelu


ANN GENERATED


In [39]:
opt=str(input("enter the optimzer"))
los=str(input("enter the cost fuction to be used"))
met=str(input("enter the metric of valuaton"))
Ann.compile(optimizer=opt,loss=los,metrics=[met])
print("ANN COMPILED")

enter the optimzeradam
enter the cost fuction to be usedbinary_crossentropy
enter the metric of valuatonaccuracy
ANN COMPILED


In [None]:
ep=int(input("enter no of epochs"))
bs=int(input("enter the batch size"))
split=int(input("enter the % of data for valdation"))
model_history=Ann.fit(X_train, Y_train,epochs=ep,batch_size=bs,validation_split=(split/100))

enter no of epochs100
enter the batch size20


In [None]:
pred=Ann.predict(X_test)
pred=(pred>0.5)
AnnA=acc(pred)
print("Ann accuracy=",acc(pred))

# 2.Keras Tuner 


In [None]:
#Give the list of units t be considered
n=int(input("enter the no of values to be selected as units"))
l=[]
for i in range(n):
    l.append(int(input("enter the number")))

In [None]:
#usig the keras tuner
import keras_tuner as kt
from tensorflow import keras
def build_model(hp):
  model = keras.Sequential()
  model.add(keras.layers.Dense(
      hp.Choice('units',l),
      activation='relu'))
  model.add(keras.layers.Dense(1, activation='relu'))
  model.compile(loss='mse')
  return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=5)
tuner.search(X_train, Y_train, epochs=5, validation_data=(X_test, Y_test))
Ktann = tuner.get_best_models()[0]

In [None]:
pred=Ktann.predict(X_test)
pred=(pred>0.5)
KtannA=acc(pred)
print("Ktann accuracy=",acc(pred))


# TO CREATE PLOTS BASED ON USER INPUT

In [None]:
#Import vizualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
while True:
    Type_of_plot=str(input())
    if(Type_of_plot=="pairplot"):
        sns.set(style="ticks", color_codes=True)
        sns.pairplot(data)
        plt.show()
        break
    elif(Type_of_plot=="co_relation heat map"):
        heatmap = sns.heatmap(data.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
        heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':10}, pad=12)
        plt.show()
        break
    elif(Type_of_plot=="co_relation bar graph"):
        barcols=X.columns.tolist()
        barcorr=data.corr()
        barcorr=barcorr[target].values.tolist()
        for i in range(len(barcorr)):
            barcorr[i]=abs(barcorr[i])
        plt.bar(barcols,barcorr[:-1])
        plt.title('Bargraph of correlation with '+target)
        plt.xlabel('Attribute')
        plt.show()
        break
    elif(Type_of_plot=="custom"):
        while True:
            x_axis_col=str(input("enter the column name to be compared to target"))
            if(x_axis_col in X.columns.tolist()):
                x_axis=X[x_axis_col]
                y_axis=Y
                plt.bar(x_axis,y_axis)
                plt.grid(True)
                plt.title(target.upper()+"  vs  "+x_axis_col.upper(), fontsize=14)
                plt.xlabel(x_axis_col.upper(), fontsize=14)
                plt.ylabel(target.upper(), fontsize=14)
                plt.show()
                break
            else:
                print("Enter a valid column name")
                pass
        break   
            
    else:
        print("enter correct plot")
        pass
        
    
    
   
    