# Binary Classification Model for Disease Prediction


# Part 1: Installation of necessary libraries

In [2]:
import numpy as np
import pandas as pd
import os 
import random
#import tensorflow
from scipy.spatial import distance
import time


# Part 2: Download of patient data with corresponding disease status

In [3]:
pd.set_option("display.max_columns", None) # This ensures the full dataframe is visulised and not truncated
disease_data = pd.read_csv('Training.csv')

# Separate majority and minority classes
df_majority = disease_data[disease_data.Class==0]
df_minority = disease_data[disease_data.Class==1]

# Get the number of instances in the majority class
majority_count = df_majority.shape[0]

# Oversample minority class to have the same number of instances as the majority class
df_minority_oversampled = df_minority.sample(n=majority_count, replace=True, random_state=42)

# Concatenate the majority class DataFrame and the oversampled minority class DataFrame
disease_data = pd.concat([df_majority, df_minority_oversampled])
print(len(disease_data))

1018


# Part 3 : Exploratory Data Analysis


In [4]:
disease_data['EJ'] = disease_data['EJ'].map({'A': 0, 'B': 1})
disease_data=disease_data.dropna()
disease_data.describe()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EJ,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
count,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0,923.0
mean,0.547372,4106.318654,121.763091,47.702824,10.873988,5.64969,0.12025,10.686717,17.856899,5658.456893,22.063203,248.66775,116.27189,1421.01174,743.47635,73.950203,0.667977,94.283199,11.988867,0.030079,1.408499,0.685121,35.841466,1.351647,27.040277,47.159525,366.362511,0.710813,0.348798,172.726887,91.161456,26.455776,3.163631,1.926438,27.605021,9.48039,2.715435,1651.478431,0.447616,0.637053,71.640516,101.731443,57.609274,78.885038,9.804354,13045.457847,9.926261,6.809387,10.35941,0.416617,20.772994,120.622532,13610.246845,31.607873,53.938468,8.423114,0.516793
std,0.494297,3096.226699,136.997753,89.811711,9.545821,3.194858,0.893491,4.344597,106.831122,5029.23571,3.36383,235.6201,103.463458,8584.535633,3339.292783,132.253644,0.322052,47.61832,18.487053,0.014051,1.780248,0.271268,15.086568,0.564929,14.286202,22.335254,315.73501,1.399136,0.11014,108.212415,27.01704,8.835711,12.351639,1.322841,18.744709,5.779232,2.09089,1418.813101,2.212684,0.48111,38.597208,58.384218,319.782156,239.81279,75.997407,14974.41955,3.53402,12.064963,100.08081,1.102707,12.031864,126.37117,19248.966829,11.24332,36.518253,10.391324,0.499989
min,0.08546,192.59328,85.200147,3.177522,8.138688,0.699861,0.025578,3.396778,1.2299,1693.62432,9.8868,72.948951,1.331155,51.216883,257.432377,12.49976,0.176874,23.3876,0.510888,0.003184,1.050225,0.069225,13.784111,0.137925,7.03064,6.9064,35.998895,0.23868,0.040995,60.23247,10.3456,6.667048,0.005518,1.74307,0.804068,4.926396,0.286201,185.5941,0.003042,0.0,5.394675,78.526968,3.828384,7.534128,0.29685,1655.963088,3.58345,0.173229,0.49706,0.06773,4.874248,72.611063,13.038894,9.432735,0.897628,0.001129,0.0
25%,0.273472,2415.309435,85.200147,13.230384,8.138688,4.119435,0.025578,8.548663,1.2299,4089.861275,20.1267,155.95281,33.139508,414.193177,257.432377,23.556533,0.529775,67.60602,5.206192,0.023084,1.050225,0.51885,29.105117,1.006852,7.03064,31.09626,151.051665,0.23868,0.282865,109.82505,73.78236,20.62554,0.005518,1.74307,15.254832,6.880698,1.354416,1094.546688,0.003042,0.0,31.549518,78.526968,4.957656,25.24704,0.29685,5712.512327,8.029685,0.173229,0.49706,0.06773,13.687893,72.611063,2545.814529,24.78186,26.087796,0.096377,0.0
50%,0.414481,3503.18958,85.200147,22.394407,8.138688,5.085066,0.025578,10.757514,2.519538,4902.30445,22.5984,198.728559,82.16691,594.943346,257.432377,43.813497,0.631361,83.735536,8.817379,0.027462,1.050225,0.683475,34.186548,1.307529,35.729304,46.26124,263.69623,0.23868,0.349824,140.831707,93.16128,25.378456,0.365541,1.74307,22.096148,8.470998,2.346042,1456.534713,0.09126,1.0,81.404466,78.526968,24.945264,34.489056,1.798911,8016.540306,9.559542,3.87897,1.17798,0.243828,18.408658,72.611063,7669.056672,29.701022,45.259484,0.290093,1.0
75%,0.645223,4889.85805,99.687568,42.131588,9.621429,6.502506,0.034561,12.960063,6.780263,5921.440935,24.3639,256.658733,157.485275,1009.136523,257.432377,81.436856,0.740217,103.729952,12.921209,0.03383,1.26027,0.816375,39.568732,1.616481,37.961132,60.53091,464.499725,0.671814,0.40995,201.030293,107.94934,30.533988,2.772594,1.74307,35.253216,10.647942,3.490846,1876.796537,0.322452,1.0,109.125159,110.412328,47.662368,54.723816,6.168543,13874.57486,11.172095,8.153058,1.630525,0.535067,24.441005,116.463039,15159.38733,36.335565,71.438586,21.978,1.0
max,4.435374,28688.18766,1910.123198,630.51823,178.943634,38.27088,10.315851,38.971568,1463.693448,53060.59924,28.9542,2447.81055,344.644105,179250.2529,50092.4593,2271.436167,4.103032,485.169816,200.967526,0.224074,31.688153,3.039675,267.942823,4.951507,64.521624,210.33092,2103.40519,16.155828,1.060404,1049.168078,326.2362,61.197632,161.355315,25.19293,152.355164,94.95858,18.324926,30243.75878,42.569748,1.0,109.125159,1063.594578,6501.26448,3030.655824,1578.654237,143224.6823,35.851039,137.932739,1244.22702,31.365763,135.781294,1497.351958,143790.0712,81.210825,191.194764,21.978,1.0


In [5]:
print(len(disease_data))
print(len(disease_data.columns))
disease_data= disease_data.reset_index(drop=True)
disease_data=disease_data.drop("Id",axis=1)
disease_data_class=disease_data["Class"]
disease_data=disease_data.drop("Class",axis=1)
print(disease_data)

923
58
           AB           AF          AH          AM         AR         AX   
0    0.145282    978.76416   85.200147   36.968889   8.138688   3.632190  \
1    0.470030   2635.10654   85.200147   32.360553   8.138688   6.732840   
2    0.252107   3819.65177  120.201618   77.112203   8.138688   3.685344   
3    0.209377   2615.81430   85.200147    8.541526   8.138688   4.013127   
4    0.348249   1733.65412   85.200147    8.377385  15.312480   1.913544   
..        ...          ...         ...         ...        ...        ...   
918  0.769140   9762.58976  238.887870   47.185892   8.138688   5.315400   
919  1.734838   9189.95242   85.200147  630.518230   8.138688  11.596431   
920  0.260653   5010.12842   85.200147   11.068678   8.138688   2.693136   
921  0.965698  18720.82960   85.200147   21.291875   9.627984   6.183582   
922  1.008428   2913.38234   85.200147   36.185348   8.138688   6.520224   

           AY         AZ         BC         BD        BN          BP   
0    0.0

# Part 4 : Generating New Data Features

In [6]:
def euclidean_distance(sample1, sample2):
    """Calculate the Euclidean distance between two samples."""
    return distance.euclidean(sample1, sample2)

In [7]:
def euclidean_feature_generator(Sample_List):
    euclidean_feature_dataframe=pd.DataFrame()
    
    for Sample in range(0,len(Sample_List)):#0:640
        SamplePairsData=[]

        for Alternate_Sample in range(0,len(Sample_List)):
            SamplePairsData.append(euclidean_distance(Sample_List.iloc[Sample,:],Sample_List.iloc[Alternate_Sample,:]))

        temp_df = pd.DataFrame([SamplePairsData])
        euclidean_feature_dataframe = pd.concat([euclidean_feature_dataframe, temp_df], ignore_index=False)
    return(euclidean_feature_dataframe)


In [8]:
def TrainingDataSelection(Samples,DataFrame,Euclidean_Distance_DataFrame):#The function takes two arguments: Samples, which is a list of indices representing the rows of the DataFrame to be selected, and DataFrame, which is the original DataFrame from which the data is to be selected.
    New_Euclidean_Distance_DataFrame = pd.DataFrame(np.zeros(Euclidean_Distance_DataFrame.shape), columns=Euclidean_Distance_DataFrame.columns, index=Euclidean_Distance_DataFrame.index)
    Absent=[]
    for row in range(0,len(Euclidean_Distance_DataFrame)):

        if row in Samples:
            for sampletosample in range(0,len(Euclidean_Distance_DataFrame.columns)):
                if sampletosample in Samples:
                    New_Euclidean_Distance_DataFrame.iloc[row,sampletosample]=Euclidean_Distance_DataFrame.iloc[row,sampletosample]
        else:
            Absent.append(row)

    DataFrame= DataFrame.reset_index(drop=True)
    New_Euclidean_Distance_DataFrame= New_Euclidean_Distance_DataFrame.reset_index(drop=True)
    Final_DataFrame = pd.concat([DataFrame, New_Euclidean_Distance_DataFrame], axis=1)
    Final_DataFrame=Final_DataFrame.drop(Absent)
    return(Final_DataFrame)

In [9]:
from sklearn.model_selection import train_test_split
def split_data(df, test_size=0.2, val_size=0.25, random_state=None):

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    train_df, val_df = train_test_split(train_df, test_size=val_size, random_state=random_state)

    train_indices = train_df.index.tolist()
    val_indices = val_df.index.tolist()
    test_indices = test_df.index.tolist()

    return (train_df, train_indices), (val_df, val_indices), (test_df, test_indices)
(train_df, train_indices),(val_df, val_indices), (test_df, test_indices)= split_data(disease_data)
if 0 in train_indices:
    print("blue")

blue


In [10]:
Euclidean_Distance_DataFrame=(euclidean_feature_generator(disease_data))
#TrainingDataSelection(train_indices,disease_data,Euclidean_Distance_DataFrame)

In [None]:

TrainingData=TrainingDataSelection(train_indices,disease_data,Euclidean_Distance_DataFrame)
TestData=TrainingDataSelection(test_indices,disease_data,Euclidean_Distance_DataFrame)

In [None]:
TrainingData.to_excel('Training Data.xlsx', index=False)
TrainingLabels=disease_data_class.iloc[train_indices]
TestData.to_excel("Test Data.xlsx",index=False)
TestLabels=disease_data_class.iloc[test_indices]
disease_data.to_excel("disease data.xlsx", index=False)

# Part 5 : Creation of a machine learning model

In [None]:
import tensorflow as tf
Training_Numpy_Array=TrainingData.values
Training_Tensor=tf.convert_to_tensor(Training_Numpy_Array)

Test_Numpy_Array=TestData.values
Test_Tensor=tf.convert_to_tensor(Test_Numpy_Array)

Training_Label_Numpy_Array=TrainingLabels.values
Training_Label_Tensor=tf.convert_to_tensor(Training_Label_Numpy_Array)

Test_Label_Numpy_Array=TestLabels.values
Test_Label_Tensor=tf.convert_to_tensor(Test_Label_Numpy_Array)

In [None]:
Training_Tensor.shape,Test_Tensor.shape,Training_Label_Tensor.shape,Test_Label_Tensor.shape

(TensorShape([553, 979]),
 TensorShape([185, 979]),
 TensorShape([553]),
 TensorShape([185]))

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
DiseaseModel = keras.Sequential([
    layers.Dense(5000,activation="relu"),
    layers.Dense(250,activation="relu"),
    layers.Dense(1,activation="sigmoid")
])
DiseaseModel.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])
Blackbox=DiseaseModel.fit(Training_Tensor,Training_Label_Tensor, epochs=1500, batch_size=30)
Results = DiseaseModel.evaluate(Test_Tensor, Test_Label_Tensor)

NameError: name 'Training_Tensor' is not defined

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Make predictions
predictions = DiseaseModel.predict(Test_Tensor)
print(predictions)
# Convert probabilities to class labels: class 1 if probability > 0.5 else class 0
predicted_classes = (predictions > 0.5).astype(int)

# Assuming Test_Label_Tensor is already a numpy array
# If it is a TensorFlow tensor, you need to convert it using: Test_Label_Tensor = Test_Label_Tensor.numpy()

# Identify indices where true class is 1
indices_class_1 = np.where(Test_Label_Tensor == 1)

# Compute accuracy on class 1
accuracy_class_1 = accuracy_score(Test_Label_Tensor[indices_class_1], predicted_classes[indices_class_1])

print('Accuracy for class 1: ', accuracy_class_1)


[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got array([  0,   1,   2,   3,   8,  11,  16,  18,  22,  24,  27,  28,  31,
        32,  33,  34,  39,  40,  42,  43,  45,  48,  50,  52,  55,  59,
        60,  61,  62,  67,  70,  71,  74,  75,  83,  84,  86,  87, 100,
       102, 106, 108, 110, 113, 114, 121, 123, 124, 126, 128, 129, 130,
       131, 134, 137, 138, 139, 143, 145, 146, 147, 148, 150, 156, 159,
       160, 162, 163, 165, 166, 171, 174, 176, 177, 178, 181], dtype=int64)

In [None]:
len(disease_data_class),disease_data_class.sum()
ProportionSick=102/548
ProportionHealthy=446/548
print(ProportionSick,ProportionHealthy)

0.18613138686131386 0.8138686131386861


# Part 6 : Analysis of models efficiency and loss values

# Part 7 : Log of models