# Binary Classification Model for Disease Prediction


# Part 1: Import of necessary libraries

In [159]:
import numpy as np
import pandas as pd
import os 
import random
import tensorflow
from scipy.spatial import distance


# Part 2: Loading of patient data with corresponding disease status

In [160]:
# Lift the column display limit so that printed DataFrames show every column
# instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Read the training table from the CSV file in the working directory and
# print it for a first look.
disease_data = pd.read_csv('Training.csv')
print(disease_data)


               Id        AB          AF          AH          AM         AR   
0    000ff2bfdfe9  0.209377  3109.03329   85.200147   22.394407   8.138688  \
1    007255e47698  0.145282   978.76416   85.200147   36.968889   8.138688   
2    013f2bd269f5  0.470030  2635.10654   85.200147   32.360553   8.138688   
3    043ac50845d5  0.252107  3819.65177  120.201618   77.112203   8.138688   
4    044fb8a146ec  0.380297  3733.04844   85.200147   14.103738   8.138688   
..            ...       ...         ...         ...         ...        ...   
612  fd3dafe738fd  0.149555  3130.05946  123.763599    9.513984  13.020852   
613  fd895603f071  0.435846  5462.03438   85.200147   46.551007  15.973224   
614  fd8ef6377f76  0.427300  2459.10720  130.138587   55.355778  10.005552   
615  fe1942975e40  0.363205  1263.53524   85.200147   23.685856   8.138688   
616  ffcca4ded3bb  0.482849  2672.53426  546.663930  112.006102   8.138688   

           AX        AY         AZ          BC         BD      

# Part 3 : Exploratory Data Analysis


In [161]:
# Encode the categorical 'EJ' column ('A' -> 0, 'B' -> 1) so that it is numeric.
# The dtype guard keeps this cell safe to re-run: Series.map turns every value
# that is not a key of the mapping into NaN, so applying it a second time to
# the already-encoded 0/1 values would wipe the column out.
if not pd.api.types.is_numeric_dtype(disease_data['EJ']):
    disease_data['EJ'] = disease_data['EJ'].map({'A': 0, 'B': 1})

# Summary statistics of the numeric columns; printing the column index shows
# which features were included in the summary.
disease_data_description = disease_data.describe()
print(disease_data_description.columns)

Index(['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN', 'BP',
       'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU',
       'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 'EB',
       'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI', 'FL',
       'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class'],
      dtype='object')


In [162]:

# Count how many rows fall into each target class.
# Class 0 = disease-free individual, class 1 = disease-carrying individual.
Classes = disease_data["Class"].value_counts()
print(Classes)

Class
0    509
1    108
Name: count, dtype: int64


In [163]:

# Number of missing values in each column before any cleaning.
print(disease_data.isna().sum())

# Discard every row that contains at least one missing value.
disease_data = disease_data.dropna()

# Shown as the cell output: the per-column missing-value counts after the drop.
disease_data.isna().sum()


Id        0
AB        0
AF        0
AH        0
AM        0
AR        0
AX        0
AY        0
AZ        0
BC        0
BD        0
BN        0
BP        0
BQ       60
BR        0
BZ        0
CB        2
CC        3
CD        0
CF        0
CH        0
CL        0
CR        0
CS        0
CU        0
CW        0
DA        0
DE        0
DF        0
DH        0
DI        0
DL        0
DN        0
DU        1
DV        0
DY        0
EB        0
EE        0
EG        0
EH        0
EJ        0
EL       60
EP        0
EU        0
FC        1
FD        0
FE        0
FI        0
FL        1
FR        0
FS        2
GB        0
GE        0
GF        0
GH        0
GI        0
GL        1
Class     0
dtype: int64


Id       0
AB       0
AF       0
AH       0
AM       0
AR       0
AX       0
AY       0
AZ       0
BC       0
BD       0
BN       0
BP       0
BQ       0
BR       0
BZ       0
CB       0
CC       0
CD       0
CF       0
CH       0
CL       0
CR       0
CS       0
CU       0
CW       0
DA       0
DE       0
DF       0
DH       0
DI       0
DL       0
DN       0
DU       0
DV       0
DY       0
EB       0
EE       0
EG       0
EH       0
EJ       0
EL       0
EP       0
EU       0
FC       0
FD       0
FE       0
FI       0
FL       0
FR       0
FS       0
GB       0
GE       0
GF       0
GH       0
GI       0
GL       0
Class    0
dtype: int64

# Part 4 : Generating New Data Features

In [164]:
def euclidean_distance(sample1, sample2):
    """Calculate the Euclidean distance between two samples.

    Thin wrapper around ``scipy.spatial.distance.euclidean``; both arguments
    are passed through unchanged.
    """
    return distance.euclidean(sample1, sample2)


def euclidean_feature_generator(Sample_List):
    """Build a table of pairwise Euclidean distances between samples.

    Parameters
    ----------
    Sample_List : iterable of samples
        Each sample must be accepted by ``euclidean_distance`` and must
        support a plain boolean ``!=`` comparison (e.g. lists or tuples of
        numbers), because samples that compare equal are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per sample, in input order. A column named
        ``"<sample>-<other>"`` holds the distance from that row's sample to
        ``other``. Columns belonging to a different row's pairs are NaN.
        An empty ``Sample_List`` yields an empty DataFrame.
    """
    # Collect one {column name: distance} record per sample and build the
    # frame in a single step. DataFrame.append was removed in pandas 2.0, and
    # growing a frame row by row is quadratic in the number of samples.
    distance_rows = []
    for Sample in Sample_List:
        distance_rows.append({
            "{A}-{B}".format(A=Sample, B=Alternate_Sample):
                euclidean_distance(Sample, Alternate_Sample)
            for Alternate_Sample in Sample_List
            # Skip the sample itself (and any other sample equal to it).
            if Sample != Alternate_Sample
        })
    return pd.DataFrame(distance_rows)

# Part 5 : Creation of a machine learning model

# Part 6 : Analysis of the model's efficiency and loss values

# Part 7 : Log of models