In [None]:
#Description: This program classifies patients as having Anaemic Disease or not using Artificial Neural Networks (ANN)

In [None]:
#Importing Libraries
import glob
from keras.models import Sequential, load_model
import numpy as np
import pandas as pd
from keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import keras as k
import seaborn as sns                     # data visualization library based on matplotlib
import sklearn                            # for random forests, and k-neighbours etc 

#Set the default 'figure.figsize' rc parameter used by the plots drawn below
sns.set(rc={'figure.figsize':(12,10)})

In [None]:
#Load the data
#NOTE(review): files.upload() is a Colab-only helper; the read below presumably relies on it
#placing the chosen file in the working directory - confirm
from google.colab import files
uploaded = files.upload()

#The file name is hard-coded, so the uploaded file must be called exactly 'Anaemia1.csv'
df = pd.read_csv('Anaemia1.csv')

#Print the first 10 rows
df.head(10)

In [None]:
#Print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
#Get the shape of the data as a (number of rows, number of cols) tuple
df.shape

In [None]:
#Show descriptive statistics for the raw data
df.describe()

In [None]:
#Count the missing values in each column
df.isna().sum()

In [None]:
#Columns kept for the analysis: the input features plus the 'IDENTIFICATION' target
column_to_retain = ['Age', 'Sex', 'Haemoglobin', 'MCH', 'MCHC', 'MCV', 'RBC', 'Platlets', 'IDENTIFICATION']

#Discard every column that is not listed above
unwanted_columns = [name for name in df.columns if name not in column_to_retain]
df = df.drop(columns=unwanted_columns)

#Remove the rows that still contain a missing value
df = df.dropna(axis=0)

In [None]:
#Transform the non-numeric data in the columns into integer codes.
#Numeric feature columns are skipped so that they keep their real measured values:
#comparing a dtype with the abstract np.number class does not reliably recognise
#int/float columns, which would replace measurements by category codes.
#The 'IDENTIFICATION' target is always encoded so that its classes are 0..k-1.
for column in df.columns:
  if column != 'IDENTIFICATION' and pd.api.types.is_numeric_dtype(df[column]):
    continue
  df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
#Preview the first 5 rows of the cleaned and encoded dataset
df.head(5)

In [None]:
#Count how many rows belong to each value of the 'IDENTIFICATION' target
df['IDENTIFICATION'].value_counts()

In [None]:
#Visualize the number of rows per target class.
#The column is passed through explicit x=/data= keywords: a positional Series is taken
#as `data` (not `x`) by current seaborn versions and no longer draws the intended plot.
sns.countplot(x='IDENTIFICATION', data=df)

In [None]:
#Count plot of the 'Haemoglobin' values, split by target class
bar_edge_color = sns.color_palette('dark', n_colors=1)
sns.countplot(
    x='Haemoglobin',
    hue='IDENTIFICATION',
    data=df,
    palette='bright',
    edgecolor=bar_edge_color,
)

In [None]:
#Draw one histogram per column of the cleaned dataset
df.hist(figsize=(20,16))
plt.show()

In [None]:
#Pairwise relationships between the columns, coloured by target class
sns.pairplot(data=df, hue="IDENTIFICATION", height=5, aspect=1);

In [None]:
#Get the pairwise correlation between the columns
df.corr()

In [None]:
#Visualize the correlation matrix as an annotated heatmap (values shown as percentages)
#This import repeats the one in the import cell at the top of the notebook
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(15, 15))
correlations = df.corr()
sns.heatmap(correlations, annot=True, fmt='.0%', ax=ax)

In [None]:
#Separate the target column (Y) from the remaining feature columns (X)
Y = df['IDENTIFICATION']
X = df.drop(columns=['IDENTIFICATION'])

In [None]:
#Feature scaling
#Rescale every input feature with a Min-Max scaler so that the values lie between 0 and 1
#NOTE(review): the scaler is fitted on all rows before the train/test split, so the
#test rows influence the scaling range - confirm this is acceptable
column_names = X.columns
X_scaler = MinMaxScaler().fit(X)
X[column_names] = X_scaler.transform(X)

In [None]:
#Split the data into 80% training and 20% testing, shuffling the rows first.
#random_state is fixed so that re-running the notebook reproduces the same split
#(and therefore the same scores in the cells below).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, shuffle =True, random_state = 42)

In [None]:
#Number of rows in the training split
X_train.shape[0]

In [None]:
#Number of rows in the test split
X_test.shape[0]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#Candidate classifiers and the hyper-parameter grid to search for each of them.
#Each entry maps a display name to {'model': estimator instance, 'param': parameter grid}.
model_param = {
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'param': {'criterion': ['gini', 'entropy']},
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'param': {'n_estimators': [20, 50, 80, 120, 150]},
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'param': {'n_neighbors': [5, 10, 15, 20, 25]},
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'param': {'penalty': ['l2']},
    },
    'SVC': {
        'model': SVC(),
        'param': {'kernel': ['rbf', 'linear', 'sigmoid']},
    },
    #NOTE(review): learning rates above 1 are unusual for AdaBoost - confirm this grid is intended
    'AdaBoostClassifier': {
        'model': AdaBoostClassifier(),
        'param': {'learning_rate': [1, 2, 3, 4, 5]},
    },
}

In [None]:
#Run a 5-fold cross-validated grid search for every candidate model and record
#its best mean score together with the parameters that produced it.
#The search is fitted on the training split only: the test split is used further down
#to evaluate the chosen model, so it must not take part in model selection.
scores = []
for model_name, mp in model_param.items():
  model_selection = GridSearchCV(estimator=mp['model'],param_grid=mp['param'],cv=5,return_train_score=False)
  model_selection.fit(X_train,Y_train)
  scores.append({
      'model': model_name,
      'best_score': model_selection.best_score_,
      'best_params': model_selection.best_params_
  })

In [None]:
#Collect the grid-search results into one table (one row per model)
score_columns = ['model', 'best_score', 'best_params']
df_model_score = pd.DataFrame(scores, columns=score_columns)
df_model_score

In [None]:
#Bar chart comparing the best cross-validation score of every model (one colour per model)
colors = ["yellow","blue","brown","magenta","red", "purple"]
fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(x='model', y='best_score', data=df_model_score, palette=colors, ax=ax)
ax.set_title('Model Comparison');

In [None]:
#Create a random forest classifier with 120 trees
#NOTE(review): 120 is presumably the best n_estimators found by the grid search above - confirm
model_rfc = RandomForestClassifier(n_estimators=120)

In [None]:
#Train the random forest on the training split
model_rfc.fit(X_train,Y_train)

In [None]:
#Score the trained random forest on the held-out test split
model_rfc.score(X_test,Y_test)

In [None]:
#Confusion matrix of the random forest on the test split (true labels vs. predicted labels)
from sklearn.metrics import confusion_matrix
rfc_test_predictions = model_rfc.predict(X_test)
cm = confusion_matrix(Y_test, rfc_test_predictions)
cm

In [None]:
#Draw the confusion matrix as an annotated heatmap
cm_axes = sns.heatmap(cm, annot=True)
cm_axes.set_xlabel('Predicted')
cm_axes.set_ylabel('True value')
plt.show()

In [None]:
#Build the network: one hidden layer of 256 ReLU units (one input per feature column,
#seeded random-normal initial weights) followed by a single output unit.
#NOTE(review): the output uses 'hard_sigmoid' rather than 'sigmoid' - confirm this is intended
model = Sequential([
    Dense(
        256,
        input_dim=len(X.columns),
        kernel_initializer=k.initializers.random_normal(seed=13),
        activation='relu',
    ),
    Dense(1, activation='hard_sigmoid'),
])

In [None]:
#Configure training: binary cross-entropy loss, Adam optimizer, accuracy reported as metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Train for 100 epochs; the batch size equals the number of training rows,
#so every epoch processes the whole training split as a single batch
training_row_count = X_train.shape[0]
history = model.fit(
    X_train,
    Y_train,
    epochs=100,
    batch_size=training_row_count,
)

In [None]:
#Save the trained model.
#The path uses the '.keras' extension because current Keras versions reject
#paths without a recognised extension (such as 'ana.model') in model.save().
model.save('ana.keras')

In [None]:
#Visualize the model's training accuracy and loss per epoch
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['loss'], label='loss')
plt.title('model accuracy & loss')
plt.ylabel('accuracy and loss')
plt.xlabel('epoch')
#Both curves share one axis, so a legend is needed to tell them apart
plt.legend()
plt.show()

In [None]:
#Highest accuracy recorded in the training history across all epochs
max(history.history['accuracy'])

In [None]:
#Raw network outputs for the test split (not yet thresholded into classes)
test_y_predictions = model.predict(X_test)
test_y_predictions

In [None]:
#Print the (rows, columns) shape of the training and testing feature sets
print('shape of training data:', X_train.shape)
print('shape of test data:', X_test.shape)

In [None]:
#Turn the network outputs into class labels (1 when the output is >= 0.5, else 0)
#and print them next to the true labels of the test split
pred = model.predict(X_test)
pred = np.where(pred >= 0.5, 1, 0).ravel().tolist()

print('Original : {0}'.format(", ".join(map(str, Y_test))))
print('Predicted : {0}'.format(", ".join(map(str, pred))))

In [None]:
#Show the actual (true) target values of the test split
Y_test