Importing the Dependencies

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

Data Collection and Processing

In [5]:
# Read the case-data CSV into a pandas DataFrame
DATA_PATH = '/content/cholera_dataset - Casedata.csv'
cholera_data = pd.read_csv(DATA_PATH)

In [6]:
# Preview the first 5 rows of the dataset
cholera_data.head(n=5)

Unnamed: 0,Sex,Vomiting,Diarrhea,Abdominal Cramps,Dehydration,LabResult
0,1,1,1,0,1,
1,1,1,1,0,1,1.0
2,0,1,1,0,1,
3,1,1,1,0,1,1.0
4,0,1,1,0,1,1.0


In [7]:
# Dimensions of the dataset as a (rows, columns) tuple
cholera_data.shape

(851, 6)

In [8]:
# Summary of the DataFrame: column names, non-null counts and dtypes
cholera_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 851 entries, 0 to 850
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Sex               851 non-null    int64  
 1   Vomiting          851 non-null    int64  
 2   Diarrhea          851 non-null    int64  
 3   Abdominal Cramps  851 non-null    int64  
 4   Dehydration       851 non-null    int64  
 5   LabResult         452 non-null    float64
dtypes: float64(1), int64(5)
memory usage: 40.0 KB


In [9]:
# Count the missing (NaN) entries in every column
cholera_data.isna().sum()

Sex                   0
Vomiting              0
Diarrhea              0
Abdominal Cramps      0
Dehydration           0
LabResult           399
dtype: int64

In [10]:
# Statistical measures (count, mean, std, min, quartiles, max) for each numeric column
cholera_data.describe()

Unnamed: 0,Sex,Vomiting,Diarrhea,Abdominal Cramps,Dehydration,LabResult
count,851.0,851.0,851.0,851.0,851.0,452.0
mean,0.594595,0.944771,0.991774,0.112808,0.93537,0.778761
std,0.491259,0.228561,0.090375,0.316545,0.246016,0.415541
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,1.0,1.0
50%,1.0,1.0,1.0,0.0,1.0,1.0
75%,1.0,1.0,1.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# Class balance of the target variable; rows with a missing LabResult are
# left out of the counts (dropna=True is the pandas default, spelled out here).
cholera_data['LabResult'].value_counts(dropna=True)

1.0    352
0.0    100
Name: LabResult, dtype: int64

Splitting the Features and Target

In [12]:
# Separate the symptom features (X) from the lab-result target (Y)
TARGET_COLUMN = 'LabResult'
X = cholera_data.drop(columns=[TARGET_COLUMN])
Y = cholera_data[TARGET_COLUMN]

In [13]:
# Show the feature frame as the cell's last expression so the notebook renders
# it with its rich (truncated) table display instead of plain printed text.
X

     Sex  Vomiting  Diarrhea  Abdominal Cramps  Dehydration
0      1         1         1                 0            1
1      1         1         1                 0            1
2      0         1         1                 0            1
3      1         1         1                 0            1
4      0         1         1                 0            1
..   ...       ...       ...               ...          ...
846    0         0         1                 0            1
847    0         1         1                 0            1
848    1         1         0                 0            1
849    1         1         1                 0            1
850    1         1         1                 0            1

[851 rows x 5 columns]


In [14]:
# Show the target Series as the cell's last expression; the notebook displays
# the value of the last expression, so wrapping it in print() is unnecessary.
Y

0      NaN
1      1.0
2      NaN
3      1.0
4      1.0
      ... 
846    1.0
847    0.0
848    NaN
849    1.0
850    1.0
Name: LabResult, Length: 851, dtype: float64


Splitting the Data into Training data & Test Data

In [15]:
# Keep only the rows that have a lab result AND no missing feature values.
# A single shared mask is applied to both X and Y so the two can never fall out
# of alignment (filtering them independently could leave an index in Y that X
# no longer has). Rebinding instead of using inplace=True keeps the cell
# safe to re-run.
labelled_rows = Y.notna() & X.notna().all(axis=1)
X = X.loc[labelled_rows]
Y = Y.loc[labelled_rows]

Drop the untested cases, i.e. the rows that have no LabResult value.

In [16]:
# Keep only the feature rows whose index label is present in Y (all columns)
X = X.loc[Y.index, :]

Filter X down to the rows whose index is still present in Y, so that features and labels stay aligned.


In [17]:
# Hold out 20% of the labelled cases for testing. Stratifying on Y keeps the
# class ratio the same in both splits; random_state fixes the shuffle.
split_options = dict(test_size=0.2, stratify=Y, random_state=2)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, **split_options)

In [18]:
# Shapes of the full, training and test feature sets, in that order
print(*(frame.shape for frame in (X, X_train, X_test)))

(452, 5) (361, 5) (91, 5)


Model Training

Decision Trees

In [31]:
# Decision tree classifier. random_state is pinned (same seed as the
# train/test split) because scikit-learn permutes the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently from run to run, so the fitted tree would not be reproducible.
model = DecisionTreeClassifier(random_state=2)

In [32]:
# Fit the decision tree on the training features and labels
model.fit(X_train, Y_train)

Model Evaluation

In [33]:
# Predict the lab result for every held-out test case
Y_pred = model.predict(X_test)

Accuracy Score

In [34]:
# Fraction of test cases whose predicted label matches the true label.
# accuracy_score is already imported in the dependencies cell at the top of the
# notebook, so it is not re-imported here.
accuracy = accuracy_score(Y_test, Y_pred)

In [36]:
# Report the test-set accuracy stored in `accuracy`
print(f"Accuracy score on test data: {accuracy}")

Accuracy score on test data: 0.7692307692307693


In [38]:
import pickle

# Serialise the fitted model to model.pkl; the context manager closes the file
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [39]:
# Reload the model from model.pkl. The file is opened in a context manager so
# the handle is closed as soon as the model has been read (it was previously
# left open for the lifetime of the kernel).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('model.pkl', 'rb') as pickle_in:
    model = pickle.load(pickle_in)

In [40]:
# Encode the 'Vomiting' feature in both the train and test sets.
# The encoder is fitted on the training column only and reused for the test
# column. The frames are rebound with .assign() rather than written into
# column-by-column, because assigning into the frames returned by
# train_test_split can raise pandas' SettingWithCopyWarning.
# NOTE(review): this runs after the model has already been fitted, and
# LabelEncoder is documented for target labels rather than features -- confirm
# whether this step is still needed.
le = LabelEncoder()

X_train = X_train.assign(Vomiting=le.fit_transform(X_train['Vomiting']))
X_test = X_test.assign(Vomiting=le.transform(X_test['Vomiting']))




In [41]:
# Accuracy on the test data.
# accuracy_score expects (y_true, y_pred); the arguments are passed in that
# order, matching the earlier accuracy cell. The score itself is unchanged
# because accuracy is symmetric in its two arguments.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [42]:
# Report the test accuracy; print() separates its two arguments with a space
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7692307692307693


In [43]:
# Input data: one case to classify, using the same 0/1 coding as the dataset
# columns. Every value is an int -- a single str (sex was '0') makes
# np.asarray coerce the whole feature vector to strings.
sex = 0
vomiting = 0
diarrhea = 1
abdominal_cramps = 0
dehydration = 1


In [47]:
# Normalise every input to a plain int so the feature vector has one numeric
# dtype. The previous self-assignments (sex = sex, ...) did nothing, which let
# a string such as '0' flow through to the model unchanged.
sex = int(sex)
vomiting = int(vomiting)
diarrhea = int(diarrhea)
abdominal_cramps = int(abdominal_cramps)
dehydration = int(dehydration)

In [48]:
# Gather the inputs into one list, in the same order as the feature columns
input_data = [
    sex,
    vomiting,
    diarrhea,
    abdominal_cramps,
    dehydration,
]

In [49]:
# Convert the input list to a numpy array.
# numpy is already imported as np in the dependencies cell at the top of the
# notebook, so it is not re-imported here.
input_data_numpy = np.asarray(input_data)

In [50]:
# Turn the flat vector into a single-row 2-D array (1 sample x n features)
input_data_reshaped = np.reshape(input_data_numpy, (1, -1))

In [51]:
# Make prediction.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame with the same column names; passing a bare ndarray makes
# scikit-learn warn that X does not have valid feature names. astype(float)
# also converts any numeric strings in the sample to numbers explicitly.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns).astype(float)
result = model.predict(input_frame)



In [52]:
# Report the diagnosis: class 1 is shown as positive, anything else as negative
diagnosis = 'Cholera Positive' if result[0] == 1 else 'Cholera Negative'
print(diagnosis)

Cholera Positive


Saving the trained model

In [53]:
import pickle

In [54]:
# Save the trained model to disk. The context manager guarantees the file is
# flushed and closed before it is read back (the bare open() call previously
# left the handle open).
filename = 'cholera_disease_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Loading the model

In [55]:
# Load the saved model; the context manager closes the file handle after the
# read instead of leaking it.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('cholera_disease_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [56]:
# List the feature names, one per line, in the order the model expects them
for column in X.columns.tolist():
    print(column)

Sex
Vomiting
Diarrhea
Abdominal Cramps
Dehydration
