Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

Data Collection and Processing

In [None]:
# Read the cholera case records from the CSV file into a pandas DataFrame.
# NOTE(review): the absolute '/content/...' path is presumably a Colab upload
# location — confirm before running elsewhere.
cholera_data = pd.read_csv(filepath_or_buffer='/content/cholera_dataset - Casesdata.csv')

In [None]:
# Preview the first five rows of the dataset (head() defaults to 5 rows)
cholera_data.head()

Unnamed: 0,Age Category,Sex,Occupation,Water Source,Vomiting,Diarrhea,Abdominal Cramps,Dehydration,LabResult
0,1,M,Child,Well,Yes,Yes,No,Yes,
1,1,M,Child,Well,Yes,Yes,No,Yes,Positive
2,1,F,Child,Well,Yes,Yes,No,Yes,
3,1,M,Child,Well,Yes,Yes,No,Yes,Positive
4,1,F,Child,Lake,Yes,Yes,No,Yes,Positive


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
cholera_data.shape

(852, 9)

In [None]:
# Column names, non-null counts and dtypes of the dataset
cholera_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852 entries, 0 to 851
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age Category      852 non-null    int64 
 1   Sex               852 non-null    object
 2   Occupation        852 non-null    object
 3   Water Source      852 non-null    object
 4   Vomiting          852 non-null    object
 5   Diarrhea          852 non-null    object
 6   Abdominal Cramps  852 non-null    object
 7   Dehydration       852 non-null    object
 8   LabResult         452 non-null    object
dtypes: int64(1), object(8)
memory usage: 60.0+ KB


In [None]:
# Count the missing values in each column
cholera_data.isna().sum()

Age Category          0
Sex                   0
Occupation            0
Water Source          0
Vomiting              0
Diarrhea              0
Abdominal Cramps      0
Dehydration           0
LabResult           400
dtype: int64

In [None]:
# Statistical measures of the data (by default describe() summarises only the numeric columns)
cholera_data.describe()

Unnamed: 0,Age Category
count,852.0
mean,2.703052
std,0.709075
min,1.0
25%,3.0
50%,3.0
75%,3.0
max,4.0


In [None]:
# Distribution of the target variable; value_counts() leaves missing values out by default
cholera_data['LabResult'].value_counts()

Positive    352
Negative    100
Name: LabResult, dtype: int64

Splitting the Features and Target

In [None]:
# Features: every column except the lab result. Target: the lab result itself.
X = cholera_data.drop(columns=['LabResult'])
Y = cholera_data['LabResult']

In [None]:
# Inspect the feature matrix
print(X)

     Age Category Sex  Occupation Water Source Vomiting Diarrhea  \
0               1   M       Child         Well      Yes      Yes   
1               1   M       Child         Well      Yes      Yes   
2               1   F       Child         Well      Yes      Yes   
3               1   M       Child         Well      Yes      Yes   
4               1   F       Child         Lake      Yes      Yes   
..            ...  ..         ...          ...      ...      ...   
847             4   F  House Wife          Tap       No      Yes   
848             4   F     Peasant          Tap      Yes      Yes   
849             4   M     Peasant          Tap      Yes       No   
850             4   M     Peasant          Tap      Yes      Yes   
851             4   M     Peasant         Well      Yes      Yes   

    Abdominal Cramps Dehydration  
0                 No         Yes  
1                 No         Yes  
2                 No         Yes  
3                 No         Yes  
4       

In [None]:
# Inspect the target labels (untested cases show up as NaN)
print(Y)

0           NaN
1      Positive
2           NaN
3      Positive
4      Positive
         ...   
847    Positive
848    Negative
849         NaN
850    Positive
851    Positive
Name: LabResult, Length: 852, dtype: object


Splitting the Data into Training data & Test Data

In [None]:
# Remove rows that contain missing values from the features and from the
# target. Plain reassignment is used instead of inplace=True so the cell
# does not mutate objects that other cells may still reference.
X = X.dropna()
Y = Y.dropna()

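Dropping the untested case data: rows whose LabResult is missing (400 of the 852 cases) have no label to learn from, so they are removed.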

In [None]:
# Keep only the feature rows whose index label is still present in Y
X = X.loc[Y.index]

Filter X to match Y's index, so that the features keep only the rows that still have a lab result.


In [None]:
# Hold out 20% of the labelled cases for testing. Stratifying on the label
# keeps the class ratio the same in both splits, and the fixed seed makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature set, the training split and the test split
print(X.shape, X_train.shape, X_test.shape)

(452, 8) (361, 8) (91, 8)


Model Training

Logistic Regression

In [None]:
# Logistic regression classifier created with scikit-learn's default settings
model = LogisticRegression()

In [None]:
# Label-encode every categorical feature before the model is fitted.
# Previously 'Abdominal Cramps' and 'Dehydration' were still strings when
# model.fit ran, which fails on a fresh Restart & Run All.

# Work on explicit copies so the assignments below write into independent
# frames rather than into the objects returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per column is kept in `encoders`, so the label -> integer
# mapping used for training can be looked up again later.
encoders = {}
for column in ['Sex', 'Occupation', 'Water Source', 'Vomiting', 'Diarrhea',
               'Abdominal Cramps']:
    le = LabelEncoder()
    # Fit on the training split only, then reuse that mapping for the test split
    X_train[column] = le.fit_transform(X_train[column])
    X_test[column] = le.transform(X_test[column])
    encoders[column] = le

# 'Dehydration' gets the same treatment the notebook gives it elsewhere:
# case variants are collapsed onto 'Yes' / 'No' first, and the encoder is
# fitted on the cleaned values of both splits.
for frame in (X_train, X_test):
    frame['Dehydration'] = (
        frame['Dehydration'].astype(str).str.lower().replace({'yes': 'Yes', 'no': 'No'})
    )
le = LabelEncoder()
le.fit(pd.concat([X_train['Dehydration'], X_test['Dehydration']],
                 ignore_index=True).unique())
X_train['Dehydration'] = le.transform(X_train['Dehydration'])
X_test['Dehydration'] = le.transform(X_test['Dehydration'])
encoders['Dehydration'] = le




In [None]:
# Train the LogisticRegression model on the training split.
# NOTE(review): every column of X_train must be numeric at this point — confirm
# that all categorical features have been encoded before this cell runs.
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [None]:
# Share of training rows whose predicted label equals the true label
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)

In [None]:
# Report the accuracy measured on the training split
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.778393351800554


In [None]:
# Normalise and label-encode the 'Dehydration' feature in both splits.

# Cast to string so the .str accessor works whatever dtype the column has
X_train['Dehydration'] = X_train['Dehydration'].astype(str)
X_test['Dehydration'] = X_test['Dehydration'].astype(str)

# Collapse case variants ('YES', 'yes', ...) onto the canonical 'Yes' / 'No'
X_train['Dehydration'] = X_train['Dehydration'].str.lower().replace({'yes': 'Yes', 'no': 'No'})
X_test['Dehydration'] = X_test['Dehydration'].str.lower().replace({'yes': 'Yes', 'no': 'No'})

# Fit the encoder once, on the cleaned values of both splits, so a label that
# occurs in only one split can still be transformed. The former extra fit on
# the uncleaned training column was overwritten by this one and is dropped.
# Only the one column is concatenated, not the whole frames.
unique_vals = pd.concat(
    [X_train['Dehydration'], X_test['Dehydration']], ignore_index=True
).unique()
le = LabelEncoder()
le.fit(unique_vals)

# Encode train and test data
X_train['Dehydration'] = le.transform(X_train['Dehydration'])
X_test['Dehydration'] = le.transform(X_test['Dehydration'])

In [None]:
# Label-encode 'Vomiting' in both splits, fitting on the training split only.
# NOTE(review): the earlier encoding cell already encodes this column; if it
# has run, the column holds integer codes and this pass maps each code to
# itself — confirm whether this cell is still needed.

le = LabelEncoder()

X_train['Vomiting'] = le.fit_transform(X_train['Vomiting'])
X_test['Vomiting'] = le.transform(X_test['Vomiting'])




In [None]:
# Share of held-out test rows whose predicted label equals the true label
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)

In [None]:
# Report the accuracy measured on the held-out test split
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7472527472527473


In [None]:
# A single hand-entered case to classify.
# Demographics and water source are given as labels and encoded in a later cell.
age_category, sex, occupation, water_source = 2, 'Female', 'Peasant', 'River'
# Symptom flags are entered directly as numbers.
vomiting, diarrhea, abdominal_cramps, dehydration = 0, 1, 0, 1



In [None]:
# Encoding mappings for a hand-entered case.
# The model was trained on LabelEncoder codes, and LabelEncoder numbers the
# labels of a column in sorted order. Hand-typed codes in any other order feed
# the model the wrong category, so the mappings are derived from the raw
# feature values in X instead.
def _label_codes(values):
    """Return {label: code}, numbering the distinct labels in sorted order."""
    return {label: code for code, label in enumerate(sorted(set(values)))}

sex_mapping = _label_codes(X['Sex'])
# The dataset stores sex as 'M' / 'F'; the spelled-out forms are accepted too
sex_mapping.update({'Male': sex_mapping['M'], 'Female': sex_mapping['F']})
occ_mapping = _label_codes(X['Occupation'])
water_mapping = _label_codes(X['Water Source'])



In [None]:
# Translate the categorical inputs into their integer codes.
# The four symptom flags are already numeric and are used exactly as entered.
sex = sex_mapping[sex]
occupation = occ_mapping[occupation]
water_source = water_mapping[water_source]

In [None]:
# Age Category is passed to the model unscaled. The training features were
# never standardised, and a StandardScaler fitted on this single value would
# always map it to 0.0, discarding the input. The (1, 1) array shape of the
# former scaler output is kept so that age_scaled[0] still indexes a row.
age_scaled = np.array([[float(age_category)]])

In [None]:
# Collect input data in the same column order as the training features.
# age_scaled is a (1, 1) array, so its single scalar is taken out here;
# putting the row age_scaled[0] in the list would make the list ragged.
input_data = [float(age_scaled[0][0]), sex, occupation, water_source,
              vomiting, diarrhea, abdominal_cramps, dehydration]

In [None]:
# Convert to numpy array
# Every element of input_data has to be a scalar: a nested array makes the
# list ragged, which NumPy warns about or rejects.
import numpy as np
input_data_numpy = np.asarray(input_data)

  input_data_numpy = np.asarray(input_data)


In [None]:
# Reshape to a single row of shape (1, n_features): one sample to predict
input_data_reshaped = input_data_numpy.reshape(1,-1)

In [None]:
# Make prediction; the input holds one row, so result[0] is its predicted label
result = model.predict(input_data_reshaped)



In [None]:
# Print output.
# The model was fitted on the string labels of LabResult, so predict returns
# 'Positive' / 'Negative'. Comparing against the integer 1 was never true and
# made every case print as negative.
if result[0] == 'Positive':
  print('Cholera Positive')
else:
  print('Cholera Negative')

Cholera Negative


Saving the pretrained model

In [None]:
import pickle

In [None]:
# Serialise the trained model to disk.
filename = 'cholera_disease_model.sav'
# The context manager closes the file handle as soon as the dump finishes
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Loading the model

In [None]:
# loading the saved model
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
# The context manager closes the file handle once the model is read.
with open('cholera_disease_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# List the feature columns, one per line, in the order they appear in X
for column in X.columns:
  print(column)

Age Category
Sex
Occupation
Water Source
Vomiting
Diarrhea
Abdominal Cramps
Dehydration
