# Typhoid

## Import libraries

In [None]:
import pandas as pd
import seaborn as sp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import pickle as pk

In [None]:
# Read the raw typhoid records from disk and preview the first rows.
typhoid = pd.read_csv(filepath_or_buffer='Dataset/final_typhoid.csv')
typhoid.head()


In [None]:
# DataFrame.drop returns a new frame; the original call discarded the result,
# so the 'Unnamed: 0' column was never actually removed. Rebind it instead.
typhoid = typhoid.drop(columns=['Unnamed: 0'])
# Keep the frame as the cell's last expression so the notebook still shows it.
typhoid


In [None]:
# drop_duplicates returns a new frame; the original call discarded the result,
# so duplicate rows were kept. Rebind it and renumber the index so it stays
# contiguous after rows are removed.
typhoid = typhoid.drop_duplicates().reset_index(drop=True)
# Keep the frame as the cell's last expression so the notebook still shows it.
typhoid

## CLEANING

In [None]:
# Fill missing values: most frequent value for DISTRICT, REPORT_VERIFIED and
# TEHSIL, median for AGE.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `typhoid[col]`: that form mutates a temporary object,
# is deprecated in pandas 2.x and has no effect under Copy-on-Write.
typhoid['DISTRICT'] = typhoid['DISTRICT'].fillna(typhoid['DISTRICT'].mode()[0])
typhoid['AGE'] = typhoid['AGE'].fillna(typhoid['AGE'].median())
typhoid['REPORT_VERIFIED'] = typhoid['REPORT_VERIFIED'].fillna(typhoid['REPORT_VERIFIED'].mode()[0])
typhoid['TEHSIL'] = typhoid['TEHSIL'].fillna(typhoid['TEHSIL'].mode()[0])


## DATA BALANCING

In [None]:
# Percentage share of every RESULT_TEXT class in the raw data.
class_counts_typ = typhoid['RESULT_TEXT'].value_counts()
class_distribution_typ = class_counts_typ.div(len(typhoid)).mul(100)
print(class_distribution_typ)

# Bar chart of the absolute number of rows per class.
value = typhoid['RESULT_TEXT'].value_counts()
plt.bar(x=value.index, height=value.values)

plt.ylabel('Count')
plt.xlabel('Result')
plt.title('RESULT_TEXT')

plt.show()

In [None]:
# Separate the RESULT_TEXT target from the remaining columns.
y = typhoid['RESULT_TEXT']
x = typhoid.drop(columns=['RESULT_TEXT'])

# Oversample with a seeded RandomOverSampler to address class imbalance.
# NOTE(review): this happens before any of the cross-validation splits below,
# so copies of the same row may end up in both the training and the test
# folds -- confirm this is intended.
ros = RandomOverSampler(random_state=42)
x_resampled, y_resampled = ros.fit_resample(x, y)

# Glue features and target back together, persist the balanced dataset,
# then reload it from disk for the following steps.
balanced_data = pd.concat([x_resampled, y_resampled], axis='columns')
balanced_data.to_csv('new_copy2/Balanced_Typhoid.csv', index=False)
Balanced_typhoid = pd.read_csv(filepath_or_buffer='new_copy2/Balanced_Typhoid.csv')

# Percentage share of every RESULT_TEXT class after oversampling.
class_counts_typ = Balanced_typhoid['RESULT_TEXT'].value_counts()
class_distribution_typ = class_counts_typ.div(len(Balanced_typhoid)).mul(100)
print(class_distribution_typ)

# Bar chart of the absolute number of rows per class in the balanced data.
value = Balanced_typhoid['RESULT_TEXT'].value_counts()
plt.bar(x=value.index, height=value.values)

plt.ylabel('Count')
plt.xlabel('Result')
plt.title('RESULT_TEXT')

plt.show()

## DATA TRANSFORMATION

In [None]:
# Show the (rows, columns) shape of the original, pre-oversampling frame.
typhoid.shape

In [None]:
# Integer-encode MRNO, RESULT_VALUE, GENDER, REPORT_VERIFIED and RESULT_TEXT.
# A single LabelEncoder instance is refitted for every column, so after this
# cell `le` only remembers the RESULT_TEXT classes (the last fit).
le = LabelEncoder()
Balanced_typhoid["MRNO_encoded"] = le.fit_transform(Balanced_typhoid["MRNO"])
Balanced_typhoid["RESULT_VALUE_encoded"] = le.fit_transform(Balanced_typhoid["RESULT_VALUE"])
Balanced_typhoid["GENDER_encoded"] = le.fit_transform(Balanced_typhoid["GENDER"])
Balanced_typhoid["REPORT_VERIFIED_encoded"] = le.fit_transform(Balanced_typhoid["REPORT_VERIFIED"])
Balanced_typhoid["RESULT_TEXT_encoded"] = le.fit_transform(Balanced_typhoid["RESULT_TEXT"])

# One-hot encode DISTRICT and TEHSIL into a dense array.
# `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and the old
# name was later removed; this notebook already requires >= 1.2 because it
# uses AdaBoostClassifier(estimator=...).
ohe = OneHotEncoder(sparse_output=False)
district_tehsil_encoded = ohe.fit_transform(Balanced_typhoid[["DISTRICT", "TEHSIL"]])
district_tehsil_encoded_df = pd.DataFrame(
    district_tehsil_encoded,
    columns=ohe.get_feature_names_out(["DISTRICT", "TEHSIL"]),
)

# Assemble the model-ready frame: encoded MRNO, the one-hot columns, then the
# remaining columns under their original names (column order is preserved).
new_df = pd.concat([Balanced_typhoid["MRNO_encoded"], district_tehsil_encoded_df], axis=1)
new_df["AGE"] = Balanced_typhoid["AGE"]
new_df["RESULT_TEXT"] = Balanced_typhoid["RESULT_TEXT_encoded"]
new_df["GENDER"] = Balanced_typhoid["GENDER_encoded"]
new_df["RESULT_VALUE"] = Balanced_typhoid["RESULT_VALUE_encoded"]
new_df["REPORT_VERIFIED"] = Balanced_typhoid["REPORT_VERIFIED_encoded"]
new_df["CPT_ID"] = Balanced_typhoid["CPT_ID"]
new_df["CPT_ID.1"] = Balanced_typhoid["CPT_ID.1"]

# Persist the transformed dataset for the normalisation step.
new_df.to_csv('new_copy2/New_Typhoid.csv', index = False)

In [None]:
# Inspect the integer codes the LabelEncoder assigned to MRNO.
print(Balanced_typhoid['MRNO_encoded'])

## NORMALISATION

In [None]:
# Bar chart of how many rows share each AGE value (before normalisation).
value = Balanced_typhoid['AGE'].value_counts()
plt.bar(x=value.index, height=value.values)

plt.ylabel('Count')
plt.xlabel('Age')
plt.title('AGE')

plt.show()

In [None]:
# Load the encoded dataset and z-score normalise the AGE column:
# (value - mean) / standard deviation.
# The original cell contained this normalise-and-save sequence twice (an
# accidental paste, with a comment fused onto the first to_csv line); it is
# now performed exactly once.
Transformed_typhoid = pd.read_csv('new_copy2/New_Typhoid.csv', low_memory = False)

# Column to be normalized
column = ['AGE']

age_values = Transformed_typhoid[column]
Transformed_typhoid[column] = (age_values - age_values.mean()) / age_values.std()

# New .csv file with normalized data
Transformed_typhoid.to_csv('new_copy2/Normalized_Typhoid.csv', index = False)

In [None]:
# Reload the normalised dataset and peek at the rescaled AGE values.
Normal_typhoid = pd.read_csv(filepath_or_buffer='new_copy2/Normalized_Typhoid.csv', low_memory=False)
print(Normal_typhoid['AGE'].head())

# Bar chart of how many rows share each normalised AGE value.
value = Normal_typhoid['AGE'].value_counts()
plt.bar(x=value.index, height=value.values)

plt.ylabel('Count')
plt.xlabel('Age')
plt.title('AGE')

plt.show()

In [None]:
# Print the column dtypes and non-null counts of the normalised frame.
Normal_typhoid.info()

In [None]:
# Preview the first rows of the normalised frame.
Normal_typhoid.head()

In [None]:
# Same summary as the earlier info() cell, repeated.
Normal_typhoid.info()

## Random forest feature selection

In [None]:
# Discard, in place, every row that still contains a missing value.
Normal_typhoid.dropna(inplace=True)

# Target vector y is the RESULT_TEXT column; feature matrix X is everything else.
y = Normal_typhoid['RESULT_TEXT']
X = Normal_typhoid.drop(columns=['RESULT_TEXT'])

In [None]:
# Fit a random forest on the full feature matrix and read off the per-feature
# importances, which drive the top-k feature selection below.
# The forest is seeded (as the KFold splitters and decision trees in this
# notebook are) so the selected feature set is the same on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X, y)
importance = rfc.feature_importances_

In [None]:
# feat_imp=pd.Series(importance, index=X.columns)
# feat_imp.nlargest(7).plot(kind="barh")
# print(feat_imp.nlargest(7))
# plt.show()

In [None]:
# Pair every column name with its importance and rank from most to least important.
features = sorted(zip(X.columns, importance), key=lambda pair: pair[1], reverse=True)

# Show the complete ranking as (name, importance) tuples.
for f in features:
    print(f)


# Keep only the k best-ranked columns in the feature matrix.
k = 7
top_features = [name for name, _ in features[:k]]
for f in top_features:
    print(f)
X = X[top_features]

### HISTOGRAM FOR FEATURE SELECTION

In [None]:

# Horizontal bar chart of every feature's importance, in ranked order.
plt.title("Histogram for Feature selection importance")
plt.barh(
    [name for name, _ in features],
    [weight for _, weight in features],
)
plt.show()


## RANDOM FOREST CLASSIFIER

In [None]:
# Shuffled, seeded 10-fold splitter; reused by the random-forest and
# decision-tree cross-validation loops below.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# 10-fold cross-validation of a random forest on the selected features.
# accuracy_scores collects one accuracy per fold; y_tests / y_preds pool the
# true and predicted labels of every fold for the confusion matrix and ROC.
accuracy_scores = []
y_preds = []
y_tests = []
for train_index, test_index in kf.split(X):
    # Split data into train and test sets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh forest per fold, seeded so the reported scores are reproducible
    # (consistent with the seeded splitter and decision trees in this notebook).
    rfc = RandomForestClassifier(n_estimators=100, random_state=42)
    rfc.fit(X_train, y_train)

    # Predict the held-out fold and record its accuracy
    y_pred = rfc.predict(X_test)
    y_preds.extend(y_pred)
    y_tests.extend(y_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)

# Persist the model. The file is opened in a `with` block so the handle is
# flushed and closed (the original left it open).
# NOTE(review): `rfc` here is the forest trained on the final fold's training
# split only, not on the full dataset -- confirm that is the model to ship.
with open("model.pkl", "wb") as model_file:
    pk.dump(rfc, model_file)

In [None]:
# Report the mean per-fold accuracy, with two standard deviations as the spread.
print("Accuracy- Random forest classifier: %0.2f (+/- %0.2f)" % (np.mean(accuracy_scores), np.std(accuracy_scores) * 2))

### CONFUSION MATRIX FOR RF

In [None]:
# Confusion matrix over the pooled out-of-fold labels (y_tests) and
# predictions (y_preds) from the random-forest cross-validation.
cm = confusion_matrix(y_tests, y_preds)

# Render the matrix as an annotated heatmap with integer cell labels.
sp.heatmap(data=cm, annot=True, fmt='d', cmap="Blues")

plt.title("Confusion Matrix for Random forest")
plt.ylabel("True labels")
plt.xlabel("Predicted labels")
plt.show()

### ROC CURVE

In [None]:
# ROC curve from the pooled out-of-fold labels and predictions.
# NOTE(review): y_preds holds hard class predictions (output of predict), not
# probabilities or scores, so the curve has very few operating points --
# consider collecting predict_proba output instead.
fpr, tpr, thresholds = roc_curve(y_true=y_tests, y_score=y_preds)

# Model curve first, then the dashed chance diagonal for reference.
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title('ROC Curve for Random Forest')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## KNN 

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
# Perform K-fold cross-validation and evaluate the model's performance
y_true = []
y_pred = []
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    knn.fit(X_train, y_train)
    y_true.extend(y_test)
    y_pred.extend(knn.predict(X_test))
    

In [None]:
accuracy = accuracy_score(y_true, y_pred)
# Add accuracy score to list
accuracy_scores.append(accuracy)
# Compute and print the mean accuracy score and standard deviation
print("Accuracy- KNN: %0.2f (+/- %0.2f)" % (np.mean(accuracy_scores), np.std(accuracy_scores) * 2))

### CONFUSION MATRIX for KNN

In [None]:
# Confusion matrix over the pooled out-of-fold labels (y_true) and
# predictions (y_pred) from the KNN cross-validation.
cm = confusion_matrix(y_true, y_pred)

# Render the matrix as an annotated heatmap with integer cell labels.
sp.heatmap(data=cm, annot=True, fmt="d", cmap="Blues")

plt.title("Confusion Matrix for KNN")
plt.ylabel("True labels")
plt.xlabel("Predicted labels")

plt.show()

### ROC curve

In [None]:
# ROC curve from the pooled out-of-fold labels and predictions.
# NOTE(review): y_pred holds hard class predictions (output of predict), not
# probabilities or scores, so the curve has very few operating points --
# consider collecting predict_proba output instead.
fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred)

# Model curve first, then the dashed chance diagonal for reference.
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title('ROC Curve for KNN')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## DECISION TREE

In [None]:
# Cross-validation of a decision tree using the existing `kf` splitter.
# scores holds one accuracy per fold; y_tests / y_preds pool the true and
# predicted labels of every fold.
scores = []
y_pred = []
y_preds = []
y_tests = []
for train_index, test_index in kf.split(X):
    # Carve out this fold's training and held-out rows.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # A fresh, identically seeded tree is grown for every fold.
    dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

    # Score the held-out rows and pool the labels/predictions.
    y_pred = dt.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    y_tests.extend(y_test)
    y_preds.extend(y_pred)

# Mean per-fold accuracy, reported with two standard deviations as the spread.
std_dev = np.std(scores)
mean_score = sum(scores) / len(scores)

print("Accuracy-Decision tree: %0.2f (+/- %0.2f)" % (mean_score, std_dev * 2))

### CONFUSION MATRIX

In [None]:
# Confusion matrix over the pooled out-of-fold labels (y_tests) and
# predictions (y_preds) from the decision-tree cross-validation.
cm = confusion_matrix(y_tests, y_preds)

# Compact NumPy printing: three decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=3)

# Render the matrix as an annotated heatmap with integer cell labels.
sp.heatmap(data=cm, annot=True, fmt="d", cmap="Blues")

plt.title("Confusion Matrix for Decision tree")
plt.ylabel("True labels")
plt.xlabel("Predicted labels")

plt.show()

### ROC CURVE

In [None]:
# ROC curve from the pooled out-of-fold labels and predictions.
# NOTE(review): y_preds holds hard class predictions (output of predict), not
# probabilities or scores, so the curve has very few operating points --
# consider collecting predict_proba output instead.
fpr, tpr, thresholds = roc_curve(y_true=y_tests, y_score=y_preds)

# Model curve first, then the dashed chance diagonal for reference.
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.title('ROC Curve for Decision tree')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## LOGISTIC REGRESSION

In [None]:
# 10-fold cross-validation of logistic regression (C=1) with a shuffled,
# seeded KFold splitter. scores holds one accuracy per fold; y_true / y_preds
# pool the true and predicted labels of every fold.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
lr = LogisticRegression(C=1)
scores, y_preds, y_true = [], [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # The same estimator object is refitted on each fold's training rows.
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    y_true.extend(y_test)
    y_preds.extend(y_pred)
    scores.append(lr.score(X_test, y_test))

# Mean per-fold accuracy, reported with two standard deviations as the spread.
print("Accuracy- Logistic Regression: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))

### CONFUSION MATRIX FOR LOGISTIC REGRESSION

In [None]:
# Confusion matrix over the pooled out-of-fold labels (y_true) and
# predictions (y_preds) from the logistic-regression cross-validation.
cm = confusion_matrix(y_true, y_preds)

# Compact NumPy printing: three decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=3)

# Render the matrix as an annotated heatmap with integer cell labels.
sp.heatmap(data=cm, annot=True, fmt="d", cmap="Blues")

plt.title("Confusion Matrix for Logistic Regression")
plt.ylabel("True labels")
plt.xlabel("Predicted labels")

plt.show()

### ROC curve for Logistic Regression

In [None]:
# ROC curve for logistic regression over ALL folds.
# The original passed y_test / y_pred, which only hold the last fold of the
# cross-validation loop; the pooled lists are y_true / y_preds (the same pair
# the confusion-matrix cell uses).
# NOTE(review): y_preds holds hard class predictions (output of predict), not
# probabilities or scores, so the curve has very few operating points --
# consider collecting predict_proba output instead.
fpr, tpr, thresholds = roc_curve(y_true, y_preds)

# plot ROC curve
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logisitic Regression')
plt.show()

## SUPPORT VECTOR MACHINE

In [None]:
# Stratified 10-fold cross-validation of a default SVC. In each fold the
# classifier is trained on a stratified resample of `sample_size` training
# rows rather than on the full training split.
y_pred = []
y_preds = []
y_true = []

# Number of rows drawn from each fold's training split.
sample_size = 5000

svm = SVC()
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []

for train_index, test_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Seeded draw of sample_size rows, stratified on the training labels.
    X_sampled, y_sampled = resample(
        X_train,
        y_train,
        n_samples=sample_size,
        stratify=y_train,
        random_state=42,
    )

    # Train on the reduced set, then evaluate on the untouched held-out fold.
    svm.fit(X_sampled, y_sampled)
    y_pred = svm.predict(X_test)
    y_true.extend(y_test)
    y_preds.extend(y_pred)
    scores.append(svm.score(X_test, y_test))

# Mean accuracy across the folds.
print("Accuracy- SVM: {:.2f}".format(sum(scores)/len(scores)))



### CONFUSION MATRIX for SVM

In [None]:
# Confusion matrix over the pooled out-of-fold labels (y_true) and
# predictions (y_preds) from the SVM cross-validation.
cm = confusion_matrix(y_true, y_preds)

# Compact NumPy printing: three decimals, no scientific notation.
np.set_printoptions(suppress=True, precision=3)

# Render the matrix as an annotated heatmap with integer cell labels.
sp.heatmap(data=cm, annot=True, fmt="d", cmap="Blues")

plt.title("Confusion Matrix for SVM ")
plt.ylabel("True labels")
plt.xlabel("Predicted labels")

plt.show()

### ROC curve for SVM

In [None]:
# ROC curve for the SVM over ALL folds.
# The original passed y_test / y_pred, which only hold the last fold of the
# cross-validation loop; the pooled lists are y_true / y_preds (the same pair
# the confusion-matrix cell uses).
# NOTE(review): y_preds holds hard class predictions (output of predict), not
# decision scores, so the curve has very few operating points -- consider
# collecting decision_function output instead.
fpr, tpr, thresholds = roc_curve(y_true, y_preds)

# plot ROC curve
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM')
plt.show()

## ENSEMBLE

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# 10-fold cross-validation of a hard-voting ensemble made of a random forest,
# a decision tree and a logistic regression.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One accuracy per fold, plus pooled true labels / predictions of all folds.
scores = []
y_preds = []
y_tests = []

# Base estimators; the forest is seeded so the scores are reproducible
# (consistent with the seeded splitter and decision tree).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(C=1)
ensemble = VotingClassifier(estimators=[('rf', rf), ('dt', dt), ('lr', lr)], voting='hard')

for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # VotingClassifier.fit trains its own clones of rf/dt/lr, so the separate
    # rf.fit / dt.fit / lr.fit calls of the original were redundant work (every
    # base model was trained twice per fold) and have been removed.
    ensemble.fit(X_train, y_train)

    # Evaluate the ensemble on the held-out fold
    y_pred = ensemble.predict(X_test)
    y_preds.extend(y_pred)
    y_tests.extend(y_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Mean per-fold accuracy, reported with two standard deviations as the spread.
mean_score = sum(scores) / len(scores)
std_dev = np.std(scores)

print("Ensemble accuracy: %0.4f (+/- %0.2f)" % (mean_score, std_dev * 2))


### ADABOOSTING

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# 4-fold cross-validation of a hard-voting ensemble made of a random forest
# and an AdaBoost classifier boosted over a decision tree.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# One accuracy per fold, plus pooled true labels / predictions of all folds.
scores = []
y_preds = []
y_tests = []

# Base estimators; the forest is seeded so the scores are reproducible
# (consistent with the seeded splitter and decision tree).
rf = RandomForestClassifier(n_estimators=100, random_state=42)
dt = DecisionTreeClassifier(random_state=42)
ada = AdaBoostClassifier(estimator=dt, n_estimators=100)

# The ensemble definition does not depend on the fold, so it is built once
# here instead of being re-created on every loop iteration.
ensemble = VotingClassifier(estimators=[('rf', rf), ('ada', ada)], voting='hard')

for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # VotingClassifier.fit trains its own clones of rf and ada (and AdaBoost
    # clones dt), so the separate rf.fit / dt.fit / ada.fit calls of the
    # original were redundant work and have been removed.
    ensemble.fit(X_train, y_train)

    # Evaluate the ensemble on the held-out fold
    y_pred = ensemble.predict(X_test)
    y_preds.extend(y_pred)
    y_tests.extend(y_test)
    score = accuracy_score(y_test, y_pred)
    scores.append(score)

# Mean per-fold accuracy, reported with two standard deviations as the spread.
mean_score = sum(scores) / len(scores)
std_dev = np.std(scores)

print("AdaBoost Ensemble accuracy: %0.4f (+/- %0.2f)" % (mean_score, std_dev * 2))