In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("respiratory_clean.csv")
df = data.copy()

In [3]:
df.shape

(305, 63)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 63 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   sex                                 305 non-null    object
 1   age                                 305 non-null    int64 
 2   duration                            305 non-null    int64 
 3   cough                               305 non-null    int64 
 4   night_fever                         305 non-null    int64 
 5   catarrh                             305 non-null    int64 
 6   cold                                305 non-null    int64 
 7   tenderness                          305 non-null    int64 
 8   headache                            305 non-null    int64 
 9   black_stool                         305 non-null    int64 
 10  jerking_movement_of_the_hands       305 non-null    int64 
 11  unable_to_sit_unsupported           305 non-null    int64 

In [5]:
print("Unique values in sex column:", df['sex'].unique())

Unique values in sex column: ['M' 'F']


In [6]:
print("Unique values  of Diseases:", df['symptoms'].unique())

Unique values  of Diseases: ['MALARIA' 'URTI' 'HEAMORRHAGIC DISEASE' 'PERINATAL  ASPHYXIA'
 'FERNATAL ASPYHXIA' 'NNS' 'MACROSOMU BABY' 'RVD-RETRO VIRUS DISEASE'
 'GET SEPSIS' 'SEVERE MALARIA' 'HYPERPYREXIA' 'ACUTE URTI' 'PNEUMONIA'
 'BPN' 'DIARRHEA' 'DEHYDRATION' 'ASTHMA' 'POORLY TREATED MALARIA'
 'ACUTE ASTHMA' 'CARDIAC CASE' 'UNTREATED MALARIA' 'INGUINAL HERNIA'
 'ECZEMA' 'FUMCULOSIS' 'UNCOMPLICATED MALARIA'
 'PELVIC INFLAMMATORY DISEASE' 'HYPERTENSIVE HEART DISEASE' 'SEPSIS'
 'COUGH' 'HEUGRAGHIC TONGUE' 'ORAL THRUSH' 'AIRWAY OBSTRUCTION'
 'HYPERTENSION' 'MALARIA WITH PEPSIS' 'BRONCHOPNEUMONIA' 'MALARIAL'
 'E-FEVER' 'PEPTIC ULCER D2' 'CHRONIC KIDNEY DISEASE (CKD)'
 'RIGHT VENTRICULAR DYSPLASIA (RVD)' 'SALMONELLOSIS' 'UTI'
 'PYELONEPHRITIS' 'TONSILLITIS' 'SEVERE ANEMIA' 'HYDROCELL' 'PID'
 'PHERYGITIS' 'DELAYED SPEECH' 'OBSTRUCTIVE ADENOID' 'OBSTRUCTIVE AXIS']


In [7]:
print('Unique number of diseases:', df['symptoms'].nunique())
print('Unique number of diseases include:', df['symptoms'].value_counts())

Unique number of diseases: 51
Unique number of diseases include: symptoms
URTI                                 135
MALARIA                               61
UNCOMPLICATED MALARIA                  9
PNEUMONIA                              6
SEVERE MALARIA                         6
ACUTE URTI                             4
SEPSIS                                 4
UNTREATED MALARIA                      4
DIARRHEA                               4
UTI                                    3
PELVIC INFLAMMATORY DISEASE            3
HYPERPYREXIA                           2
HYPERTENSIVE HEART DISEASE             2
OBSTRUCTIVE ADENOID                    2
DELAYED SPEECH                         2
PHERYGITIS                             2
PID                                    2
TONSILLITIS                            2
MALARIA WITH PEPSIS                    2
HYPERTENSION                           2
GET SEPSIS                             2
ORAL THRUSH                            2
HEUGRAGHIC TONGUE       

In [8]:
# Encode sex as 1 (male) / 0 (female).
# The values are read from the untouched `data` frame rather than from `df`
# itself, so re-running this cell gives the same result instead of mapping the
# already-encoded 1/0 values to NaN.
if 'sex' in df.columns:
    df['sex'] = data['sex'].map({'M': 1, 'F': 0})

In [9]:
from sklearn.preprocessing import LabelEncoder

In [12]:
def le_encoder(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by a fresh
    ``LabelEncoder``, and the code -> label mapping of every encoded column is
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    dict
        ``{column_name: {code: original_label}}`` for each encoded column;
        empty when ``df`` has no object-dtype columns.
    """
    mappings = {}
    for col in df.columns:
        if df[col].dtype != 'object':
            continue

        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])

        # Report the mapping while `col` and `label_encoder` still refer to the
        # column that was just encoded. Doing this after the loop would only
        # describe the last column iterated and would fail with a NameError
        # when no column needed encoding.
        mappings[col] = dict(enumerate(label_encoder.classes_))
        print(f"Mapping for {col}:")
        for i, label in mappings[col].items():
            print(f"  {i} -> {label}")

    return mappings

In [14]:
le_encoder(df)

Mapping for symptoms:
  0 -> ACUTE ASTHMA
  1 -> ACUTE URTI
  2 -> AIRWAY OBSTRUCTION
  3 -> ASTHMA
  4 -> BPN
  5 -> BRONCHOPNEUMONIA
  6 -> CARDIAC CASE
  7 -> CHRONIC KIDNEY DISEASE (CKD)
  8 -> COUGH
  9 -> DEHYDRATION
  10 -> DELAYED SPEECH
  11 -> DIARRHEA
  12 -> E-FEVER
  13 -> ECZEMA
  14 -> FERNATAL ASPYHXIA
  15 -> FUMCULOSIS
  16 -> GET SEPSIS
  17 -> HEAMORRHAGIC DISEASE
  18 -> HEUGRAGHIC TONGUE
  19 -> HYDROCELL
  20 -> HYPERPYREXIA
  21 -> HYPERTENSION
  22 -> HYPERTENSIVE HEART DISEASE
  23 -> INGUINAL HERNIA
  24 -> MACROSOMU BABY
  25 -> MALARIA
  26 -> MALARIA WITH PEPSIS
  27 -> MALARIAL
  28 -> NNS
  29 -> OBSTRUCTIVE ADENOID
  30 -> OBSTRUCTIVE AXIS
  31 -> ORAL THRUSH
  32 -> PELVIC INFLAMMATORY DISEASE
  33 -> PEPTIC ULCER D2
  34 -> PERINATAL  ASPHYXIA
  35 -> PHERYGITIS
  36 -> PID
  37 -> PNEUMONIA
  38 -> POORLY TREATED MALARIA
  39 -> PYELONEPHRITIS
  40 -> RIGHT VENTRICULAR DYSPLASIA (RVD)
  41 -> RVD-RETRO VIRUS DISEASE
  42 -> SALMONELLOSIS
  43 -> SEPS

In [15]:
df.head(2)

Unnamed: 0,sex,age,duration,cough,night_fever,catarrh,cold,tenderness,headache,black_stool,...,vaginal_discharge,vaginal_itching,stooling,running_nose,club_pain,bleeding_from_the_nose,unable_to_speak,mouth_breathing,sleep_apneas,symptoms
0,1,13,7,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,25
1,1,13,7,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,49


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Target: the encoded diagnosis label; features: every remaining column
y = df['symptoms']
X = df.drop(columns=['symptoms'])

In [18]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(244, 62) (61, 62) (244,) (61,)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay,\
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve,confusion_matrix

In [22]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(true, predicted):
    """Score a set of predictions against their ground-truth labels.

    Returns an ``(accuracy, report)`` tuple: the value of ``accuracy_score``
    and the text produced by ``classification_report`` for the same inputs.
    """
    return (
        accuracy_score(true, predicted),
        classification_report(true, predicted),
    )

In [23]:
# Candidate classifiers keyed by display name. Estimators that accept a seed
# get one, so the comparison below is the same on every run.
models = {
    "Logistic Regression": LogisticRegression(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(criterion='entropy', random_state=0),
    "Random Forest Classifiier": RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=42),
    "SVM": svm.SVC(kernel='linear')
}

# Parallel lists consumed by the leaderboard cell: model name and test accuracy.
model_list = []
acc_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_accuracy, model_train_classification = evaluate_model(y_train, y_train_pred)
    model_test_accuracy, model_test_classification = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Accuracy Score: {:.4f}".format(model_train_accuracy))
    print("- Classification Report:")
    print(model_train_classification)

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy Score: {:.4f}".format(model_test_accuracy))
    print("- Classification Report:")
    print(model_test_classification)

    # Only the test accuracy is kept for ranking the models afterwards
    acc_list.append(model_test_accuracy)

    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy Score: 0.5000
- Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         1
          18       0.00      0

In [24]:
# Rank the evaluated models by test accuracy, best first
(
    pd.DataFrame({'Model Name': model_list, 'Accuracy_Score': acc_list})
    .sort_values(by='Accuracy_Score', ascending=False)
)

Unnamed: 0,Model Name,Accuracy_Score
1,Gradient Boosting Classifier,0.442623
3,Random Forest Classifiier,0.442623
2,Decision Tree Classifier,0.377049
4,SVM,0.377049
0,Logistic Regression,0.344262


## Using the best Model

In [25]:
# Refit gradient boosting with hand-picked hyperparameters.
# random_state is fixed so the reported score is the same on every run.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=5,
    random_state=42,
)
gbc.fit(X_train, y_train)

# Score the refitted model on the held-out test set
y_predg = gbc.predict(X_test)
print("Accuracy Score: ", round(accuracy_score(y_test,y_predg)*100, 2), "%")
print("Classification Report: \n", classification_report(y_test,y_predg))

Accuracy Score:  49.18 %
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         0
          24       0.50      1.00      0.67         1
          25       0.33      0.11      0.17         9
          28       0.00      0.00      0.00         1
          29       1.00      1.

## Testing the Model

In [27]:
# Diagnosis labels listed in code order: the position of a label in this list
# is the integer code the model predicts for it, so the order must match the
# mapping printed by le_encoder(df).
# NOTE: entry 34 is written with a single space, whereas the encoder's class
# label for that code is printed with two ("PERINATAL  ASPHYXIA").
_diagnosis_labels = [
    "ACUTE ASTHMA",
    "ACUTE URTI",
    "AIRWAY OBSTRUCTION",
    "ASTHMA",
    "BPN",
    "BRONCHOPNEUMONIA",
    "CARDIAC CASE",
    "CHRONIC KIDNEY DISEASE (CKD)",
    "COUGH",
    "DEHYDRATION",
    "DELAYED SPEECH",
    "DIARRHEA",
    "E-FEVER",
    "ECZEMA",
    "FERNATAL ASPYHXIA",
    "FUMCULOSIS",
    "GET SEPSIS",
    "HEAMORRHAGIC DISEASE",
    "HEUGRAGHIC TONGUE",
    "HYDROCELL",
    "HYPERPYREXIA",
    "HYPERTENSION",
    "HYPERTENSIVE HEART DISEASE",
    "INGUINAL HERNIA",
    "MACROSOMU BABY",
    "MALARIA",
    "MALARIA WITH PEPSIS",
    "MALARIAL",
    "NNS",
    "OBSTRUCTIVE ADENOID",
    "OBSTRUCTIVE AXIS",
    "ORAL THRUSH",
    "PELVIC INFLAMMATORY DISEASE",
    "PEPTIC ULCER D2",
    "PERINATAL ASPHYXIA",
    "PHERYGITIS",
    "PID",
    "PNEUMONIA",
    "POORLY TREATED MALARIA",
    "PYELONEPHRITIS",
    "RIGHT VENTRICULAR DYSPLASIA (RVD)",
    "RVD-RETRO VIRUS DISEASE",
    "SALMONELLOSIS",
    "SEPSIS",
    "SEVERE ANEMIA",
    "SEVERE MALARIA",
    "TONSILLITIS",
    "UNCOMPLICATED MALARIA",
    "UNTREATED MALARIA",
    "URTI",
    "UTI",
]

# Predicted integer code -> human-readable diagnosis
diagnosis_mapping = dict(enumerate(_diagnosis_labels))

In [29]:
# One patient's feature values, in the same order as the columns of X
input_data = (1,13,7,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1)

# convert input data into numpy array
input_data_as_numpy = np.asarray(input_data)

#reshape the array since we are predicting for one instance
input_data_reshape = input_data_as_numpy.reshape(1,-1)

# The model was fitted on a DataFrame, so label the row with the training
# columns: a wrong number of values fails here with a clear error, and the
# estimator receives the same feature names it saw during fit.
input_features = pd.DataFrame(input_data_reshape, columns=X.columns)

#making prediction
prediction = gbc.predict(input_features)
print (prediction)

# Get the diagnosis based on the prediction
predicted_diagnosis = diagnosis_mapping.get(prediction[0], "Unknown Diagnosis")
print(f"This patient has been diagnosed with: {predicted_diagnosis}")

[6]
This patient has been diagnosed with: CARDIAC CASE


In [30]:
import pickle

In [39]:
filename = 'trained_model.pkl'

# Write through a context manager so the handle is flushed and closed even if
# pickling fails part-way.
with open(filename, 'wb') as model_file:
    pickle.dump(gbc, model_file)

In [40]:
# loading the saved model
# NOTE: unpickling can execute arbitrary code, so only load a file that this
# notebook (or another trusted source) produced.
with open('trained_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [37]:
# A second patient's feature values, in the same order as the columns of X
input_data = (0,40,10,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1)

# convert input data into numpy array
input_data_as_numpy = np.asarray(input_data)

#reshape the array since we are predicting for one instance
input_data_reshape = input_data_as_numpy.reshape(1,-1)

# Label the row with the training columns so the value count is checked and the
# estimator receives the feature names it was fitted with.
input_features = pd.DataFrame(input_data_reshape, columns=X.columns)

#making prediction
# Predict with the model restored from disk, so this cell actually exercises
# the pickle round trip instead of the in-memory `gbc`.
prediction = loaded_model.predict(input_features)
print (prediction)

# Get the diagnosis based on the prediction
predicted_diagnosis = diagnosis_mapping.get(prediction[0], "Unknown Diagnosis")
print(f"This patient has been diagnosed with: {predicted_diagnosis}")

[6]
This patient has been diagnosed with: CARDIAC CASE


In [38]:
# List the feature names, one per line, in the column order of X
feature_names = X.columns.tolist()
for feature_name in feature_names:
    print(feature_name)

sex
age
duration
cough
night_fever
catarrh
cold
tenderness
headache
black_stool
jerking_movement_of_the_hands
unable_to_sit_unsupported
throat_fever
fast_breathing
body_rashes
cough_sills
frequent_stooling
chest_pain
frequent_vomiting
sleep_disturbance_at_night
nasal_stuffiness
excessive_sweating
chronic_cough
profuse_nasal_discharge
difficulty_in_breathing
itching_ear
frequent_urination
swelling_of_pubic_area_right_sided
generalized_body_rashes
painful_swallowing
passage_of_freight
poor_vision
frequent_micturition
watery_of_the_tongue
poor_appetite
noisy_breathing
pepsis_swells_up_since_birth
body_weakness
flame_pain
swollen_leg
abdominal_pain
excessive_salivation
crawling_sensation
weight_loss
g.b_pain
stomach_ache
tysing_of_the_right_ear
refusal_of_food
waist_pain
snoring
knee_pain
dizziness
scrotal_swelling
vaginal_discharge
vaginal_itching
stooling
running_nose
club_pain
bleeding_from_the_nose
unable_to_speak
mouth_breathing
sleep_apneas
