In [1]:
import pandas as pd 
import numpy as np
#modelling libraries 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  accuracy_score,precision_score,recall_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re  # noqa: F401  # used by TextPreprocessor.clean_text in another cell
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#Importing the classes from the init module 
from preprocessing import Explore,Clean,TextPreprocessor



In [2]:
df = pd.read_csv('Symptom2Disease.csv',index_col= False)
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
# Instantiate the Explore class and the clean class 
explore_df = Explore(df)
clean_df = Clean(df)

In [4]:
# Check the shape of the data
explore_df.shape()

----------------Shape of the Dataset---------------- 

(1200, 3)


In [5]:
#Check the columns of the data
explore_df.features()

----------------Features in the Dataset---------------- 

Index(['Unnamed: 0', 'label', 'text'], dtype='object')


In [6]:
#Check the summary statistics of the data
explore_df.stats()

----------------Summary Statistics of the Features---------------- 

        Unnamed: 0
count  1200.000000
mean    149.500000
std      86.638166
min       0.000000
25%      74.750000
50%     149.500000
75%     224.250000
max     299.000000


In [7]:
#Check the basic information of the data
explore_df.info()

----------------Dataset Overall Information---------------- 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.3+ KB
None


In [8]:
clean_df.missing_duplicated()


 Duplicated Rows:

- Total duplicated rows: 0 
 



Unnamed: 0,Missing Values,Percentage(%)
Unnamed: 0,0,0.0
label,0,0.0
text,0,0.0


Drop the unnecessary column from the dataframe ie 'Unnamed: 0'

In [9]:
df = df.drop(columns = ["Unnamed: 0"])
df.columns

Index(['label', 'text'], dtype='object')

In [11]:
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


In [12]:
preprocessor = TextPreprocessor()
data = preprocessor.preprocess(df,"text")

In [13]:
data.head()

Unnamed: 0,label,text,char_count,word_count,sentence_count,cleaned_text,tokenized_text,lemmatized_text,document
0,Psoriasis,I have been experiencing a skin rash on my arm...,141,28,3,i have been experiencing a skin rash on my arm...,"[experiencing, skin, rash, arms, legs, torso, ...","[experiencing, skin, rash, arm, leg, torso, pa...",experiencing skin rash arms legs torso past we...
1,Psoriasis,"My skin has been peeling, especially on my kne...",138,23,3,my skin has been peeling especially on my knee...,"[skin, peeling, especially, knees, elbows, sca...","[skin, peeling, especially, knee, elbow, scalp...",skin peeling especially knees elbows scalp pee...
2,Psoriasis,I have been experiencing joint pain in my fing...,148,28,3,i have been experiencing joint pain in my fing...,"[experiencing, joint, pain, fingers, wrists, k...","[experiencing, joint, pain, finger, wrist, kne...",experiencing joint pain fingers wrists knees p...
3,Psoriasis,"There is a silver like dusting on my skin, esp...",164,32,3,there is a silver like dusting on my skin espe...,"[silver, like, dusting, skin, especially, lowe...","[silver, like, dusting, skin, especially, lowe...",silver like dusting skin especially lower back...
4,Psoriasis,"My nails have small dents or pits in them, and...",140,27,3,my nails have small dents or pits in them and ...,"[nails, small, dents, pits, often, feel, infla...","[nail, small, dent, pit, often, feel, inflamma...",nails small dents pits often feel inflammatory...


In [14]:
data.columns

Index(['label', 'text', 'char_count', 'word_count', 'sentence_count',
       'cleaned_text', 'tokenized_text', 'lemmatized_text', 'document'],
      dtype='object')

## Modelling 

In [15]:
le = LabelEncoder()
data["label_code"] = le.fit_transform(data['label'])

In [16]:
data.columns

Index(['label', 'text', 'char_count', 'word_count', 'sentence_count',
       'cleaned_text', 'tokenized_text', 'lemmatized_text', 'document',
       'label_code'],
      dtype='object')

In [17]:
data["label"].value_counts()

label
Psoriasis                          50
Varicose Veins                     50
Typhoid                            50
Chicken pox                        50
Impetigo                           50
Dengue                             50
Fungal infection                   50
Common Cold                        50
Pneumonia                          50
Dimorphic Hemorrhoids              50
Arthritis                          50
Acne                               50
Bronchial Asthma                   50
Hypertension                       50
Migraine                           50
Cervical spondylosis               50
Jaundice                           50
Malaria                            50
urinary tract infection            50
allergy                            50
gastroesophageal reflux disease    50
drug reaction                      50
peptic ulcer disease               50
diabetes                           50
Name: count, dtype: int64

In [18]:
#View the target columns 
data[["label","label_code"]]

Unnamed: 0,label,label_code
0,Psoriasis,15
1,Psoriasis,15
2,Psoriasis,15
3,Psoriasis,15
4,Psoriasis,15
...,...,...
1195,diabetes,19
1196,diabetes,19
1197,diabetes,19
1198,diabetes,19


In [19]:
data1 = data.copy()

In [20]:
# data = data.drop(columns = ["label"])
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data["document"])
y = data["label_code"]

In [None]:
import pickle 
# Save the vectorizer
with open("model/vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

In [22]:
X_train,y_train,X_test,y_test = train_test_split(X_vectorized,y,test_size = 0.2,random_state = 42)

## Logistic Regression Model

In [23]:


# Encode labels
le = LabelEncoder()
data["label_code"] = le.fit_transform(data['label'])

# Vectorize documents
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(data["document"])
y = data["label_code"]

# Correct order of train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, random_state=42
)

# Train Logistic Regression
lr = LogisticRegression(random_state=42, max_iter=1000)  # added max_iter for convergence
lr.fit(X_train, y_train)

# Predictions
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Evaluation
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Train Accuracy: 1.0
Test Accuracy: 0.9766666666666667

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        12
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        11
           4       0.80      1.00      0.89        12
           5       1.00      0.80      0.89        15
           6       1.00      0.93      0.96        14
           7       1.00      1.00      1.00        10
           8       1.00      1.00      1.00        14
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        13
          11       1.00      1.00      1.00        15
          12       1.00      1.00      1.00        14
          13       1.00      1.00      1.00        11
          14       1.00      1.00      1.00        15
          15       1.00      0.89      0.94         9
  

## Random Forest Classifier

In [24]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))



Train Accuracy: 1.0
Test Accuracy: 0.9766666666666667

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        12
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        11
           4       0.80      1.00      0.89        12
           5       1.00      0.93      0.97        15
           6       0.81      0.93      0.87        14
           7       1.00      1.00      1.00        10
           8       1.00      1.00      1.00        14
           9       1.00      1.00      1.00        13
          10       1.00      1.00      1.00        13
          11       1.00      1.00      1.00        15
          12       1.00      1.00      1.00        14
          13       1.00      1.00      1.00        11
          14       1.00      1.00      1.00        15
          15       1.00      0.89      0.94         9
  

In [None]:

with open("model/model.pkl", "wb") as f:
    pickle.dump(rf, f)