In [3]:
#import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

In [4]:
data = pd.read_csv("datasets/Symptom2Disease.csv")
description_data = pd.read_csv("datasets/disease_description.csv")
precaution_data = pd.read_csv("datasets/disease_precautions.csv")


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.3+ KB


In [6]:
data.describe(include='object')

Unnamed: 0,label,text
count,1200,1200
unique,24,1153
top,Psoriasis,"I've been feeling extremely scratchy, sick, an..."
freq,50,4


In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [8]:
diseases= data["label"].unique()
diseases

array(['Psoriasis', 'Varicose Veins', 'Typhoid', 'Chicken Pox',
       'Impetigo', 'Dengue', 'Fungal Infection', 'Common Cold',
       'Pneumonia', 'Dimorphic Hemorrhoids', 'Arthritis', 'Acne',
       'Bronchial Asthma', 'Hypertension', 'Migraine',
       'Cervical Spondylosis', 'Jaundice', 'Malaria',
       'Urinary Tract Infection', 'Allergy',
       'Gastroesophageal Reflux Disease', 'Drug Reaction',
       'Peptic Ulcer Disease', 'Diabetes'], dtype=object)

In [9]:
data["text"][0]

'I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.'

In [10]:
data['label'].value_counts()

label
Psoriasis                          50
Varicose Veins                     50
Typhoid                            50
Chicken Pox                        50
Impetigo                           50
Dengue                             50
Fungal Infection                   50
Common Cold                        50
Pneumonia                          50
Dimorphic Hemorrhoids              50
Arthritis                          50
Acne                               50
Bronchial Asthma                   50
Hypertension                       50
Migraine                           50
Cervical Spondylosis               50
Jaundice                           50
Malaria                            50
Urinary Tract Infection            50
Allergy                            50
Gastroesophageal Reflux Disease    50
Drug Reaction                      50
Peptic Ulcer Disease               50
Diabetes                           50
Name: count, dtype: int64

In [11]:
num_of_classes = data['label'].nunique()
num_of_classes

24

In [12]:
data.isnull().sum()

Unnamed: 0    0
label         0
text          0
dtype: int64

## Cleaning Data

In [22]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\arsha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
stop_words = set(stopwords.words("english"))

In [15]:
def clean_text(sent):
    #remove punctuations
    sent = sent.translate(str.maketrans('','',string.punctuation)).strip()

    #remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sent)
    words = [word for word in words if word not in stop_words]

    return " ".join(words).lower()

In [23]:
data["text"] = data["text"].apply(clean_text)

In [18]:
data

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."
...,...,...,...
1195,295,Diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,Diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,Diabetes,I regularly experience these intense urges and...
1198,298,Diabetes,"I have trouble breathing, especially outside. ..."


In [25]:
# Removing some words to filter the data
words_to_remove = ["i", "ive", "my", "im"]

for word_to_remove in words_to_remove:
    data['text'] = data['text'].str.replace(r'\b{}\b'.format(word_to_remove), '', regex=True)

In [26]:
data

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,experiencing skin rash arms legs torso past w...
1,1,Psoriasis,skin peeling especially knees elbows scalp th...
2,2,Psoriasis,experiencing joint pain fingers wrists knees ...
3,3,Psoriasis,there silver like dusting skin especially lowe...
4,4,Psoriasis,nails small dents pits often feel inflammator...
...,...,...,...
1195,295,Diabetes,shaking trembling lost sense taste smell ex...
1196,296,Diabetes,particularly crevices skin skin rashes irrita...
1197,297,Diabetes,regularly experience intense urges want urina...
1198,298,Diabetes,trouble breathing especially outside start f...


In [27]:
x_train,x_test,y_train,y_test = train_test_split(data["text"], data["label"], 
test_size=0.2, random_state=42)

In [28]:
tfidf_vectorizer =  TfidfVectorizer(max_features=1500)

In [29]:
tfidf_train =  tfidf_vectorizer.fit_transform(x_train).toarray()
tfidf_test = tfidf_vectorizer.transform(x_test).toarray()

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(tfidf_train, y_train)

In [30]:
# Initializing classifier
svm = SVC(kernel='linear')  # You can choose different kernels such as 'rbf' or 'poly'

In [31]:
svm.fit(tfidf_train, y_train)

In [32]:
predictions = svm.predict(tfidf_test)

In [33]:
def report(y_test,predictions):
    """Function to create classification report"""
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy:.2f}')
    print(classification_report(y_test, predictions))

In [34]:
report(y_test,predictions)

Accuracy: 0.99
                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                        Allergy       1.00      1.00      1.00        12
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical Spondylosis       1.00      1.00      1.00         7
                    Chicken Pox       0.92      0.92      0.92        12
                    Common Cold       1.00      1.00      1.00        12
                         Dengue       0.92      0.92      0.92        12
                       Diabetes       1.00      1.00      1.00         8
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
                  Drug Reaction       1.00      1.00      1.00         5
               Fungal Infection       1.00      1.00      1.00        13
Gastroesophageal Reflux Disease    

In [35]:
def svm_output(model,text):
    text =clean_text(text)
    tfidf_text = tfidf_vectorizer.transform([text]).toarray()
    disease = model.predict(tfidf_text)
    return disease[0]

In [36]:
symp1="My eyes are red and itchy, and my nose feels all stuffy and congested"
svm_output(svm,symp1)

'Common Cold'

In [37]:
symp2="eyes are red and itchy"
svm_output(svm,symp2)

'Allergy'

In [38]:
symp3="Fever and Cough with phlegm Shortness of breath Chest pain Fatigue"
svm_output(svm,symp3)

'Pneumonia'