In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
dataset = pd.read_csv("Symptom2Disease.csv")
dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1200):
  review = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
print(corpus)

['experienc skin rash arm leg torso past week red itchi cover dri scali patch', 'skin peel especi knee elbow scalp peel often accompani burn sting sensat', 'experienc joint pain finger wrist knee pain often achi throb get wors move joint', 'silver like dust skin especi lower back scalp dust made small scale flake easili scratch', 'nail small dent pit often feel inflammatori tender touch even minor rash arm', 'skin palm sole thicken deep crack crack pain bleed easili', 'skin around mouth nose eye red inflam often itchi uncomfort notic inflamm nail', 'skin sensit react easili chang temperatur humid often care product use skin', 'notic sudden peel skin differ part bodi mainli arm leg back also face sever joint pain skin rash', 'skin genit red inflam often itchi burn uncomfort rash differ part bodi', 'experienc fatigu gener feel malais often feel tire lack energi even good night sleep', 'rash skin spread part bodi includ chest abdomen itchi uncomfort often wors night also face skin peel', 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
#from sklearn.preprocessing import LabelEncoder
#le = LabelEncoder()
#y = le.fit_transform(y)

In [6]:
print(X)
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['Psoriasis' 'Psoriasis' 'Psoriasis' ... 'diabetes' 'diabetes' 'diabetes']


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [8]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [9]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['Chicken pox' 'Chicken pox']
 ['Fungal infection' 'Fungal infection']
 ['Psoriasis' 'Psoriasis']
 ['Dengue' 'Dengue']
 ['Bronchial Asthma' 'Bronchial Asthma']
 ['Arthritis' 'Arthritis']
 ['Chicken pox' 'Chicken pox']
 ['allergy' 'allergy']
 ['Typhoid' 'peptic ulcer disease']
 ['drug reaction' 'drug reaction']
 ['Fungal infection' 'Fungal infection']
 ['drug reaction' 'drug reaction']
 ['Fungal infection' 'Fungal infection']
 ['Common Cold' 'Common Cold']
 ['Common Cold' 'Common Cold']
 ['Pneumonia' 'Pneumonia']
 ['allergy' 'allergy']
 ['Impetigo' 'Impetigo']
 ['Varicose Veins' 'Varicose Veins']
 ['drug reaction' 'drug reaction']
 ['allergy' 'allergy']
 ['allergy' 'allergy']
 ['drug reaction' 'drug reaction']
 ['drug reaction' 'drug reaction']
 ['allergy' 'allergy']
 ['Psoriasis' 'Psoriasis']
 ['Migraine' 'Migraine']
 ['Impetigo' 'Impetigo']
 ['Malaria' 'Malaria']
 ['peptic ulcer disease' 'peptic ulcer disease']
 ['Psoriasis' 'Psoriasis']
 ['Psoriasis' 'Psoriasis']
 ['Jaundice' 'Jaund

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[13  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  9  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  8  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0

0.9666666666666667

In [11]:
'''
test_text = ['I have a red eyes, runny nose. I am having cough and cold as well']
#test_text = ['My stomach hurts and I feel nauseous. I have diarrhea and I feel weak.']
#test_text = ['red, scaly patches on the skin, itching, dryness, cracking, and bleeding of the skin, swollen and stiff joints, and nail abnormalities.']

print(f"The symptom is: \n {test_text[0]}")
X1 = cv.transform(test_text).toarray()
y_pred = classifier.predict(X1)
print('The possible disease is: ', y_pred)
'''

'\ntest_text = [\'I have a red eyes, runny nose. I am having cough and cold as well\']\n#test_text = [\'My stomach hurts and I feel nauseous. I have diarrhea and I feel weak.\']\n#test_text = [\'red, scaly patches on the skin, itching, dryness, cracking, and bleeding of the skin, swollen and stiff joints, and nail abnormalities.\']\n\nprint(f"The symptom is: \n {test_text[0]}")\nX1 = cv.transform(test_text).toarray()\ny_pred = classifier.predict(X1)\nprint(\'The possible disease is: \', y_pred)\n'

In [12]:
import pickle
pickle.dump(classifier,open('model1.pkl','wb'))

In [13]:
with open('CountVectorizer.pkl', 'wb') as file:
    pickle.dump(cv, file)