## In this notebook, we will test the models that take a new input and predict/classify the disease

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
import re
import spacy
import numpy 
from numpy import dot
from numpy.linalg import norm
nlp = spacy.load('en_core_web_sm')
import pickle 

### Read the Data & pickle files

In [8]:
# Restore the pickled stop words list used later when cleaning disease text.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('stop_words.ob', 'rb') as stop_words_file:
    domain_stop_word = pickle.load(stop_words_file)

In [10]:
# Chapter-level data set; its 'Description' column is vectorised in the next section.
file_path = "diseases_with_description.csv"
all_chapters = pd.read_csv(filepath_or_buffer=file_path)

### Word Embedding

In [11]:
# Raw-count and TF-IDF encoders; both use scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words="english")
cv_tfidf = TfidfVectorizer(stop_words="english")

# Extract the chapter descriptions once and fit both encoders on the same texts.
descriptions = list(all_chapters.loc[:, 'Description'])
X = cv.fit_transform(descriptions)
X_tfidf = cv_tfidf.fit_transform(descriptions)

# Dense, column-labelled copies so single chapter rows can be selected with .iloc later.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
df_cv = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=cv_tfidf.get_feature_names_out())



### First-Step: Cosine Similarity

In [16]:
def cosine(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Numeric vectors of equal length.

    Returns
    -------
    float
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = norm(v1) * norm(v2)
    # A zero-norm vector (e.g. a text sharing no word with the fitted
    # vocabulary) would otherwise produce 0/0 -> nan and a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

# Symptom description that is compared against every chapter description.
new_text = ["dizziness loss of balance  vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes "]

# Encode the query with the vocabularies already fitted on the chapter descriptions.
new_text_cv = cv.transform(new_text).toarray()[0]
new_text_tfidf = cv_tfidf.transform(new_text).toarray()[0]

# Print the count-based and the TF-IDF-based similarity for each chapter row.
chapter_count = int(all_chapters.shape[0])
for chpter_number in range(chapter_count):
    print(f"This is chpter number : {chpter_number} ")
    cv_score = cosine(df_cv.iloc[chpter_number], new_text_cv)
    print(f"Cosin cv :    {cv_score} ")
    tfidf_score = cosine(df_tfidf.iloc[chpter_number], new_text_tfidf)
    print(f"Cosin TFIDF : {tfidf_score} ")


This is chpter number : 0 
Cosin cv :    0.08085577841914385 
Cosin TFIDF : 0.07214673217715502 
This is chpter number : 1 
Cosin cv :    0.11112511112516668 
Cosin TFIDF : 0.106959402823985 
This is chpter number : 2 
Cosin cv :    0.0 
Cosin TFIDF : 0.0 


As the previous output shows, we tested the sentence **"dizziness loss of balance vomiting tinnitus of hearing in the high frequency range in one ear difficulty focusing your eyes"**, which describes the symptoms of a disease.

**The highest score is clearly in chapter Two (index 1 in the zero-based output above), which is Ear & Nose. We will therefore use the classification model of chapter two to classify the new input and get the disease name.**

### Second Step: Classify a new input with the trained **nar_nose_model_lr** model

In [20]:
# Labelled disease texts; the two columns are named explicitly on load.
file_path = "ch13_label_sub_m1.csv"
df = pd.read_csv(
    file_path,
    names=["Label", "Disease"],
)

In [21]:
def clean_text_func(text, stop_words=None):
    """Normalise free text and drop stop words.

    The input is converted to ``str`` and lower-cased. Every character other
    than ``a``-``z``, ``^`` and ``/`` is treated as a separator, which removes
    digits, punctuation and non-ASCII characters. The remaining
    whitespace-separated tokens are kept unless they appear in ``stop_words``.

    Parameters
    ----------
    text : object
        Value to clean; it is passed through ``str()`` first.
    stop_words : container of str, optional
        Tokens to discard. Defaults to the module-level ``domain_stop_word``.

    Returns
    -------
    str
        The kept tokens, each followed by a single space (so a non-empty
        result ends with a trailing space); ``""`` when no token is kept.
    """
    if stop_words is None:
        stop_words = domain_stop_word

    lowered = str(text).lower()
    # One pass replaces the former chain of substitutions: after lower-casing,
    # only a-z, '^' and '/' survived that chain, everything else became a space.
    separated = re.sub(r"[^a-z^/]", " ", lowered)

    kept_tokens = [token for token in separated.split() if token not in stop_words]
    # Join once instead of growing a string inside the loop.
    return "".join(token + " " for token in kept_tokens)

# Clean every disease text; the function is passed directly, no lambda wrapper needed.
df['Disease'] = df['Disease'].apply(clean_text_func)
df.head()

Unnamed: 0,Label,Disease
0,OTITIS EXTERNA,otitis externa characteristically produces exa...
1,BENIGN TUMORS OF THE EAR CANAL,ear tumor unless becomes case signof tumor tum...
2,OTITIS MEDIA,features suppurative otitis media throbbing be...
3,MASTOIDITIS,features ache tenderness mastoid process grade...
4,OTOSCLEROSIS,bone otic capsule immobilizes footplate normal...


In [25]:
# The cleaned disease texts are the features and the disease labels are the targets.
X_train = df.Disease
y_train = df.Label

# Bag-of-words encoding fitted on the cleaned training texts.
cv1 = CountVectorizer()
X_train_cv1 = cv1.fit_transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement.
pd_cv1 = pd.DataFrame(X_train_cv1.toarray(), columns=cv1.get_feature_names_out())

# Logistic regression with default settings, trained on the sparse count matrix.
nar_nose_model_lr = LogisticRegression()
nar_nose_model_lr.fit(X_train_cv1, y_train)



LogisticRegression()

In [26]:
# Free-text symptom description to classify.
X_test = "Difficulty sleeping or staying asleep Fever Fluid draining from ear  Loss of balance. Hearing difficulties. Ear pain"

# Clean it the same way as the training texts, then encode it with the
# vocabulary already fitted in cv1 (transform only, no re-fitting).
cleaned_text = clean_text_func(X_test)
X_test_cv3 = cv1.transform([cleaned_text])

# predict() returns one label per encoded input text.
y_pred_cv3 = nar_nose_model_lr.predict(X_test_cv3)

In [27]:
# disease_name refers to the whole prediction array (one label per input text).
disease_name = y_pred_cv3
print(disease_name)

['OTITIS MEDIA ']
