*INSTALLING DEPENDENCIES AND IMPORTING LIBRARIES*

In [None]:
!pip install sentence_transformers
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score




*MOUNTING DRIVE AND READING DATASET*

In [None]:
# Mount Google Drive so files stored there are readable under /content/drive.
# NOTE: google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the symptom/disease dataset from the mounted Drive folder.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Symptom2Disease.csv',
)

*EXPLORATORY DATA ANALYSIS*

In [None]:
# Preview the first five rows to see which columns the CSV provides.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [None]:
# Preview the last five rows of the dataset.
df.tail()

Unnamed: 0.1,Unnamed: 0,label,text
1195,295,diabetes,I'm shaking and trembling all over. I've lost ...
1196,296,diabetes,"Particularly in the crevices of my skin, I hav..."
1197,297,diabetes,I regularly experience these intense urges and...
1198,298,diabetes,"I have trouble breathing, especially outside. ..."
1199,299,diabetes,I constantly sneeze and have a dry cough. My i...


In [None]:
# Number of (rows, columns) in the dataset.
df.shape

(1200, 3)

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
# Data types and non-null counts are shown by df.info() in a later cell.
df.describe()

Unnamed: 0.1,Unnamed: 0
count,1200.0
mean,149.5
std,86.638166
min,0.0
25%,74.75
50%,149.5
75%,224.25
max,299.0


In [None]:
# Column names, non-null counts, dtypes and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1200 non-null   int64 
 1   label       1200 non-null   object
 2   text        1200 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.2+ KB


In [None]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
dtype: int64

In [None]:
# Look at row 0, column 1; the recorded output shows this was the label
# ('Psoriasis') at the time the cell ran, i.e. before any column was dropped.
df.iloc[0,1]

'Psoriasis'

*DATA CLEANING*

In [None]:
# Remove the redundant index column that was saved into the CSV.
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')


In [None]:
# Check the frame after the drop; the recorded output shows only the
# label and text columns remain.
df.head()

Unnamed: 0,label,text
0,Psoriasis,I have been experiencing a skin rash on my arm...
1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,Psoriasis,I have been experiencing joint pain in my fing...
3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,Psoriasis,"My nails have small dents or pits in them, and..."


*FEATURE ENGINEERING*


In [None]:
# Load the pretrained all-MiniLM-L6-v2 sentence encoder (fetched on first use,
# see the download log below). Later cells show it yields 384-dimensional vectors.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Embed every symptom description into a fixed-size vector.
# The column is selected by name rather than by position: position 1 is only
# the text column after 'Unnamed: 0' has been dropped, so a positional index
# silently encodes the wrong column if cells run in a different order.
features = model.encode(df['text'].tolist())

In [None]:
import pickle

# Persist the sentence encoder for the prediction pipeline further down.
# The context manager guarantees the file is flushed and closed.
with open('sentence_encoding.sav', 'wb') as encoder_file:
    pickle.dump(model, encoder_file)

*COSINE SIMILARITY*

In [None]:
# Embed the first symptom description as a single-row matrix, the 2-D shape
# cosine_similarity expects. The text column is picked by name so the cell
# does not depend on the current column order.
test_0 = model.encode(df['text'].iloc[0]).reshape(1, -1)
test_0.shape

(1, 384)

In [None]:
# Embed the description at row 444 as a single-row matrix for comparison with
# test_0. The text column is picked by name so the cell does not depend on
# the current column order.
test_1 = model.encode(df['text'].iloc[444]).reshape(1, -1)
test_1.shape

(1, 384)

In [None]:
# Cosine similarity between the two single-row embeddings; the result is a
# 1x1 array, with values closer to 1 meaning more similar vectors.
metrics.pairwise.cosine_similarity(test_0, test_1)

array([[0.18148915]], dtype=float32)

*CREATING LABELS AND TARGETS FOR CLASSIFICATION*

In [None]:
# Raw class labels (disease names) used as classification targets.
# After 'Unnamed: 0' is dropped, positional column 1 is the *text* column,
# the same column that is embedded as the features, so indexing by position
# made every symptom text its own class. Select the label column by name.
r_targets = df['label'].values

In [None]:
# Fit the encoder on the raw targets, then map each one to its integer class id.
le = LabelEncoder().fit(r_targets)
targets = le.transform(r_targets)

In [None]:
# Persist the fitted label encoder so predictions can be mapped back to
# disease names. The context manager guarantees the file is closed.
with open('label_encoder.sav', 'wb') as label_encoder_file:
    pickle.dump(le, label_encoder_file)

In [None]:
# Sanity check: the recorded output shows one 384-dimensional embedding per row.
features.shape

(1200, 384)

*TRAIN TEST SPLITTING*

In [None]:
# Hold out 25% of the samples for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, train_size=0.75, random_state=42
)

In [None]:
# Report the dimensions of each split: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(900, 384)
(300, 384)
(900,)
(300,)


*MODEL*

In [None]:
# Multi-layer perceptron on the sentence embeddings. random_state fixes the
# weight initialisation and shuffling so the reported scores are reproducible;
# max_iter is raised to 1000 to give the optimiser room to converge.
clf = MLPClassifier(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

In [None]:

# 3-nearest-neighbours classifier trained on the embedding vectors.
clf_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
clf_knn


In [None]:
# Support-vector classifier with default settings, trained on the embeddings.
clf_svm = svm.SVC().fit(X_train, y_train)
clf_svm


# MLP, SVM and KNN

## Model Evaluation

In [None]:
# Accuracy of the MLP on the training split.
accuracy_score(y_train, clf.predict(X_train))

In [None]:
# Accuracy of the MLP on the held-out test split.
accuracy_score(y_test, clf.predict(X_test))

In [None]:
# Accuracy of the SVM on the training split.
accuracy_score(y_train, clf_svm.predict(X_train))

In [None]:
# Accuracy of the SVM on the held-out test split.
accuracy_score(y_test, clf_svm.predict(X_test))

In [None]:
# Accuracy of the KNN classifier on the training split.
accuracy_score(y_train, clf_knn.predict(X_train))

In [None]:
# Accuracy of the KNN classifier on the held-out test split.
accuracy_score(y_test, clf_knn.predict(X_test))

In [62]:
# Save the trained MLP classifier for the prediction pipeline.
# The context manager guarantees the file is flushed and closed.
with open('classification_model.sav', 'wb') as classifier_file:
    pickle.dump(clf, classifier_file)

*CREATING A PIPELINE*

In [63]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code, so only load
    # artifacts that this notebook wrote itself.
    with open(path, 'rb') as artifact_file:
        return pickle.load(artifact_file)


def disease_classification(symptom_text):
    """Predict a disease label for a free-text symptom description.

    Loads the three artifacts saved earlier in the notebook (sentence encoder,
    classifier and label encoder) from the current working directory, embeds
    the text, classifies the embedding and maps the predicted class back to
    its original label.

    Parameters
    ----------
    symptom_text : str
        Description of the symptoms.

    Returns
    -------
    The first (and only) decoded label for ``symptom_text``.

    Raises
    ------
    FileNotFoundError
        If any of the saved artifacts is missing.
    """
    # Named `encoder` rather than `model` so the notebook-level `model`
    # variable is not shadowed inside the function.
    encoder = _load_pickle('sentence_encoding.sav')
    class_model = _load_pickle('classification_model.sav')
    label_encoder = _load_pickle('label_encoder.sav')

    temp_encoding = encoder.encode(symptom_text)
    # The classifier expects a batch, so wrap the single embedding in a list.
    temp_prediction = class_model.predict([temp_encoding])
    temp_label = label_encoder.inverse_transform(temp_prediction)

    return temp_label[0]

In [65]:
# Example free-text symptom description used to try out the pipeline.
symptom_text = 'Dry, thick, and raised patches on the skin are the most common sign of psoriasis. These patches are often covered with a silvery-white coating called scale, and they tend to itch.'

In [66]:
# Run the saved pipeline end to end on the example description.
disease_classification(symptom_text)

'Psoriasis'