# Medical Recognition

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset is read from, and the pipeline is exported to,
# paths under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

# Reading the dataset

In [None]:
# Tab-separated dataset stored on the mounted Drive.
DATASET_PATH = '/content/drive/MyDrive/IE/NLP/Final_Synthetic_Dataset.tsv'
data = pd.read_csv(DATASET_PATH, sep='\t')

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Drop rows that lack a comment or a label BEFORE casting to str: astype(str)
# would otherwise turn a missing value into the literal string 'nan', which
# would then be treated as a real comment / a real disease class.
text_columns = ['patient_comment', 'identified_disease']
data = (
    data
    .dropna(subset=text_columns)
    .astype({column: str for column in text_columns})
    .reset_index(drop=True)  # keep a clean 0..n-1 index after dropping rows
)

# Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

# Keep the original disease names in a separate column the first time this cell
# runs. The encoder is always fitted on that column, so re-running the cell
# re-fits it on the names instead of on the integer codes written below.
if 'identified_disease_raw' not in data.columns:
    data['identified_disease_raw'] = data['identified_disease']

# Encode labels: from here on 'identified_disease' holds integer class ids and
# label_encoder maps them back to the disease names.
label_encoder = LabelEncoder()
data['identified_disease'] = label_encoder.fit_transform(data['identified_disease_raw'])


In [None]:
import joblib

# Persist the fitted encoder so integer predictions can be decoded later.
# The path is relative, so the file lands in the current working directory.
LABEL_ENCODER_PATH = 'label_encoder.pkl'
joblib.dump(label_encoder, LABEL_ENCODER_PATH)

['label_encoder.pkl']

# Model Training and pipeline creation

In [None]:
import pandas as pd
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the spaCy model once at notebook level; KeywordExtractor below uses this
# shared `nlp` object for every text it processes.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)


class KeywordExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that reduces each text to its nouns and proper nouns.

    Every text is processed with the notebook-level spaCy pipeline ``nlp``;
    tokens tagged ``NOUN`` or ``PROPN`` are kept and joined with single spaces.
    The transformer is stateless: ``fit`` learns nothing from the data.
    """

    # Coarse part-of-speech tags whose tokens are kept as keywords.
    KEYWORD_POS = frozenset({'NOUN', 'PROPN'})

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the Pipeline protocol)."""
        return self

    def transform(self, X, y=None):
        """Extract the keywords of every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or iterable of str
            Texts to process. Lists, tuples, numpy arrays and other iterables
            are converted to a Series; a Series keeps its index and name.
        y : ignored

        Returns
        -------
        pandas.Series of str
            One space-joined keyword string per input text, aligned with ``X``.

        Raises
        ------
        ValueError
            If ``X`` is a single string rather than a collection of strings.
        """
        if isinstance(X, str):
            # A bare string would be iterated character by character.
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )
        if not isinstance(X, pd.Series):
            X = pd.Series(list(X), dtype=object)

        # nlp.pipe processes the texts in batches, which is considerably faster
        # than calling nlp() once per row; documents come back in input order.
        docs = nlp.pipe(X.tolist())
        keywords = [self._keywords_from_doc(doc) for doc in docs]
        return pd.Series(keywords, index=X.index, name=X.name, dtype=object)

    def extract_keywords(self, text):
        """Return the space-joined NOUN/PROPN tokens of a single text."""
        return self._keywords_from_doc(nlp(text))

    @staticmethod
    def _keywords_from_doc(doc):
        """Join the text of all tokens in ``doc`` whose POS tag is a keyword tag."""
        return ' '.join(
            token.text for token in doc
            if token.pos_ in KeywordExtractor.KEYWORD_POS
        )


# Inputs are the free-text patient comments; targets are the disease labels.
X, y = data['patient_comment'], data['identified_disease']

# Data splitting: hold out 20% of the rows for evaluation, with a fixed seed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline stages, in execution order:
#   keywords   - keep only nouns / proper nouns of each comment
#   tfidf      - turn the keyword strings into TF-IDF vectors
#   classifier - logistic regression over those vectors
pipeline_steps = [
    ('keywords', KeywordExtractor()),
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit all stages on the training split.
pipeline.fit(X_train, y_train)

# Prediction on the held-out split
y_pred = pipeline.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports 0 for those (the same value as the default) without
# emitting an UndefinedMetricWarning for each of them.
report = classification_report(y_test, y_pred, zero_division=0)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

# Use case: classify a single, already keyword-like description.
new_symptoms = ['Patient year office worker complaints headache']
predicted_disease = pipeline.predict(new_symptoms)

# The pipeline was trained on encoded labels, so it predicts integer class ids;
# map the id back to the disease name before showing it.
predicted_name = label_encoder.inverse_transform(predicted_disease)[0]

print(f'Predicted Disease: {predicted_name}')


Accuracy: 0.99775
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       318
           1       1.00      1.00      1.00        47
           2       1.00      1.00      1.00       222
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        38
           5       0.00      0.00      0.00         1
           6       1.00      1.00      1.00      2531
           7       1.00      1.00      1.00        44
           8       1.00      1.00      1.00        26
           9       1.00      1.00      1.00        12
          10       0.98      1.00      0.99        43
          11       0.00      0.00      0.00         1
          12       1.00      1.00      1.00        12
          13       0.00      0.00      0.00         1
          14       1.00      1.00      1.00       418
          15       0.00      0.00      0.00         1
          16       0.81      1.00      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the column labels of the dataset.
data.columns

Index(['hospital_name', 'hospital_id', 'longitude', 'latitude',
       'identified_disease', 'patient_id', 'patient_comment', 'symptoms',
       'reported_timestamp', 'month'],
      dtype='object')

# Exporting pipeline

In [None]:
# Export the fitted pipeline to Drive.
# NOTE: the pipeline contains KeywordExtractor, which is defined in this notebook
# and uses the notebook-level `nlp`; pickle stores classes by reference, so any
# code that loads this file must define both before calling joblib.load.
PIPELINE_PATH = '/content/drive/MyDrive/IE/NLP/disease_prediction_pipeline.pkl'
joblib.dump(pipeline, PIPELINE_PATH)

['/content/drive/MyDrive/IE/NLP/disease_prediction_pipeline.pkl']

In [None]:
pip freeze > requirements.txt


In [None]:
import joblib

# Restore the exported pipeline from Drive. joblib.load unpickles the file, so
# only load files that come from a trusted source (here: this notebook's export).
pipeline = joblib.load('/content/drive/MyDrive/IE/NLP/disease_prediction_pipeline.pkl')

# Free-text description to classify, wrapped in a one-element list because the
# pipeline works on collections of texts.
new_symptoms = [
    "Patient, 35-year-old female, presents with fatigue and weakness that have been progressively worsening over the past few months. She reports feeling lightheaded when standing up quickly and shortness of breath with minimal exertion. Additionally, she has been experiencing heart palpitations and occasional dizziness. Upon further questioning, she mentions a history of heavy menstrual periods for the last year but states that they have become even heavier in the past few months. She denies any recent weight loss or changes in appetite. Family history is significant for anemia in her mother. On physical examination, patient appears pale with conjunctival pallor noted. Vital signs reveal tachycardia and low blood pressure when standing up from sitting position (postural hypotension)",
]

# Predict the encoded class id and map it straight back to the disease name.
predicted_disease = label_encoder.inverse_transform(pipeline.predict(new_symptoms))

print(f'Predicted Disease: {predicted_disease}')


Predicted Disease: ['Anemia']


In [None]:
# Show the stored comment of row 1999 for comparison with the text classified above.
data.at[1999, 'patient_comment']

'Patient, 35-year-old female, presents with fatigue and weakness that have been progressively worsening over the past few months. She reports feeling lightheaded when standing up quickly and shortness of breath with minimal exertion. Additionally, she has been experiencing heart palpitations and occasional dizziness.\n\nUpon further questioning, she mentions a history of heavy menstrual periods for the last year but states that they have become even heavier in the past few months. She denies any recent weight loss or changes in appetite. Family history is significant for anemia in her mother.\n\nOn physical examination, patient appears pale with conjunctival pallor noted. Vital signs reveal tachycardia and low blood pressure when standing up from sitting position (postural hypotension'

In [None]:
# Show the label stored for row 1999 (an encoded class id once the label-encoding cell has run).
data.at[1999, 'identified_disease']

1