In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Location of the zero-shot disambiguation results on the mounted Drive.
csv_path = "/content/drive/MyDrive/MACROBATT/FINAL_SURVIVOR_DEATH/SVM/DISAM/pretrained_zeroshot-disambiguation-results.csv"

# Read the comma-separated file into the working frame.
Corpus = pd.read_csv(
    csv_path,
    sep=',',
    encoding='latin-1',
)

# Preview the first rows as the cell output.
Corpus.head()

Unnamed: 0,Filename,Text,ACTUAL,PREDICTED
0,28353604_df.csv,A 69-year-old man became aware of the onset of...,survivor,survivor
1,28559815_df.csv,"A 34-year-old woman, 1 week postpartum, presen...",survivor,survivor
2,28538413_df.csv,A 63-year-old male patient without smoking or ...,survivor,survivor
3,28353588_df.csv,A 75-year-old man was referred to our hospital...,survivor,death
4,28353596_df.csv,A 47-year-old female patient presented progres...,survivor,survivor


In [None]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
Corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Filename   200 non-null    object
 1   Text       200 non-null    object
 2   ACTUAL     200 non-null    object
 3   PREDICTED  200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [None]:
import nltk

# Fetch the NLTK resources used by the tokenizer, POS tagger, lemmatizer
# and stop-word filter in this and the following cell.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# 1. Drop rows whose text is missing.
#    `Corpus['Text'].dropna(inplace=True)` only modified a Series extracted
#    from the frame and left `Corpus` untouched, so drop on the frame itself.
#    The index is reset so positions and labels stay aligned downstream.
Corpus = Corpus.dropna(subset=['Text']).reset_index(drop=True)
# 2. Keep an untouched copy of the text, then lowercase the working column.
Corpus['text_original'] = Corpus['Text']
Corpus['Text'] = [entry.lower() for entry in Corpus['Text']]
# 3. Tokenization: each entry in the corpus is broken into a list of words.
Corpus['Text'] = [word_tokenize(entry) for entry in Corpus['Text']]
# 4. Preparation for removing stop words and non-alphabetic tokens and for
#    lemmatizing (done in the next cell).
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective, etc. Any tag not listed below falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

Corpus.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Filename,Text,ACTUAL,PREDICTED,text_original
0,28353604_df.csv,"[a, 69-year-old, man, became, aware, of, the, ...",survivor,survivor,A 69-year-old man became aware of the onset of...
1,28559815_df.csv,"[a, 34-year-old, woman, ,, 1, week, postpartum...",survivor,survivor,"A 34-year-old woman, 1 week postpartum, presen..."
2,28538413_df.csv,"[a, 63-year-old, male, patient, without, smoki...",survivor,survivor,A 63-year-old male patient without smoking or ...
3,28353588_df.csv,"[a, 75-year-old, man, was, referred, to, our, ...",survivor,death,A 75-year-old man was referred to our hospital...
4,28353596_df.csv,"[a, 47-year-old, female, patient, presented, p...",survivor,survivor,A 47-year-old female patient presented progres...


In [None]:
# Build the stop-word lookup and the lemmatizer once. The original rebuilt
# the stop-word list for every token and the lemmatizer for every document.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

processed_texts = []
for tokens in Corpus['Text']:
    # pos_tag yields (word, tag) pairs; the first letter of the tag selects
    # the WordNet part of speech through tag_map (noun when unmapped).
    lemmas = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        # Keep only purely alphabetic tokens that are not stop words.
        if word.isalpha() and word not in stop_words
    ]
    processed_texts.append(str(lemmas))

# Assign the whole column at once. This is positional, so it does not rely
# on the frame having a default 0..n-1 index the way the per-row
# `Corpus.loc[index, ...]` write did.
Corpus['text_final'] = processed_texts

In [None]:
output_path = '/content/drive/MyDrive/MACROBATT/FINAL_SURVIVOR_DEATH/SVM/DISAM/preprocessed_svm.csv'
# DataFrame.drop returns a new frame. The original discarded that result,
# so the tokenized 'Text' column was still written to the CSV. Drop it on
# the way out; `Corpus` itself is left unchanged so this cell can be re-run.
Corpus.drop(columns=['Text']).to_csv(output_path, index=False)

In [None]:
# Display the frame after preprocessing (pandas elides the middle rows).
Corpus

Unnamed: 0,Filename,Text,ACTUAL,PREDICTED,text_original,text_final
0,28353604_df.csv,"[a, 69-year-old, man, became, aware, of, the, ...",survivor,survivor,A 69-year-old man became aware of the onset of...,"['man', 'become', 'aware', 'onset', 'exertiona..."
1,28559815_df.csv,"[a, 34-year-old, woman, ,, 1, week, postpartum...",survivor,survivor,"A 34-year-old woman, 1 week postpartum, presen...","['woman', 'week', 'postpartum', 'present', 'de..."
2,28538413_df.csv,"[a, 63-year-old, male, patient, without, smoki...",survivor,survivor,A 63-year-old male patient without smoking or ...,"['male', 'patient', 'without', 'smoking', 'dri..."
3,28353588_df.csv,"[a, 75-year-old, man, was, referred, to, our, ...",survivor,death,A 75-year-old man was referred to our hospital...,"['man', 'refer', 'hospital', 'evaluation', 'dy..."
4,28353596_df.csv,"[a, 47-year-old, female, patient, presented, p...",survivor,survivor,A 47-year-old female patient presented progres...,"['female', 'patient', 'present', 'progressivel..."
...,...,...,...,...,...,...
195,21477357_df.csv,"[a, 52-year-old, man, (, body, surface, area, ...",survivor,survivor,A 52-year-old man (body surface area: 1.3 m2) ...,"['man', 'body', 'surface', 'area', 'know', 'di..."
196,18258107_df.csv,"[here, ,, we, describe, another, case, in, a, ...",survivor,survivor,"Here, we describe another case in a 60-year-ol...","['describe', 'another', 'case', 'man', 'san', ..."
197,19860007_df.csv,"[a, 70-year-old, man, was, referred, to, our, ...",survivor,survivor,A 70-year-old man was referred to our hospital...,"['man', 'refer', 'hospital', 'gastric', 'cance..."
198,18236639_df.csv,"[a, 30-year-old, female, (, 65, kg, ), underwe...",survivor,survivor,A 30-year-old female (65 kg) underwent rhinopl...,"['female', 'kg', 'underwent', 'rhinoplasty', '..."


In [None]:
# Hold out 20% of the documents for evaluation.
# - random_state makes the split (and every metric below) reproducible.
# - stratify keeps the class ratio the same in both splits, which matters
#   because the labels are heavily imbalanced. It raises ValueError if a
#   class has fewer than two samples.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['ACTUAL'],
    test_size=0.2,
    stratify=Corpus['ACTUAL'],
    random_state=42,
)

In [None]:
# Map the string labels to integers.
Encoder = LabelEncoder()
# Learn the label -> integer mapping from the training labels only.
Train_Y = Encoder.fit_transform(Train_Y)
# Reuse that mapping for the test labels. Calling fit_transform here would
# refit the encoder on the test labels, and the integer codes could then
# differ from the training ones (e.g. when a class is absent from the test
# split). transform() raises ValueError on a label not seen during fit.
Test_Y = Encoder.transform(Test_Y)

In [None]:
# Turn the processed documents into TF-IDF vectors, keeping at most 5000 terms.
Tfidf_vect = TfidfVectorizer(max_features=5000)
# Fit the vocabulary and IDF weights on the training documents only.
# Fitting on the whole corpus lets test documents influence the features,
# which leaks test information into training.
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
# Project the test documents onto the vocabulary learned from training.
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

print(Tfidf_vect.vocabulary_)

{'man': 2333, 'become': 376, 'aware': 338, 'onset': 2692, 'exertional': 1450, 'dyspnea': 1268, 'july': 2169, 'history': 1853, 'smoke': 3887, 'year': 4983, 'cigarette': 624, 'per': 2841, 'day': 914, 'parent': 2797, 'brother': 477, 'lung': 2295, 'cancer': 506, 'past': 2809, 'medical': 2384, 'unremarkable': 4692, 'occupational': 2666, 'exposure': 1469, 'due': 1247, 'admit': 78, 'people': 2839, 'hospital': 1871, 'shunde': 3818, 'district': 1181, 'echocardiography': 1284, 'compute': 730, 'tomography': 4463, 'pulmonary': 3235, 'angiography': 190, 'indicate': 2001, 'severe': 3789, 'arterial': 264, 'hypertension': 1907, 'pressure': 3099, 'pap': 2781, 'right': 3580, 'heart': 1784, 'enlarge': 1361, 'ventricle': 4800, 'diameter': 1073, 'show': 3815, 'also': 138, 'thrombus': 4412, 'upper': 4705, 'lobe': 2271, 'artery': 266, 'however': 1877, 'region': 3418, 'affect': 95, 'embolism': 1331, 'consider': 772, 'small': 3885, 'induce': 2005, 'addition': 59, 'connective': 764, 'tissue': 4447, 'disease': 1

In [None]:
# Classifier - Algorithm - SVM
# Linear-kernel SVM. `degree` and `gamma` were dropped because they are
# ignored by the linear kernel. class_weight='balanced' re-weights classes
# inversely to their frequency so the rare class is not simply ignored
# (the recorded report shows zero recall for the minority class).
SVM = svm.SVC(C=1.0, kernel='linear', class_weight='balanced')
# Fit the classifier on the training features.
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict the labels of the held-out documents.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, but the
# conventional order avoids mistakes when other metrics are added.
# NOTE: with labels this imbalanced, accuracy alone is misleading; read it
# together with the per-class report.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM)*100)

SVM Accuracy Score ->  95.0


In [None]:
# Per-class precision / recall / F1 on the held-out documents.
# - labels/target_names show the original class names instead of 0/1 and
#   keep a row for every class the encoder knows.
# - zero_division=0 reports 0.0 for a class that is never predicted,
#   without emitting an undefined-metric warning for each average.
print(classification_report(
    Test_Y,
    predictions_SVM,
    labels=np.arange(len(Encoder.classes_)),
    target_names=list(Encoder.classes_),
    zero_division=0,
))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.95      1.00      0.97        38

    accuracy                           0.95        40
   macro avg       0.47      0.50      0.49        40
weighted avg       0.90      0.95      0.93        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
