# Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!cp /content/drive/MyDrive/Assignment2.zip /content # reading data from drive

In [2]:
%%capture
!unzip /content/Assignment2.zip -d dataset # mounting data

# Task 1
Extract the ‘text’ in all the CSV files and store them into a single ‘.txt file’.

In [3]:
import pandas as pd
import os

# Folder path for csv files
folder_path = '/content/dataset/csv'

# output folder path for saving extracted text file from csv
output_file = '/content/drive/MyDrive/Assignment2/combined_text.txt'

# read csv files
with open(output_file, 'w', encoding='utf-8') as txt_file:
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {file_path}")
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            # Printing the column names to identify the text column in all csv files
            print(f"Columns in {filename}: {df.columns.tolist()}")

print("Inspection complete")


Processing file: /content/dataset/csv/CSV1.csv
Columns in CSV1.csv: ['Unnamed: 0', 'HADM_ID', 'SHORT-TEXT', 'ICD9_CODE', 'ICD9', 'Label']
Processing file: /content/dataset/csv/CSV2.csv
Columns in CSV2.csv: ['Unnamed: 0', 'HADM_ID', 'TEXT', 'LABLE', 'entites', 'group']
Processing file: /content/dataset/csv/CSV3.csv
Columns in CSV3.csv: ['HADM_ID', 'TEXT', 'ICD9_CODE', 'Label']
Processing file: /content/dataset/csv/CSV4.csv
Columns in CSV4.csv: ['HADM_ID', 'TEXT', 'LABLE']
Inspection complete


In [4]:
import pandas as pd
import os
folder_path = '/content/dataset/csv'
output_file = '/content/drive/MyDrive/Assignment2/combined_text.txt' #output folder for save .txt file in directory
total_texts = 0
with open(output_file, 'w', encoding='utf-8') as txt_file:
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            print(f"Processing file: {file_path}")
            try:
                df = pd.read_csv(file_path)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue
            if 'TEXT' in df.columns:
                text_column = 'TEXT'
            elif 'SHORT-TEXT' in df.columns:
                text_column = 'SHORT-TEXT'
            else:
                print(f"No relevant text column found in {filename}")
                continue
            for text in df[text_column]:
                txt_file.write(f"{text}\n")
                total_texts += 1

print(f"All text combined into {output_file}")
print(f"Total number of texts written: {total_texts}")


Processing file: /content/dataset/csv/CSV1.csv
Processing file: /content/dataset/csv/CSV2.csv
Processing file: /content/dataset/csv/CSV3.csv
Processing file: /content/dataset/csv/CSV4.csv
All text combined into /content/drive/MyDrive/Assignment2/combined_text.txt
Total number of texts written: 69582


# Task:2 Research
Install the libraries(SpaCy – scispaCy – ‘en_core_sci_sm’/’en_ner_bc5cdr_md’).
Install the libraries (Transformers (Hugging Face) - and any bio-medical model
(BioBert) that can detect drugs, diseases, etc from the text).

In [3]:
!pip install torch==2.4.0



In [4]:
!pip install torch==2.0.0+cu117 torchvision==0.15.0+cu117 --index-url https://download.pytorch.org/whl/cu117


Looking in indexes: https://download.pytorch.org/whl/cu117
Collecting torch==2.0.0+cu117
  Downloading https://download.pytorch.org/whl/cu117/torch-2.0.0%2Bcu117-cp310-cp310-linux_x86_64.whl (1843.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m?[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.0+cu117
  Downloading https://download.pytorch.org/whl/cu117/torchvision-0.15.0%2Bcu117-cp310-cp310-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.0.0 (from torch==2.0.0+cu117)
  Downloading https://download.pytorch.org/whl/triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (63.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting cmake (from triton==2.0.0->torch==2.0.0+cu117)
  Downloading https://download.pytorch.org/wh

In [5]:
!pip install spacy
!pip install transformers



In [6]:
!pip install scispacy


Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl.metadata (16 kB)
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m905.9 kB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pybind11<2.6.2 (from nmslib>=1.7.3.6->scispacy)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl.metadata (8.7 kB)
Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━

### library for en_core_sci_sm

In [7]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.3.0,>=3.2.3 (from en_core_sci_sm==0.5.0)
  Downloading spacy-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)
Collecting thinc<8.1.0,>=8.0.12 (from spacy<3.3.0,>=3.2.3->en_core_sci_sm==0.5.0)
  Downloading thinc-8.0.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wasabi<1.1.0,>=0.8.1 (from spacy<3.3.0,>=3.2.3->en_core_sci_sm==0.5.0)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)
Collecting typer<0.5.0,>=0.3.0 (from spacy<3.3.0,>=3.2.3->en_core_sci_sm==0.5.0)
  Downloading typer-0.4.

### en_ner_bc5dr_md

In [8]:
pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz (120.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.2/120.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: en_ner_bc5cdr_md
  Building wheel for en_ner_bc5cdr_md (setup.py) ... [?25l[?25hdone
  Created wheel for en_ner_bc5cdr_md: filename=en_ner_bc5cdr_md-0.5.0-py3-none-any.whl size=120215835 sha256=2a800ac507210214725307d4498ce4d4018f2451dcad43fd1f49d74f98464d39
  Stored in directory: /root/.cache/pip/wheels/44/e8/99/517b2d53bb44945cf7a96208d44bae722e13f028736a1f1f4f
Successfully built en_ner_bc5cdr_md
Installing collected packages: en_ner_bc5cdr_md
Successfully installed en_ner_bc5cdr_md-0.5.0


In [9]:
import spacy
import scispacy
import en_core_sci_sm
import en_ner_bc5cdr_md
# Load the models
nlp_sci = spacy.load("en_core_sci_sm")
nlp_bc5cdr = spacy.load("en_ner_bc5cdr_md")


In [10]:
!pip install transformers




# Task 3: Programming and Research
### [3.1:](https://)
Using any in-built library present in Python, count the occurrences of the words
in the text (.txt) and give the ‘Top 30’ most common words.
And store the ‘Top 30’ common words and their counts into a CSV file.

In [11]:
from collections import Counter
import re

# Read the combined text from the .txt file extracted from csv files
with open('/content/drive/MyDrive/Assignment2/combined_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into words
words = re.findall(r'\b\w+\b', text.lower())

# Count the number of words
word_counts = Counter(words)

# Get the top 30 most common words
top_30_words = word_counts.most_common(30)
# Save the results to a CSV file into directory
import csv

with open('/content/drive/MyDrive/Assignment2/top_30_words.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Word', 'Count'])
    writer.writerows(top_30_words)

print("Top 30 most common words saved to 'top_30_words.csv'.")

Top 30 most common words saved to 'top_30_words.csv'.


### 3.2:
Using the ‘Auto Tokenizer’ function in the ‘Transformers’ library, write a
‘function’ to count unique tokens in the text (.txt) and give the ‘Top 30’ words.


In [12]:
from transformers import AutoTokenizer
from collections import Counter

#Count unique tokens using AutoTokenizer

#Load the AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

#Tokenize the text using the tokenizer
tokens = tokenizer.tokenize(text)

#Count the tokens
token_counts = Counter(tokens)

#Get the top 30 most common tokens
top_30_tokens = token_counts.most_common(30)

#Display the results
print("Top 30 tokens using AutoTokenizer:", top_30_tokens)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (202035206 > 512). Running this sequence through the model will result in indexing errors


Top 30 tokens using AutoTokenizer: [('*', 12214251), ('.', 8947097), ('-', 6203884), (':', 4059494), (',', 3666144), ('[', 2778940), (']', 2778846), ('the', 2672269), ('and', 2360045), (')', 2241033), ('(', 2194265), ('to', 2019939), ('of', 1969060), ('was', 1876789), ('with', 1404979), ('1', 1325148), ('in', 1305933), ('on', 1270509), ('a', 1266928), ('/', 1083591), ('2', 1066626), ('for', 1002471), ('name', 950959), ('##s', 867928), ('no', 846387), ('is', 811070), ('mg', 801453), ('##t', 782649), ('##g', 745859), ('he', 742229)]


# Task 4: Named-Entity Recognition (NER)
### Extract the ‘diseases’, and ‘drugs’ entities in the ‘.txt file’ separately using ‘en_core_sci_sm’/’en_ner_bc5cdr_md’ and biobert. And compare the differences between the two models (Example: Total entities detected by both of them, what’s the difference, check for most common words, and check the difference.)

In [15]:
#Function for split text into smaller chunks
def split_text(text, chunk_size=100000):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Read the combined text file
text_file_path = '/content/drive/MyDrive/Assignment2/combined_text.txt'  # Replace with the actual path
with open(text_file_path, 'r') as f:
    text = f.read()

# Split the text into chunks of manageable size (500,000 characters per chunk)
chunks = split_text(text, chunk_size=500000)


In [None]:
import spacy
import pandas as pd

# Load SciSpaCy models
nlp_sci = spacy.load("en_core_sci_sm")  # General scientific text model
nlp_ner = spacy.load("en_ner_bc5cdr_md")  # Model for detecting diseases and drugs

# Increase the max_length limit to handle long text
nlp_sci.max_length = max([len(chunk) for chunk in chunks]) + 1000
nlp_ner.max_length = max([len(chunk) for chunk in chunks]) + 1000

# Extract diseases and drugs from all chunks
diseases_drugs_sci = []

for chunk in chunks:
    doc_ner_chunk = nlp_ner(chunk)
    diseases_drugs_sci.extend([(ent.text, ent.label_) for ent in doc_ner_chunk.ents if ent.label_ in ['DISEASE', 'CHEMICAL']])

# Save the SciSpaCy results to a CSV
df_entities_sci = pd.DataFrame(diseases_drugs_sci, columns=['Entity', 'Label'])
df_entities_sci.to_csv('/content/drive/MyDrive/Assignment2/spacy_diseases_drugs.csv', index=False)

print("Diseases and drugs extracted using SciSpaCy saved to 'spacy_diseases_drugs.csv'.")


### extract diseases and drugs using the BioBERT model from Hugging Face to detect diseases and drugs.

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import pandas as pd

#Load BioBERT for NER
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
model = AutoModelForTokenClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

#Create NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

#Extract diseases and drugs using BioBERT from all chunks
bio_entities = []
for chunk in chunks:
    bio_entities.extend(ner(chunk))

#Filter only 'disease' and 'drug' related entities
bio_diseases_drugs = [(entity['word'], entity['entity']) for entity in bio_entities if 'disease' in entity['entity'].lower() or 'drug' in entity['entity'].lower()]

#Save the BioBERT results to a CSV
df_entities_bio = pd.DataFrame(bio_diseases_drugs, columns=['Entity', 'Label'])
df_entities_bio.to_csv('/content/drive/MyDrive/Assignment2/biobert_diseases_drugs.csv', index=False)

print("Diseases and drugs extracted using BioBERT saved to 'biobert_diseases_drugs.csv'.")


### Now we have two sets of results spacy_diseases_drugs.csv (SciSpaCy) and biobert_diseases_drugs.csv (BioBERT), we can compare them. Now The comparison will include following

1.Total entities detected by each model.
2.Common entities detected by both models.
3.Entities unique to each model.
4.Most common entities detected by both models and analysis of differences.



In [None]:
# Load both sets of results
df_sci = pd.read_csv('/content/drive/MyDrive/Assignment2/spacy_diseases_drugs.csv')
df_bio = pd.read_csv('/content/drive/MyDrive/Assignment2/biobert_diseases_drugs.csv')

#Convert entity lists to sets for comparison
set_sci = set(df_sci['Entity'])
set_bio = set(df_bio['Entity'])

#Total entities detected
total_sci = len(set_sci)
total_bio = len(set_bio)

#Common entities
common_entities = set_sci.intersection(set_bio)

#Unique entities
unique_sci = set_sci - set_bio
unique_bio = set_bio - set_sci

#Display comparison results
print(f"Total entities detected by SciSpaCy: {total_sci}")
print(f"Total entities detected by BioBERT: {total_bio}")
print(f"Number of common entities: {len(common_entities)}")

#Save the comparison results to CSV files
pd.DataFrame(list(common_entities), columns=['Common Entities']).to_csv('/content/drive/MyDrive/Assignment2/common_entities.csv', index=False)
pd.DataFrame(list(unique_sci), columns=['Unique SciSpaCy Entities']).to_csv('/content/drive/MyDrive/Assignment2/unique_sci_entities.csv', index=False)
pd.DataFrame(list(unique_bio), columns=['Unique BioBERT Entities']).to_csv('/content/drive/MyDrive/Assignment2/unique_bio_entities.csv', index=False)

print("Comparison results saved: 'common_entities.csv', 'unique_sci_entities.csv', 'unique_bio_entities.csv'.")
