# Install Required libraries

In [1]:
!pip install sacremoses transformers

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sa

# Import required libraries

In [4]:
from transformers import BioGptTokenizer, BioGptForSequenceClassification, pipeline, AutoModel
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import Data

In [3]:
# Read the corpus CSV into a DataFrame. The path points into /content/drive
# (presumably a mounted Google Drive — adjust when running elsewhere).
corpus_csv = '/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT-CC-2020-CORPUS-REDACTED.csv'
df = pd.read_csv(corpus_csv)
# Disabled on purpose: uncomment to discard the "Predict" column.
#df = df.drop("Predict", axis=1)

In [8]:
# Keep only the rows whose 'Consensus' value is not '-' (the original note
# describes '-' rows as "unknown and unmarked").
# A boolean mask replaces the row-by-row iterrows() loop: it is vectorised and
# selects rows by position, so it stays correct even if index labels repeat
# (dropping by label would remove every row sharing a label).
has_consensus = df['Consensus'] != '-'

# .copy() gives an independent frame so columns can be added to it later
# without pandas warning about writing to a slice.
df = df.loc[has_consensus].copy()

# Print the updated DataFrame
print(df)

                                        Chief Complaint Predict Consensus
2     Altercation while making arrest, c/o R hand pa...       N         N
3     Cut on L upper thigh wtih saw. Bleeding contro...       N         N
7     Generalized pain all over. Patient not answeri...       N         N
13    R knee pain and swelling x2 weeks. "I've just ...       U         N
14    R wrist pain/swelling/numbness since this morn...       U         N
...                                                 ...     ...       ...
8124  sob and right chest pain x 1 weeks  - hx of mu...       N         N
8125  starts in lower back and goes right  legs x1 m...       N         N
8127  sciatica pain R lower back radiating to R groi...       N         N
8132  stepped on a nail at home with right foot, pai...       N         N
8136  Rash/sores across body, infection ro left thum...       N         N

[264 rows x 3 columns]


# Small data analysis

In [9]:
# Row counts per 'Consensus' value, first for the whole frame and then within
# each value of the 'Predict' column. A value with no rows prints an empty Series.
divider = "-----------------------------------------------------------------"

print("group by consensus")
print(df.groupby("Consensus").size())
print(divider)

for predict_value in ("U", "N", "Y", "-"):
    print(f"Predict is {predict_value}")
    print(df[df['Predict'] == predict_value].groupby('Consensus').size())
    print(divider)


group by consensus
Consensus
N    232
U      7
Y     25
dtype: int64
-----------------------------------------------------------------
Predict is U
Consensus
N    111
U      7
Y      9
dtype: int64
-----------------------------------------------------------------
Predict is N
Consensus
N    116
Y      7
dtype: int64
-----------------------------------------------------------------
Predict is Y
Consensus
N    5
Y    9
dtype: int64
-----------------------------------------------------------------
Predict is -
Series([], dtype: int64)
-----------------------------------------------------------------


# Load Model and tokenizer (BioGPT)

In [5]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# NOTE(review): the earlier note on this cell said that GPT-2-style models with
# absolute position embeddings are usually padded on the right, yet the tokenizer
# is configured for LEFT padding here. Left padding keeps the final sequence
# position a real token, which is what the last-token embedding extraction later
# in this notebook reads — confirm that this is the intended choice.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
tokenizer.padding_side = "left"

# Load the pretrained BioGPT weights and move the model to the selected device.
model = AutoModel.from_pretrained("microsoft/biogpt").to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

In [None]:
# TODO: oversample 2019 and get result
# TODO: class weight
# TODO: fine tune

In [None]:
#clear GPU cache
#torch.cuda.empty_cache()

## Define the pipeline

In [10]:
# Create the feature-extraction pipeline.
# The tokenizer and model objects loaded earlier in this notebook are passed in
# directly, so the BioGPT checkpoint is not loaded a second time from its name.
# The device index falls back to -1 (CPU) when CUDA is not available instead of
# always requesting GPU 0, which fails on CPU-only runtimes.
p = pipeline(
    task="feature-extraction",
    tokenizer=tokenizer,
    model=model,
    framework="pt",
    device=0 if torch.cuda.is_available() else -1,  # 0 = first CUDA device, -1 = CPU
)

### Get the embeddings of the last token of the last hidden state

In [11]:
def extract_last_token(last_hidden_states):
    """Return the slice at the last index of axis 1 of ``last_hidden_states``.

    The input is converted to a NumPy array and indexed with ``[:, -1, :]``,
    so it must be array-like with at least three dimensions. The first and
    third axes are kept as they are.

    NOTE(review): callers in this notebook pass pipeline outputs, presumably
    shaped (1, tokens, hidden) — confirm against the pipeline's return format.
    """
    states = np.array(last_hidden_states)
    final_position = states[:, -1, :]
    return final_position

# Feed every "Chief Complaint" text through the pipeline, in DataFrame row order.
complaints = df["Chief Complaint"].tolist()
results = p(complaints)

# Keep only the last-token slice of each returned hidden state.
embeddings = list(map(extract_last_token, results))

In [12]:
# Store one embedding per row in a new column. `embeddings` was built from the
# rows of df in order, so the list lines up with the frame row for row.
embedding_column = "emb_biogpt_no_FineT"
df[embedding_column] = embeddings

# Show the frame including the new column.
print(df)

                                        Chief Complaint Predict Consensus  \
2     Altercation while making arrest, c/o R hand pa...       N         N   
3     Cut on L upper thigh wtih saw. Bleeding contro...       N         N   
7     Generalized pain all over. Patient not answeri...       N         N   
13    R knee pain and swelling x2 weeks. "I've just ...       U         N   
14    R wrist pain/swelling/numbness since this morn...       U         N   
...                                                 ...     ...       ...   
8124  sob and right chest pain x 1 weeks  - hx of mu...       N         N   
8125  starts in lower back and goes right  legs x1 m...       N         N   
8127  sciatica pain R lower back radiating to R groi...       N         N   
8132  stepped on a nail at home with right foot, pai...       N         N   
8136  Rash/sores across body, infection ro left thum...       N         N   

                                    emb_biogpt_no_FineT  
2     [[0.0141769

## Save to JSON

In [13]:
# Write the frame, including the embedding column, as JSON with one record per row.
output_json = '/content/drive/MyDrive/Colab Notebooks/dissertation/data/2020_noFT.json'
df.to_json(output_json, orient='records')