# Install Required libraries (not finished changing names of models and tokenizer)

In [8]:
!pip install sacremoses transformers peft

Collecting peft
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from peft)
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.21.0 peft-0.4.0


# Import required libraries

In [5]:
from transformers import GPT2Tokenizer, AutoModel, pipeline
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import Data

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT-CC-2019-CORPUS-REDACTED.csv')

In [7]:
# Check for consensus and mark rows for removal
rows_to_drop = []
for index, row in df.iterrows():
    if row['Consensus'] == '-': #Drop unknown and unmarked
        rows_to_drop.append(index)

# Drop the marked rows
df.drop(rows_to_drop, inplace=True)

# Print the updated DataFrame
print(df)

                                       Chief Complaint Predict Consensus
1    "can't walk", reports onset at <<TIME>>. orien...       Y         N
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y
5    "I started breathing hard"  hx- htn, gout, anx...       N         N
6    "I think I have a gout flare up" L wrist pain ...       Y         Y
..                                                 ...     ...       ...
294  unwitnessed seizure last night, no dialysis in...       N         N
295  upper abd/R side chest pain x1 month, new onse...       N         N
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N
298  was getting prepped for colonoscopy and was se...       N         N
299  Was seen at <<HOSPITAL>> after an MVC. Pt stat...       N         N

[197 rows x 3 columns]


# Small data analysis

In [None]:
df.groupby("Consensus").size()
print("Predict is U")
print(df[df['Predict'] == 'U'].groupby('Consensus').size())
print("-----------------------------------------------------------------")
print("Predict is N")
print(df[df['Predict'] == 'N'].groupby('Consensus').size())
print("-----------------------------------------------------------------")
print("Predict is Y")
print(df[df['Predict'] == 'Y'].groupby('Consensus').size())
print("-----------------------------------------------------------------")
print("Predict is -")
print(df[df['Predict'] == '-'].groupby('Consensus').size())
print("-----------------------------------------------------------------")


Predict is U
Consensus
-      2
N    127
U     12
Y     15
dtype: int64
-----------------------------------------------------------------
Predict is N
Consensus
-    7955
N     201
U       4
Y       8
dtype: int64
-----------------------------------------------------------------
Predict is Y
Consensus
-    17
N    22
Y    72
dtype: int64
-----------------------------------------------------------------
Predict is -
Consensus
-    2
dtype: int64
-----------------------------------------------------------------


# Load Model and tokenizer (BioGPT)

In [10]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    prepare_model_for_int8_training,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PeftConfig,
    PeftModel,

)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


peft_model_id = "Lukee4/biomedlm-2019"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModel.from_pretrained('stanford-crfm/BioMedLM')

tokenizer = GPT2Tokenizer.from_pretrained('stanford-crfm/BioMedLM')

# Load the Lora model
#inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/876 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/602k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/276k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

In [11]:
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)



Downloading adapter_model.bin:   0%|          | 0.00/10.5M [00:00<?, ?B/s]

In [None]:
#clear GPU cache
#torch.cuda.empty_cache()

## Define the pipeline

In [None]:
# Create the pipeline
p = pipeline(
    task="feature-extraction",
    tokenizer="stanford-crfm/BioMedLM",
    model="stanford-crfm/BioMedLM",
    framework="pt",
    device=0,  # use CUDA with 0
)

Some weights of the model checkpoint at stanford-crfm/BioMedLM were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Get the embeddings of the last token of the last hidden state

In [None]:
def extract_last_token(last_hidden_states):
    last_hidden_states = np.array(last_hidden_states)
    return last_hidden_states[:,-1,:]

# Process the data using the pipeline
results = p([row["Chief Complaint"] for _, row in df.iterrows()])

# Extract the last token of the last hidden state
embeddings = [extract_last_token(last_hidden_state ) for last_hidden_state  in results]

In [None]:
# Assign the reshaped embeddings to the "embeddings" column in the DataFrame
df["emb_Stanford_no_FineT"] = embeddings

# Print the resulting DataFrame
print(df)

                                       Chief Complaint Predict Consensus  \
0    "been feeling bad" last 2 weeks & switched BP ...       N         -   
1    "can't walk", reports onset at <<TIME>>. orien...       Y         N   
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y   
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y   
4    "heart racing,"dyspnea, and orthopnea that has...       N         -   
..                                                 ...     ...       ...   
295  upper abd/R side chest pain x1 month, new onse...       N         N   
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N   
297  walked outside of a gas station and began bein...       N         -   
298  was getting prepped for colonoscopy and was se...       N         N   
299  Was seen at <<HOSPITAL>> after an MVC. Pt stat...       N         N   

                                   emb_biogpt_no_FineT  \
0    [[0.4998164773, -1.14541

## Save to csv ot txt

In [None]:
df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.json', orient='records')
#np.savetxt(r"/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT_with_emb_19/20.txt", df, fmt='%s')

# Normalize features https://discuss.huggingface.co/t/finetuning-for-feature-extraction-i-e-unsupervised-fine-tuning/12595/*4*

## Extract the embeddings using mean pooling

In [None]:
def mean_pooling(last_hidden_states, ):
    last_4_layers = last_hidden_states[-12:]  # Consider the last 4 layers
    return np.mean(last_4_layers, axis=1)

# Process the data using the pipeline
results = p([row["text"] for _, row in df2.iterrows()])

features = np.squeeze(results)

print(features.shape)
# Perform mean pooling on the last hidden states
embeddings = [mean_pooling(hidden_state) for hidden_state in results]

# Create a DataFrame to store the results
df2["embeddings12"] = embeddings

# Print the resulting DataFrame
print(df2)