# Install required libraries

In [None]:
# Install sacremoses, transformers and sentencepiece into the runtime. Versions are not pinned;
# the recorded run resolved sacremoses 0.0.53, transformers 4.30.2 and sentencepiece 0.1.99.
!pip install sacremoses transformers sentencepiece

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/880.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m109.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

# Import required libraries

In [None]:
from transformers import LlamaTokenizer, LlamaModel, pipeline
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import Data

In [None]:
# Load the dataset from a JSON file (path presumably on a mounted Google Drive -- verify).
# NOTE(review): the final cell of this notebook writes the DataFrame back to this same
# path, so any rows dropped in between are removed from the file too -- confirm intended.
df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/2020_noFT.json')
# Disabled: drop the "Predict" column and append the redacted GOUT-CC-2020 CSV corpus.
#df = df.drop("Predict", axis=1)
#df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT-CC-2020-CORPUS-REDACTED.csv')
#df2 = df2.drop("Predict", axis=1)

#df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Chief Complaint,Predict,Consensus,emb_biogpt_no_FineT,emb_Stanford_no_FineT
0,"""been feeling bad"" last 2 weeks & switched BP ...",N,-,"[[0.4998164773, -1.1454148293, -1.752535819999...","[[-2.0845830441, 2.0618994236, -0.5788192153, ..."
1,"""can't walk"", reports onset at <<TIME>>. orien...",Y,N,"[[-0.9474165440000001, -0.1887536049, -0.06674...","[[0.5856236815, 1.2349587679, -0.2313203514, -..."
2,"""dehydration"" Chest hurts, hips hurt, cramps P...",Y,Y,"[[-0.7398123741, 0.8122399449000001, -0.097419...","[[-1.4779242277, 2.5207219124, -2.1727879047, ..."
3,"""gout flare up"" L arm swelling x 1 week. denie...",Y,Y,"[[-0.3572742343, 0.6445433497, 1.5928075314, -...","[[-0.8846961856000001, -1.709597826, -1.948533..."
4,"""heart racing,""dyspnea, and orthopnea that has...",N,-,"[[0.0717470348, 1.4709169865, -1.7467554808, -...","[[0.5032715797, 0.2582928836, 2.9334533215, -1..."


In [None]:
# Keep only rows that carry a consensus label; '-' marks unknown/unmarked cases.
# A boolean mask selects rows by position rather than by index label, so it stays
# correct even if index labels repeat, and avoids a Python-level pass over every row.
# .copy() yields an independent frame so that columns added later are not written
# into a slice of the original.
df = df.loc[df['Consensus'] != '-'].copy()

# Print the updated DataFrame
print(df)

                                        Chief Complaint Consensus
1     "can't walk", reports onset at <<TIME>>. orien...         N
2     "dehydration" Chest hurts, hips hurt, cramps P...         Y
3     "gout flare up" L arm swelling x 1 week. denie...         Y
5     "I started breathing hard"  hx- htn, gout, anx...         N
6     "I think I have a gout flare up" L wrist pain ...         Y
...                                                 ...       ...
8424  sob and right chest pain x 1 weeks  - hx of mu...         N
8425  starts in lower back and goes right  legs x1 m...         N
8427  sciatica pain R lower back radiating to R groi...         N
8432  stepped on a nail at home with right foot, pai...         N
8436  Rash/sores across body, infection ro left thum...         N

[445 rows x 2 columns]


# Small data analysis

# Load model and tokenizer (PMC-LLaMA 7B)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer for the PMC_LLAMA_7B checkpoint and pad on the left.
# NOTE(review): GPT-2's right-padding advice (absolute position embeddings) concerns a
# different model than the LLaMA checkpoint loaded here. Left padding presumably keeps
# the final position a real token for the last-token extraction further down if inputs
# are ever batched -- confirm.
tokenizer = LlamaTokenizer.from_pretrained("chaoyi-wu/PMC_LLAMA_7B")
tokenizer.padding_side = "left"

# Load the base LlamaModel; the load log reports 'lm_head.weight' as unused for this class.
model = LlamaModel.from_pretrained("chaoyi-wu/PMC_LLAMA_7B")

# Move the model to the device selected above.
model = model.to(device)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.88G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.89G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/7.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at chaoyi-wu/PMC_LLAMA_7B were not used when initializing LlamaModel: ['lm_head.weight']
- This IS expected if you are initializing LlamaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Define the pipeline

In [None]:
# Create the feature-extraction pipeline around the already-loaded model and tokenizer.
# The device index follows the same CUDA availability check used when the model was
# loaded (0 = first CUDA GPU, -1 = CPU), so the cell also runs on a CPU-only runtime.
p = pipeline(
    task="feature-extraction",
    tokenizer=tokenizer,
    model=model,
    framework="pt",
    device=0 if torch.cuda.is_available() else -1,
)

### Get the embeddings of the last token of the last hidden state

In [None]:
# Custom function to extract the embedding of the last token
def extract_last_token(last_hidden_states):
    """Return the entries at the last index of axis 1 for every item on axis 0.

    Args:
        last_hidden_states: Array-like that converts to an array with at least
            three dimensions. Assumed to be (batch, tokens, hidden) as produced
            by the feature-extraction pipeline -- TODO confirm.

    Returns:
        numpy.ndarray with axis 1 removed, e.g. (batch, hidden) for 3-D input.
    """
    hidden = np.array(last_hidden_states)
    final_position = -1
    return hidden[:, final_position, :]

# Feed every chief-complaint text to the feature-extraction pipeline in one call.
complaint_texts = df["Chief Complaint"].tolist()
results = p(complaint_texts)

# Reduce each returned hidden-state array to the vector of its last token.
embeddings = list(map(extract_last_token, results))

In [None]:
# Store each row's last-token embedding in a new "emb_LLaMA_no_FineT" column.
# `embeddings` is used as-is (no reshape) and must hold one entry per DataFrame row.
df["emb_LLaMA_no_FineT"] = embeddings

# Print the resulting DataFrame
print(df)

                                       Chief Complaint Predict Consensus  \
0    "been feeling bad" last 2 weeks & switched BP ...       N         -   
1    "can't walk", reports onset at <<TIME>>. orien...       Y         N   
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y   
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y   
4    "heart racing,"dyspnea, and orthopnea that has...       N         -   
..                                                 ...     ...       ...   
295  upper abd/R side chest pain x1 month, new onse...       N         N   
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N   
297  walked outside of a gas station and began bein...       N         -   
298  was getting prepped for colonoscopy and was se...       N         N   
299  Was seen at <<HOSPITAL>> after an MVC. Pt stat...       N         N   

                                   emb_biogpt_no_FineT  \
0    [[0.4998164773, -1.14541

## Save to JSON

In [None]:
# Write the DataFrame to JSON as a list of row records.
# NOTE(review): this is the same path the data was read from at the top of the notebook,
# so the input file is overwritten with the current (possibly filtered) frame --
# confirm this is intended or write to a new file.
df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/2020_noFT.json', orient='records')