# Install Required libraries

In [None]:
!pip install sacremoses transformers peft
! pip install -U transformers

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-ma

# Import required libraries

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BioGptForSequenceClassification,
    BioGptTokenizer,
    pipeline,
)

# Import Data

In [None]:
# Read the 2019 chief-complaint corpus from the mounted Drive folder.
corpus_2019_path = '/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT-CC-2019-CORPUS-REDACTED.csv'
df = pd.read_csv(corpus_2019_path)

# Disabled: drop the "Predict" column and append the 2020 corpus to the 2019 one.
#df = df.drop("Predict", axis=1)
#df2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dissertation/data/GOUT-CC-2020-CORPUS-REDACTED.csv')
#df2 = df2.drop("Predict", axis=1)
#df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Keep only rows with a resolved consensus label: '-' (unmarked) and 'U' (unknown)
# are removed. The original index labels of the surviving rows are preserved.
unresolved = df['Consensus'].isin(['-', 'U'])

# A boolean mask replaces the row-by-row iterrows() loop. .copy() yields an
# independent frame so that adding columns later does not trigger
# SettingWithCopyWarning.
df = df.loc[~unresolved].copy()

# Print the updated DataFrame
print(df)

                                       Chief Complaint Predict Consensus  \
1    "can't walk", reports onset at <<TIME>>. orien...       Y         N   
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y   
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y   
5    "I started breathing hard"  hx- htn, gout, anx...       N         N   
6    "I think I have a gout flare up" L wrist pain ...       Y         Y   
..                                                 ...     ...       ...   
294  unwitnessed seizure last night, no dialysis in...       N         N   
295  upper abd/R side chest pain x1 month, new onse...       N         N   
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N   
298  was getting prepped for colonoscopy and was se...       N         N   
299  Was seen at <<HOSPITAL>> after an MVC. Pt stat...       N         N   

                                   emb_biogpt_no_FineT  
1    [[-0.9474165439605713, -0

# Small data analysis

In [None]:
# Overall Consensus distribution, followed by the Consensus breakdown for each
# value of the Predict column. A rule line is printed after every section.
rule = "-----------------------------------------------------------------"

print("group by consensus")
print(df.groupby("Consensus").size())
print(rule)

for predict_value in ("U", "N", "Y", "-"):
    print(f"Predict is {predict_value}")
    print(df[df['Predict'] == predict_value].groupby('Consensus').size())
    print(rule)


group by consensus
Consensus
-    103
N    118
U      9
Y     70
dtype: int64
-----------------------------------------------------------------
Predict is U
Consensus
N    16
U     5
Y     6
dtype: int64
-----------------------------------------------------------------
Predict is N
Consensus
-    86
N    85
U     4
Y     1
dtype: int64
-----------------------------------------------------------------
Predict is Y
Consensus
-    17
N    17
Y    63
dtype: int64
-----------------------------------------------------------------
Predict is -
Series([], dtype: int64)
-----------------------------------------------------------------


# Load Model and tokenizer (BioGPT)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the usual advice for models with absolute position embeddings is to
# pad inputs on the right, yet padding is set to the LEFT here — confirm this is
# intentional. This tokenizer object is not passed to the pipelines created further
# down; they receive the checkpoint name "microsoft/biogpt" instead.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
tokenizer.padding_side = "left"

# The base checkpoint carries no classification head: `score.weight` is newly
# initialised at load time (see the warning printed by this cell), so predictions
# are not meaningful until the model has been fine-tuned.
model = BioGptForSequenceClassification.from_pretrained("microsoft/biogpt").to(device)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/biogpt were not used when initializing BioGptForSequenceClassification: ['output_projection.weight']
- This IS expected if you are initializing BioGptForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BioGptForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BioGptForSequenceClassification were not initialized from the model checkpoint at microsoft/biogpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# TODO: oversample the 2019 data and compare the results
# TODO: try class weights
# TODO: fine-tune the model

In [None]:
#clear GPU cache
#torch.cuda.empty_cache()

## Define the pipeline

In [None]:
# Feature-extraction pipeline over the "microsoft/biogpt" checkpoint; both the
# tokenizer and the model are loaded by name.
p = pipeline(
    task="feature-extraction",
    tokenizer="microsoft/biogpt",
    model="microsoft/biogpt",
    framework="pt",
    # 0 selects the first CUDA device; -1 selects the CPU, so the cell also runs
    # on a runtime without a GPU instead of failing.
    device=0 if torch.cuda.is_available() else -1,
)

Some weights of the model checkpoint at microsoft/biogpt were not used when initializing BioGptModel: ['output_projection.weight']
- This IS expected if you are initializing BioGptModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BioGptModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Get the embeddings of the last token of the last hidden state

In [None]:
def extract_last_token(last_hidden_states):
    """Return the entry at the final position of axis 1.

    The input (nested lists or an array) is converted to a NumPy array and
    sliced as ``[:, -1, :]``, so it must be three-dimensional.
    Assumes the axes are (batch, tokens, hidden) — TODO confirm against the
    pipeline output.

    Returns:
        A 2-D ``numpy.ndarray`` with axis 1 removed.
    """
    states = np.array(last_hidden_states)
    final_position = states[:, -1, :]
    return final_position

# Feed every chief complaint through the feature-extraction pipeline.
complaints = df["Chief Complaint"].tolist()
results = p(complaints)

# Reduce each returned hidden-state structure to its last-token vector.
embeddings = list(map(extract_last_token, results))

In [None]:
# Store the last-token embeddings in a new "emb_biogpt_no_FineT" column of df.
# `embeddings` holds one entry per processed row, in row order.
df["emb_biogpt_no_FineT"] = embeddings

# Print the resulting DataFrame
print(df)

                                       Chief Complaint Predict Consensus  \
0    "been feeling bad" last 2 weeks & switched BP ...       N         -   
1    "can't walk", reports onset at <<TIME>>. orien...       Y         N   
2    "dehydration" Chest hurts, hips hurt, cramps P...       Y         Y   
3    "gout flare up" L arm swelling x 1 week. denie...       Y         Y   
4    "heart racing,"dyspnea, and orthopnea that has...       N         -   
..                                                 ...     ...       ...   
295  upper abd/R side chest pain x1 month, new onse...       N         N   
296  upper lip swelling x one day, pmh HTN, COPD, b...       N         N   
297  walked outside of a gas station and began bein...       N         -   
298  was getting prepped for colonoscopy and was se...       N         N   
299  Was seen at <<HOSPITAL>> after an MVC. Pt stat...       N         N   

                                   emb_biogpt_no_FineT  
0    [[0.4998164772987366, -1.

## Save to JSON (or CSV / TXT)

In [None]:

# Write the frame to Drive as a JSON array with one record per row.
json_path = '/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.json'
df.to_json(json_path, orient='records')

# Alternative export formats, currently disabled:
#df.to_csv("/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.csv", index=False)
#np.savetxt(r"/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.txt", df, fmt='%s')

# For testing purposes (SVC code)

In [None]:
# Print precision / recall / F1 per class for a set of predictions.
# NOTE(review): y_test and y_pred are not defined anywhere in this notebook, so
# this cell raises NameError on a fresh kernel. Presumably they come from the SVC
# experiment mentioned in the heading — confirm and add the code that creates them.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           N       0.61      0.78      0.68        18
           U       0.00      0.00      0.00         3
           Y       0.60      0.53      0.56        17

    accuracy                           0.61        38
   macro avg       0.40      0.44      0.42        38
weighted avg       0.56      0.61      0.58        38



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Text-classification pipeline wrapping the `model` object loaded earlier in the
# notebook; the tokenizer is loaded by checkpoint name.
c = pipeline(
    task="text-classification",
    tokenizer="microsoft/biogpt",
    model=model,
    framework="pt",
    # 0 selects the first CUDA device; -1 selects the CPU, so the cell also runs
    # on a runtime without a GPU instead of failing.
    device=0 if torch.cuda.is_available() else -1,
)


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Classify every chief complaint; each result is a dict with 'label' and 'score'.
results2 = c(df["Chief Complaint"].tolist())


In [None]:
# classification_report needs two label sequences; the original call omitted the
# true labels (a SyntaxError) and passed the raw pipeline dicts as predictions.
# The pipeline emits generic 'LABEL_0' / 'LABEL_1' names, so they are translated to
# the corpus labels before scoring.
# NOTE(review): LABEL_0 -> 'N' and LABEL_1 -> 'Y' is an ASSUMPTION — the
# classification head is untrained, so its label order carries no meaning yet.
# Confirm the mapping once the model is fine-tuned. Rows whose Consensus is not
# 'N' or 'Y' must have been dropped beforehand for the report to be meaningful.
label_to_consensus = {"LABEL_0": "N", "LABEL_1": "Y"}
consensus_true = df["Consensus"].tolist()
consensus_pred = [label_to_consensus[prediction["label"]] for prediction in results2]
print(classification_report(consensus_true, consensus_pred))

[{'label': 'LABEL_0', 'score': 0.6615974307060242}, {'label': 'LABEL_0', 'score': 0.8978613615036011}, {'label': 'LABEL_0', 'score': 0.737217903137207}, {'label': 'LABEL_1', 'score': 0.5124248266220093}, {'label': 'LABEL_0', 'score': 0.7070297598838806}, {'label': 'LABEL_0', 'score': 0.6875593066215515}, {'label': 'LABEL_0', 'score': 0.6362900137901306}, {'label': 'LABEL_1', 'score': 0.5346298813819885}, {'label': 'LABEL_0', 'score': 0.7779340147972107}, {'label': 'LABEL_0', 'score': 0.6526575088500977}, {'label': 'LABEL_0', 'score': 0.7227324843406677}, {'label': 'LABEL_0', 'score': 0.8791565895080566}, {'label': 'LABEL_1', 'score': 0.932992696762085}, {'label': 'LABEL_0', 'score': 0.6692107915878296}, {'label': 'LABEL_0', 'score': 0.5934761166572571}, {'label': 'LABEL_0', 'score': 0.5123370885848999}, {'label': 'LABEL_0', 'score': 0.7277477383613586}, {'label': 'LABEL_0', 'score': 0.9222954511642456}, {'label': 'LABEL_0', 'score': 0.6491223573684692}, {'label': 'LABEL_0', 'score': 0.

In [None]:
def mean_pooling(last_hidden_states):
    """Average over axis 1 of the trailing entries of ``last_hidden_states``.

    At most the last 12 entries along the first axis are kept, then the mean is
    taken over axis 1 of what remains.

    NOTE(review): the earlier inline comment spoke of "the last 4 layers", but
    the slice keeps 12 entries, and whether the first axis really indexes
    layers depends on what the caller passes in — confirm the intent.
    """
    tail = last_hidden_states[-12:]
    pooled = np.mean(tail, axis=1)
    return pooled

# NOTE(review): `df2` is only created in commented-out code near the top of the
# notebook, and no "text" column appears in the frames printed above, so this cell
# raises NameError on a fresh kernel — confirm which frame and column are intended.
# It also rebinds `results` and `embeddings` from the last-token cell.
# Process the data using the pipeline
results = p([row["text"] for _, row in df2.iterrows()])

# NOTE(review): presumably only works when every input yields the same number of
# tokens; otherwise the nested lists are ragged — verify.
features = np.squeeze(results)

print(features.shape)
# Perform mean pooling on the last hidden states
embeddings = [mean_pooling(hidden_state) for hidden_state in results]

# Store the pooled embeddings in a new "embeddings12" column of df2
df2["embeddings12"] = embeddings

# Print the resulting DataFrame
print(df2)