# Install Required libraries

In [5]:
!pip install sacremoses transformers peft

Collecting peft
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m906.6 kB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from peft)
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.21.0 peft-0.4.0


# Import required libraries

In [2]:
from transformers import BioGptTokenizer, BioGptForSequenceClassification, pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import Data

In [3]:
# Load the clinical-text dataset into `df` from an absolute Google Drive path.
# NOTE(review): no Drive mount call is visible in this notebook — presumably Drive is mounted beforehand; confirm.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dissertation/data/final_clinicalText.csv')

# Load Model and tokenizer (BioGPT)

In [None]:
# Pick the GPU when CUDA is available, otherwise the CPU.
# NOTE(review): `device` is not used in this cell — the only `.to(device)` call below is commented out.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the earlier remark on this line said that models with absolute position embeddings
# (it named GPT-2) are usually padded on the RIGHT, yet the code below loads the BioGPT tokenizer and
# pads on the LEFT. Left padding is presumably wanted so the last position is a real token for the
# last-token extraction later in the notebook — confirm which is intended.
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
tokenizer.padding_side = "left"

# Load the BioGPT checkpoint through AutoModel.
# NOTE(review): `num_labels=2` is passed although the class requested is AutoModel rather than a
# sequence-classification class — presumably a leftover; verify.
model = AutoModel.from_pretrained("microsoft/biogpt", num_labels=2, return_dict=True)
#model.resize_token_embeddings(len(tokenizer))

#model = model.to(device)

In [6]:
# PEFT utilities.
# NOTE(review): of the names imported here, only PeftConfig and PeftModel are used in this cell.
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    prepare_model_for_int8_training,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PeftConfig,
    PeftModel,

)

# NOTE(review): `device` is recomputed here but not used in this cell (the `.to(device)` call at the
# bottom is commented out).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Identifier of the PEFT adapter to load.
peft_model_id = "Lukee4/biogpt-letters"
# Read the adapter's configuration; its `base_model_name_or_path` is used below to load the base model.
config = PeftConfig.from_pretrained(peft_model_id)

#config.task_type='FEATURE_EXTRACTION'
# Load the base model named in the adapter configuration.
inference_model = AutoModel.from_pretrained(config.base_model_name_or_path)

# NOTE(review): this rebinds `tokenizer`, replacing the instance whose padding_side was set to "left"
# in the earlier cell; the padding side is not set again here.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
inference_model = PeftModel.from_pretrained(inference_model, peft_model_id)

#model = model.to(device)

Downloading (…)/adapter_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/696k [00:00<?, ?B/s]

Downloading adapter_model.bin:   0%|          | 0.00/4.78M [00:00<?, ?B/s]

In [None]:
# TODO: oversample 2019 and get result
# TODO: class weight
# TODO: fine-tune

## Define the pipeline

In [None]:
# Create the feature-extraction pipeline around `model` (the AutoModel instance, not the
# PEFT-wrapped `inference_model`).
# Run on GPU 0 when CUDA is available and fall back to the CPU (-1) otherwise, so the cell
# does not fail on a machine without a GPU.
p = pipeline(
    task="feature-extraction",
    tokenizer=tokenizer,
    model=model,
    framework="pt",
    device=0 if torch.cuda.is_available() else -1,
)

### Get the embeddings of the last token of the last hidden state

In [None]:
# Custom function to extract the embedding of the last token
def extract_last_token(last_hidden_states):
    """Return the entries at the final index of axis 1.

    The argument is converted to a NumPy array and indexed as ``[:, -1, :]``,
    so it must be 3-dimensional; the result keeps the first and third axes.
    Assumes the axes are (batch, tokens, hidden) as produced by the
    feature-extraction pipeline — TODO confirm.
    """
    states = np.array(last_hidden_states)
    final_position = -1
    return states[:, final_position, :]

# Process the data using the pipeline.
# The text column is taken directly from the frame instead of being rebuilt
# row by row with iterrows(); the list of texts passed to the pipeline is the same.
results = p(df["extracted_text"].tolist())

# Extract the last token of the last hidden state
embeddings = [extract_last_token(hidden_state) for hidden_state in results]

In [None]:
# Store each row's last-token embedding in a new "emb_biogpt_no_FineT" column.
# The arrays are assigned exactly as produced above; nothing is reshaped in this cell.
df["emb_biogpt_no_FineT"] = embeddings

# Print the resulting DataFrame
print(df)

    clinical_action                                     extracted_text  \
0                 1  ATTENDANCE STATUS Did Not Attend - no advance ...   
1                 0  Your patient has been informed,  along with an...   
2                 0  Your patient has been informed,  along with an...   
3                 1  ATTENDANCE STATUS Did Not Attend - no advance ...   
4                 1  ATTENDANCE STATUS Did Not Attend - no advance ...   
..              ...                                                ...   
94                0  ATTENDANCE STATUS Attended on time or, if late...   
95                0  ATTENDANCE STATUS Attended on time or, if late...   
96                1  ATTENDANCE STATUS Attended on time or, if late...   
97                0  ATTENDANCE STATUS Attended on time or, if late...   
98                1  ATTENDANCE STATUS Did Not Attend - no advance ...   

                                  emb_biogpt_no_FineT  
0   [[-0.6997142434120178, 0.3576340675354004, -1....  

## Save to JSON (or CSV / TXT)

In [None]:
# Write `df` to Google Drive as JSON, one record per row (orient='records').
df.to_json('/content/drive/MyDrive/Colab Notebooks/dissertation/data/letters_noFT.json', orient='records')

# Alternative CSV / plain-text exports, currently disabled:
#df.to_csv("/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.csv", index=False)
#np.savetxt(r"/content/drive/MyDrive/Colab Notebooks/dissertation/data/datafinal.txt", df, fmt='%s')

# For testing purposes (SVC code)

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 makes the split repeatable.
# NOTE(review): the target is read from a `Predict` column, but the frame printed earlier in this
# notebook shows only `clinical_action`, `extracted_text` and the embedding column — confirm which
# dataset / label column this cell is meant to run against.
X_train, X_test, y_train, y_test = train_test_split(df.emb_biogpt_no_FineT, df.Predict, test_size=0.2, random_state=42)

In [None]:
# Turn each split's column of per-row embeddings into one dense array and
# drop any length-1 axes, then report the training matrix's shape.
X_train = np.squeeze(np.array(X_train.tolist()))
X_test = np.squeeze(np.array(X_test.tolist()))
print(X_train.shape)

(150, 1024)


In [None]:
# Display the training feature array as the cell's output.
X_train

In [None]:
# Fit a support-vector classifier on the training embeddings.
# Only decision_function_shape is set (one-vs-one); every other SVC setting is left at its default.
from sklearn.svm import SVC
svm_model = SVC(decision_function_shape='ovo')
svm_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test embeddings with the fitted SVC.
y_pred = svm_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# zero_division=0 reports 0.0 for a class that is never predicted (precision undefined) without
# emitting an UndefinedMetricWarning for each affected metric; the reported numbers are the same
# as with the default setting.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           N       0.61      0.78      0.68        18
           U       0.00      0.00      0.00         3
           Y       0.60      0.53      0.56        17

    accuracy                           0.61        38
   macro avg       0.40      0.44      0.42        38
weighted avg       0.56      0.61      0.58        38



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Text-classification pipeline.
# NOTE(review): `model` is created with AutoModel in the model-loading cell, not with a
# sequence-classification class — confirm it carries a classification head before relying
# on this pipeline's labels/scores.
# Run on GPU 0 when CUDA is available and fall back to the CPU (-1) otherwise, so the cell
# does not fail on a machine without a GPU.
c = pipeline(
    task="text-classification",
    tokenizer="microsoft/biogpt",
    model=model,
    framework="pt",
    device=0 if torch.cuda.is_available() else -1,
)


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Classify every text in the column; the column is read directly instead of via iterrows().
# NOTE(review): a "Chief Complaint" column does not appear in the frame printed earlier in this
# notebook — confirm which dataset this cell is meant to run against.
results2 = c(df["Chief Complaint"].tolist())


In [None]:
# Show the raw pipeline predictions (a list of {'label': ..., 'score': ...} dicts).
# TODO: to produce a classification report, pass the ground-truth labels as the first argument of
# classification_report and map the 'LABEL_k' strings onto the dataset's label values for the second.
print(results2)

[{'label': 'LABEL_0', 'score': 0.6615974307060242}, {'label': 'LABEL_0', 'score': 0.8978613615036011}, {'label': 'LABEL_0', 'score': 0.737217903137207}, {'label': 'LABEL_1', 'score': 0.5124248266220093}, {'label': 'LABEL_0', 'score': 0.7070297598838806}, {'label': 'LABEL_0', 'score': 0.6875593066215515}, {'label': 'LABEL_0', 'score': 0.6362900137901306}, {'label': 'LABEL_1', 'score': 0.5346298813819885}, {'label': 'LABEL_0', 'score': 0.7779340147972107}, {'label': 'LABEL_0', 'score': 0.6526575088500977}, {'label': 'LABEL_0', 'score': 0.7227324843406677}, {'label': 'LABEL_0', 'score': 0.8791565895080566}, {'label': 'LABEL_1', 'score': 0.932992696762085}, {'label': 'LABEL_0', 'score': 0.6692107915878296}, {'label': 'LABEL_0', 'score': 0.5934761166572571}, {'label': 'LABEL_0', 'score': 0.5123370885848999}, {'label': 'LABEL_0', 'score': 0.7277477383613586}, {'label': 'LABEL_0', 'score': 0.9222954511642456}, {'label': 'LABEL_0', 'score': 0.6491223573684692}, {'label': 'LABEL_0', 'score': 0.

# Normalize features (see https://discuss.huggingface.co/t/finetuning-for-feature-extraction-i-e-unsupervised-fine-tuning/12595/4)

## Extract the embeddings using mean pooling

In [None]:
def mean_pooling(last_hidden_states):
    """Average the hidden states along axis 1.

    Parameters
    ----------
    last_hidden_states : array-like with at least two axes
        Assumed to be laid out as (batch, tokens, hidden), e.g. one element of
        the feature-extraction pipeline's output — TODO confirm.

    Returns
    -------
    numpy.ndarray
        The input with axis 1 averaged away, i.e. (batch, hidden) for a
        (batch, tokens, hidden) input.

    Notes
    -----
    Every entry along the leading axis is kept. A previous version sliced the
    leading axis with ``[-12:]`` under a "last 4 layers" label; that slice did
    not select layers and silently dropped entries when there were more than 12.
    """
    return np.mean(last_hidden_states, axis=1)

# Process the data using the pipeline.
# The text column is read directly instead of being rebuilt row by row with iterrows().
# NOTE(review): `df2` is not defined in the visible part of this notebook — confirm where it is loaded.
results = p(df2["text"].tolist())

# Perform mean pooling on the last hidden states
embeddings = [mean_pooling(hidden_state) for hidden_state in results]

# Join the pooled per-text arrays along their first axis into one feature matrix.
# This is done after pooling: the raw per-text outputs can differ in token count,
# so they cannot be combined into a single regular array directly.
features = np.concatenate(embeddings, axis=0)

print(features.shape)

# Store the pooled embeddings in a new "embeddings12" column
df2["embeddings12"] = embeddings

# Print the resulting DataFrame
print(df2)