In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
train = pd.read_csv('train_fil.csv')
test = pd.read_csv('test_fil.csv')
val = pd.read_csv('val_fil.csv')

In [6]:
train.head(5)

Unnamed: 0.1,Unnamed: 0,text,source,prompt_id,text_length,word_count
0,57594,The food is always hot and made fresh. I prefe...,Flan-T5-XL,0,169,34
1,343858,Seriously the slowest service you could ever h...,Human,0,331,63
2,462221,This reaction is favored at low pressures but ...,Human,0,610,98
3,100762,"Justin had owned his car for over five years, ...",GPT-3.5,0,550,109
4,639192,I got this. One I think you are mistaken it is...,OPT-2.7B,0,193,36


In [11]:
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model
import torch

# Assuming CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models with GPU support
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2Model.from_pretrained('gpt2').to(device)

# Ensure the model is in evaluation mode
bert_model.eval()
gpt2_model.eval()

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [15]:
def get_bert_embeddings(text):
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to the GPU
    with torch.no_grad():  # Temporarily set all the requires_grad flags to false
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()  # Move embeddings back to CPU

def get_gpt2_embeddings(text):
    inputs = gpt2_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to the GPU
    with torch.no_grad():  # Temporarily set all the requires_grad flags to false
        outputs = gpt2_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()  # Move embeddings back to CPU


In [14]:
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

In [16]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,text,source,prompt_id,text_length,word_count,bert_embeddings
0,57594,The food is always hot and made fresh. I prefe...,Flan-T5-XL,0,169,34,"[[0.1458053, 0.018536663, 0.25950676, 0.172973..."
1,343858,Seriously the slowest service you could ever h...,Human,0,331,63,"[[0.20390975, 0.0711168, 0.18746778, 0.0818906..."
2,462221,This reaction is favored at low pressures but ...,Human,0,610,98,"[[-0.46987852, 0.124872394, 0.20436251, -0.054..."


In [17]:
from tqdm import tqdm
#df = train.copy()
# Apply the functions to the DataFrame
#df['bert_embeddings'] = [get_bert_embeddings(text) for text in tqdm(df['text'], desc="Generating BERT Embeddings")]
df['gpt2_embeddings'] = [get_gpt2_embeddings(text) for text in tqdm(df['text'], desc="Generating GPT-2 Embeddings")]


Generating GPT-2 Embeddings: 100%|██████████████████████████████████████████████████████████████████████| 253255/253255 [32:50<00:00, 128.52it/s]


In [18]:
df_1 = test.copy()
# Apply the functions to the DataFrame
df_1['bert_embeddings'] = [get_bert_embeddings(text) for text in tqdm(df_1['text'], desc="Generating BERT Embeddings")]
df_1['gpt2_embeddings'] = [get_gpt2_embeddings(text) for text in tqdm(df_1['text'], desc="Generating GPT-2 Embeddings")]



Generating BERT Embeddings: 100%|█████████████████████████████████████████████████████████████████████████| 79143/79143 [10:34<00:00, 124.75it/s]
Generating GPT-2 Embeddings: 100%|████████████████████████████████████████████████████████████████████████| 79143/79143 [10:14<00:00, 128.77it/s]


In [19]:
df_2 = val.copy()
# Apply the functions to the DataFrame
df_2['bert_embeddings'] = [get_bert_embeddings(text) for text in tqdm(df_2['text'], desc="Generating BERT Embeddings")]
df_2['gpt2_embeddings'] = [get_gpt2_embeddings(text) for text in tqdm(df_2['text'], desc="Generating GPT-2 Embeddings")]



Generating BERT Embeddings: 100%|█████████████████████████████████████████████████████████████████████████| 63314/63314 [08:30<00:00, 123.98it/s]
Generating GPT-2 Embeddings: 100%|████████████████████████████████████████████████████████████████████████| 63314/63314 [08:12<00:00, 128.55it/s]


In [25]:
def save_as_hdf5(df, name):
    # Save the entire DataFrame as an HDF5 file
    df.to_hdf(f'{name}.h5', key='data', mode='w')

save_as_hdf5(df, 'train_embeddings')
save_as_hdf5(df_1, 'test_embeddings')
save_as_hdf5(df_2, 'val_embeddings')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['text', 'source', 'bert_embeddings', 'gpt2_embeddings'], dtype='object')]

  df.to_hdf(f'{name}.h5', key='data', mode='w')


In [24]:
!pip install tables

[33mDEPRECATION: Loading egg at /home/sadibha2/.conda/envs/localization/lib/python3.12/site-packages/MultiScaleDeformableAttention-1.0-py3.12-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting tables
  Downloading tables-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting numexpr>=2.6.2 (from tables)
  Downloading numexpr-2.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting py-cpuinfo (from tables)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting blosc2>=2.3.0 (from tables)
  Downloading blosc2-2.6.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting ndindex>=1.4 (from blosc2>=2.3.0->tables)
  Downloading ndindex-1.8-py3-none-any.whl.metadata (3.4 kB)
Colle