In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the three pre-filtered splits from the working directory, in
# train / test / val order.
train, test, val = map(
    pd.read_csv,
    ('train_fil.csv', 'test_fil.csv', 'val_fil.csv'),
)

In [None]:
# Quick look at the first five training rows (5 is the default for head()).
train.head()

In [None]:
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizers are loaded as-is; only the models below are moved to `device`.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the pretrained BERT encoder, place it on `device`, and switch it to
# evaluation mode in one chain (both `.to()` and `.eval()` return the module).
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device).eval()

# Same for GPT-2; its `.eval()` call is kept as the cell's final expression
# so the notebook still echoes the GPT-2 module summary.
gpt2_model = GPT2Model.from_pretrained('gpt2').to(device)
gpt2_model.eval()

In [None]:
def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the final hidden states.

    The mean is taken over real tokens only: positions that the tokenizer
    added as padding (attention_mask == 0) are excluded, so batching several
    texts of different lengths does not distort the shorter ones. For a
    single string there is no padding and the result is the plain mean.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to at most 512 tokens.

    Returns:
        A NumPy array on the CPU with one row per input text (a single
        string yields one row) and one column per hidden unit.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # same device as the model
    with torch.no_grad():  # inference only; no autograd graph needed
        hidden = bert_model(**inputs).last_hidden_state
        # Broadcast the (batch, seq) mask over the hidden dimension.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts
    return pooled.detach().cpu().numpy()

# GPT-2's tokenizer ships without a padding token; reuse EOS so that
# `padding=True` in the call below is accepted.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

def get_gpt2_embeddings(text):
    """Embed text with GPT-2 by mean-pooling the final hidden states.

    The mean is taken over real tokens only: padded positions
    (attention_mask == 0) are excluded, so batching texts of different
    lengths does not mix padding activations into the embedding. For a
    single string there is no padding and the result is the plain mean.

    Args:
        text: A string, or a list of strings, to embed. Inputs are truncated
            to at most 512 tokens.

    Returns:
        A NumPy array on the CPU with one row per input text (a single
        string yields one row) and one column per hidden unit.
    """
    inputs = gpt2_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}  # same device as the model
    with torch.no_grad():  # inference only; no autograd graph needed
        hidden = gpt2_model(**inputs).last_hidden_state
        # Broadcast the (batch, seq) mask over the hidden dimension.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts
    return pooled.detach().cpu().numpy()


In [None]:
from tqdm import tqdm

# Work on a copy so the raw `train` frame stays untouched. Without this
# assignment `df` is undefined on a fresh kernel and the line below raises
# NameError.
df = train.copy()

# One BERT embedding array per row of the `text` column.
df['bert_embeddings'] = [get_bert_embeddings(text) for text in tqdm(df['text'], desc="Generating BERT Embeddings")]

In [None]:
from tqdm import tqdm

# Add the GPT-2 embeddings to the frame that already holds the training
# rows: one embedding array per entry of the `text` column.
df['gpt2_embeddings'] = list(
    map(get_gpt2_embeddings, tqdm(df['text'], desc="Generating GPT-2 Embeddings"))
)

In [None]:
# Embed the test split on a copy, leaving `test` itself unchanged.
df_1 = test.copy()

# BERT first, then GPT-2: one embedding array per `text` entry.
df_1['bert_embeddings'] = list(
    map(get_bert_embeddings, tqdm(df_1['text'], desc="Generating BERT Embeddings"))
)
df_1['gpt2_embeddings'] = list(
    map(get_gpt2_embeddings, tqdm(df_1['text'], desc="Generating GPT-2 Embeddings"))
)


In [None]:
df_2 = val.copy()
# Apply the functions to the DataFrame
df_2['bert_embeddings'] = [get_bert_embeddings(text) for text in tqdm(df_2['text'], desc="Generating BERT Embeddings")]
df_2['gpt2_embeddings'] = [get_gpt2_embeddings(text) for text in tqdm(df_2['text'], desc="Generating GPT-2 Embeddings")]

In [None]:
def save_as_hdf5(df, name):
    """Write `df` to the file `<name>.h5` under the key 'data'.

    The file is opened with mode 'w', so an existing file of the same name
    is replaced rather than appended to.

    Args:
        df: Object exposing a pandas-style `to_hdf` method.
        name: Output path without the `.h5` extension.
    """
    target_path = f'{name}.h5'
    df.to_hdf(target_path, key='data', mode='w')

# Persist each embedded split to its own HDF5 file: train, test, then val.
save_as_hdf5(df=df, name='train_embeddings')
save_as_hdf5(df=df_1, name='test_embeddings')
save_as_hdf5(df=df_2, name='val_embeddings')

In [None]:
# Preview only the first rows: the embedding columns hold whole arrays, so
# rendering the full frame is slow and bloats the saved notebook.
df.head()


In [None]:
!pip install tables