In [1]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [None]:
from transformers import GPT2Tokenizer, GPT2Model
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import torch
import numpy as np
from tqdm import tqdm

# Load GPT-2 model and tokenizer
model_name = "gpt2"  # You can use other variants like gpt2-medium, gpt2-large, gpt2-xl if needed
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2Model.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token  # Reuse eos_token for padding


# Move model to GPU (if available).
# The second GPU ("cuda:1") is still preferred, but on a machine with a single
# GPU that index does not exist, so fall back to "cuda:0" instead of crashing.
device = torch.device(
    ("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    if torch.cuda.is_available()
    else "cpu"
)
model.to(device)

# Load IMDb dataset
dataset = load_dataset("imdb")

# Function to get GPT-2 embeddings
def get_gpt2_embeddings(texts):
    """Embed texts with GPT-2 by mean-pooling the final-layer token states.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. Each item
    of ``texts`` is tokenized in its own call and truncated to 512 tokens.

    Args:
        texts: Iterable of inputs accepted by ``tokenizer``.

    Returns:
        np.ndarray produced by stacking the pooled embeddings of every item
        (one row per tokenized sequence). When ``texts`` yields nothing, an
        empty ``(0, hidden_size)`` array is returned instead of raising.
    """
    # Imported locally on purpose: a later cell runs ``import tqdm``, which
    # rebinds the global name ``tqdm`` to the module and would make a bare
    # ``tqdm(texts)`` call fail if this function is invoked afterwards.
    from tqdm import tqdm as progress_bar

    model.eval()  # Set model to evaluation mode
    all_embeddings = []

    with torch.no_grad():
        for text in progress_bar(texts):
            # Tokenize the input text
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

            # Get the model's hidden states (embeddings)
            outputs = model(**inputs)

            # Last hidden state, i.e. one vector per token: [batch_size, seq_len, hidden_size]
            hidden_states = outputs.last_hidden_state

            # GPT-2 has no [CLS] token, so pool by averaging the token vectors.
            # Weight by the attention mask so that padded positions (present
            # whenever an item is itself a batch of texts) do not enter the mean.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            embeddings = ((hidden_states * mask).sum(dim=1) / token_counts).cpu().numpy()
            all_embeddings.append(embeddings)

    if not all_embeddings:
        # np.vstack raises on an empty list; return a well-shaped empty result.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(all_embeddings)

# Pull the raw review texts of the IMDb train and test splits
train_texts, test_texts = (dataset[split]["text"] for split in ("train", "test"))

# Embed the training texts first, then the test texts (unpacking forces both calls in order)
train_embeddings, test_embeddings = map(get_gpt2_embeddings, (train_texts, test_texts))

100%|██████████| 25000/25000 [05:18<00:00, 78.38it/s]
100%|██████████| 25000/25000 [05:11<00:00, 80.16it/s]


Classification accuracy using GPT-2 embeddings: 89.36%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# NOTE: `accuracy` is assigned by the logistic-regression evaluation cell that
# appears further down in this notebook; on a fresh kernel that cell has to be
# run before this one, otherwise this line raises NameError.
accuracy

0.89364

In [None]:

# Fit a logistic-regression sentiment classifier (0 = negative, 1 = positive)
# on the training embeddings; `fit` returns the fitted estimator itself.
classifier = LogisticRegression(max_iter=1000).fit(
    train_embeddings,
    dataset["train"]["label"],
)

# Predict labels for the test embeddings
predictions = classifier.predict(test_embeddings)

# Fraction of test reviews whose predicted label matches the reference label
accuracy = accuracy_score(y_true=dataset["test"]["label"], y_pred=predictions)
print(f"Classification accuracy using GPT-2 embeddings: {accuracy * 100:.2f}%")

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [11]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import numpy as np
import json
from tqdm import tqdm

# Choose model type: 'bert-base-uncased' or 'gpt2'
def load_model_and_tokenizer(model_name):
    """Load the pretrained tokenizer/model pair for ``model_name``, ready for inference.

    The model is moved to the notebook-level ``device`` and switched to eval
    mode. A tokenizer that has no pad token gets its EOS token as pad token;
    an existing pad token is kept.

    Args:
        model_name: Identifier passed to ``AutoTokenizer`` / ``AutoModel``.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    net = AutoModel.from_pretrained(model_name)

    # Only fill in a pad token when the tokenizer does not define one.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    net.to(device)
    net.eval()
    return tok, net

# Extract embeddings using mean pooling
def get_embeddings(texts, tokenizer, model, max_len=256):
    """Embed each text as the mean of its final-layer token states.

    Every text is tokenized on its own, truncated/padded to ``max_len`` tokens
    and run through ``model`` on the notebook-level ``device``. The mean is
    weighted by the attention mask, so the padding added to reach ``max_len``
    does not dilute the embedding of short texts.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Tokenizer matching ``model``; must provide ``attention_mask``.
        model: Model whose output exposes ``last_hidden_state``.
        max_len: Sequence length used for truncation and padding.

    Returns:
        np.ndarray built from one pooled vector per text.
    """
    # Imported locally on purpose: this notebook binds the global name ``tqdm``
    # both to the class (``from tqdm import tqdm``) and to the module
    # (``import tqdm``), so relying on the global is order-dependent.
    from tqdm import tqdm as progress_bar

    embeddings = []
    with torch.no_grad():
        for text in progress_bar(texts):
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=max_len).to(device)
            outputs = model(**inputs)
            last_hidden = outputs.last_hidden_state

            # Masked mean: zero out padded positions and divide by the number
            # of real tokens (clamped so an all-padding input cannot divide by 0).
            mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
            pooled = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

            # Drop only the leading batch axis of the single-text batch.
            embeddings.append(pooled.squeeze(0).cpu().numpy())
    return np.array(embeddings)

import tqdm

# Extract and save embeddings for multiple datasets
def process_datasets(model_name, output_prefix, num_samples=1000, seed=None):
    """Embed a sample of every task in ``classification_tasks`` and save it as JSON.

    For each task the ``train`` and ``test`` splits are sub-sampled, embedded
    with ``get_embeddings`` and stored together with their ``label`` column.
    Tasks without both splits are skipped; a task that raises is reported and
    skipped so the remaining tasks are still processed.

    Args:
        model_name: Model identifier passed to ``load_model_and_tokenizer``.
        output_prefix: Prefix of the two output files
            ``{output_prefix}_train_sets.json`` and ``{output_prefix}_test_sets.json``.
        num_samples: Maximum number of rows taken from each split.
        seed: Optional seed for the sub-sampling. ``None`` (default) keeps the
            previous behavior of drawing from NumPy's global random state.

    Returns:
        None. Each JSON file maps task name -> ``[embeddings, labels]``.
    """
    # Imported locally on purpose: this notebook binds the global name ``tqdm``
    # both to the class (``from tqdm import tqdm``) and to the module
    # (``import tqdm``), so relying on the global is order-dependent.
    from tqdm import tqdm as progress_bar

    tokenizer, model = load_model_and_tokenizer(model_name)
    train_data, test_data = {}, {}
    rng = None if seed is None else np.random.default_rng(seed)

    for task in progress_bar(classification_tasks):
        print(f"\nProcessing dataset: {task}")
        try:
            dataset = load_dataset(task)
            if "train" not in dataset or "test" not in dataset:
                print(f"Skipping {task} due to missing train/test split.")
                continue

            # Sample data (limit size for faster processing); train is drawn before test.
            train = dataset["train"].select(_sample_indices(len(dataset["train"]), num_samples, rng))
            test = dataset["test"].select(_sample_indices(len(dataset["test"]), num_samples, rng))

            text_field = _find_text_field(train.column_names)

            train_emb = get_embeddings(train[text_field], tokenizer, model)
            test_emb = get_embeddings(test[text_field], tokenizer, model)
            train_labels = np.array(train["label"])
            test_labels = np.array(test["label"])

            train_data[task] = [train_emb.tolist(), train_labels.tolist()]
            test_data[task] = [test_emb.tolist(), test_labels.tolist()]
        except Exception as e:
            # Deliberate best-effort: one broken dataset must not abort the whole run.
            print(f"Error processing {task}: {e}")

    # Save to JSON
    with open(f"{output_prefix}_train_sets.json", "w") as f:
        json.dump(train_data, f)
    with open(f"{output_prefix}_test_sets.json", "w") as f:
        json.dump(test_data, f)


def _sample_indices(split_size, num_samples, rng=None):
    """Return all row indices in order for a small split, else ``num_samples`` random ones."""
    if split_size < num_samples:
        return np.arange(split_size)
    # Without an explicit generator, fall back to NumPy's global random state.
    permute = np.random.permutation if rng is None else rng.permutation
    return permute(split_size)[:num_samples]


def _find_text_field(column_names):
    """Pick the text column: the first known alternative name present, else ``"text"``."""
    for field in ["sentence", "content", "question", "review", "comment"]:
        if field in column_names:
            return field
    if "text" in column_names:
        return "text"
    raise ValueError(f"no known text column among {list(column_names)}")

# Device setup.
# The second GPU ("cuda:1") is still preferred, but on a machine with a single
# GPU that index does not exist, so fall back to "cuda:0" instead of crashing.
device = torch.device(
    ("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    if torch.cuda.is_available()
    else "cpu"
)

# Tasks to process
classification_tasks = [
    "imdb", "ag_news", "yelp_polarity", "dbpedia_14",
    "amazon_polarity", "emotion", "rotten_tomatoes"
]

# Run for both models
process_datasets("gpt2", "GPT2", num_samples=1000)
process_datasets("bert-base-uncased", "BERT", num_samples=1000)

Using pad_token, but it is not set yet.
  0%|          | 0/7 [00:00<?, ?it/s]


Processing dataset: imdb


100%|██████████| 1000/1000 [00:10<00:00, 92.54it/s]
100%|██████████| 1000/1000 [00:10<00:00, 92.66it/s]
 14%|█▍        | 1/7 [00:25<02:30, 25.10s/it]


Processing dataset: ag_news


100%|██████████| 1000/1000 [00:10<00:00, 97.96it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.17it/s]
 29%|██▊       | 2/7 [00:48<01:59, 23.88s/it]


Processing dataset: yelp_polarity


100%|██████████| 1000/1000 [00:10<00:00, 95.51it/s]
100%|██████████| 1000/1000 [00:10<00:00, 95.54it/s]
 43%|████▎     | 3/7 [01:12<01:35, 24.00s/it]


Processing dataset: dbpedia_14


100%|██████████| 1000/1000 [00:10<00:00, 97.63it/s]
100%|██████████| 1000/1000 [00:10<00:00, 97.56it/s]
 57%|█████▋    | 4/7 [01:35<01:11, 23.80s/it]


Processing dataset: amazon_polarity


100%|██████████| 1000/1000 [00:10<00:00, 97.42it/s]
100%|██████████| 1000/1000 [00:10<00:00, 97.42it/s]
 71%|███████▏  | 5/7 [01:59<00:47, 23.66s/it]


Processing dataset: emotion


100%|██████████| 1000/1000 [00:10<00:00, 98.82it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.86it/s]
 86%|████████▌ | 6/7 [02:27<00:25, 25.14s/it]


Processing dataset: rotten_tomatoes


100%|██████████| 1000/1000 [00:10<00:00, 98.57it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.60it/s]
100%|██████████| 7/7 [02:49<00:00, 24.24s/it]
  0%|          | 0/7 [00:00<?, ?it/s]


Processing dataset: imdb


100%|██████████| 1000/1000 [00:09<00:00, 103.76it/s]
100%|██████████| 1000/1000 [00:09<00:00, 104.09it/s]
 14%|█▍        | 1/7 [00:23<02:18, 23.04s/it]


Processing dataset: ag_news


100%|██████████| 1000/1000 [00:08<00:00, 111.67it/s]
100%|██████████| 1000/1000 [00:08<00:00, 112.43it/s]
 29%|██▊       | 2/7 [00:44<01:49, 21.83s/it]


Processing dataset: yelp_polarity


100%|██████████| 1000/1000 [00:09<00:00, 108.19it/s]
100%|██████████| 1000/1000 [00:09<00:00, 108.64it/s]
 43%|████▎     | 3/7 [01:05<01:26, 21.75s/it]


Processing dataset: dbpedia_14


100%|██████████| 1000/1000 [00:08<00:00, 111.36it/s]
100%|██████████| 1000/1000 [00:08<00:00, 111.30it/s]
 57%|█████▋    | 4/7 [01:26<01:04, 21.56s/it]


Processing dataset: amazon_polarity


100%|██████████| 1000/1000 [00:09<00:00, 110.73it/s]
100%|██████████| 1000/1000 [00:09<00:00, 110.59it/s]
 71%|███████▏  | 5/7 [01:49<00:43, 21.80s/it]


Processing dataset: emotion


100%|██████████| 1000/1000 [00:08<00:00, 113.28it/s]
100%|██████████| 1000/1000 [00:08<00:00, 113.16it/s]
 86%|████████▌ | 6/7 [02:10<00:21, 21.52s/it]


Processing dataset: rotten_tomatoes


100%|██████████| 1000/1000 [00:08<00:00, 112.77it/s]
100%|██████████| 1000/1000 [00:08<00:00, 112.85it/s]
100%|██████████| 7/7 [02:29<00:00, 21.42s/it]
