In [9]:
pip install datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
pip install Dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import random
import numpy as np
import torch
import json
import os
import copy
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn import functional as F
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report
from sklearn.neighbors import KDTree

# Function to load the model and tokenizer
def load_model_tokenizer(model_name, single_precision):
    if model_name == 'meta-llama/llama-2-7b-hf':
        if not single_precision:
            model = AutoModelForCausalLM.from_pretrained(model_name)
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'pad_token': '<PAD>'})
    model.resize_token_embeddings(len(tokenizer))
    model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

# Class to represent input examples
class InputExample(object):
    def __init__(self, guid=None, text_a="", text_b="", label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

# Class to process SST2 data
class SST2Processor():
    def __init__(self, classes_in_data):
        self.labels = classes_in_data
        self.label_mapping = {k: i for (i, k) in enumerate(self.labels)}

    def get_examples(self, file_path):
        examples = []
        with open(file_path, encoding='utf-8') as f:
            lines = f.readlines()
            for idx, line in enumerate(lines):
                linelist = line.strip().split('\t')
                text_a = linelist[0]
                label = linelist[1]
                guid = f"{file_path}-{idx}"
                example = InputExample(guid=guid, text_a=text_a, label=self.label_mapping[label])
                examples.append(example)
        return examples

# Function to load dataset
def load_dataset(train_path, test_path, classes_in_data):
    dataset_dict = dict()
    processor = SST2Processor(classes_in_data)
    dataset_dict['train'] = processor.get_examples(train_path)
    dataset_dict['test'] = processor.get_examples(test_path)
    print("Length of train set: ", len(dataset_dict['train']))
    print("Length of test set", len(dataset_dict['test']))
    print("Train example at 0th index: ", dataset_dict['train'][0])
    return dataset_dict

# Function to create KDTree for nearest neighbor search
def get_kdtree(dataset):
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    train_sentences = [ex.text_a for ex in dataset['train']]
    train_embeddings = sbert_model.encode(train_sentences)
    tree = KDTree(train_embeddings)
    return sbert_model, tree

# Function to get top k examples using KDTree
def get_top_k_examples(test_example, sbert_model, tree, dataset_dict, k=10):
    test_embedding = sbert_model.encode([test_example.text_a])
    _, top_k_indices = tree.query(test_embedding, k=k)
    top_k_examples = [dataset_dict['train'][idx] for idx in top_k_indices[0]]
    return top_k_examples

# Function to perform one-shot prediction
def one_shot_prediction(test_example, one_shot_example, prompt_prefix, prompt_suffix, tokenizer, max_rem_len, prompts, model, class_idx, classes, device):
    # Construct the prompt
    prompt = f'{prompt_prefix}Review: {one_shot_example.text_a}\nSentiment: {classes[one_shot_example.label]}\n\nReview: {test_example.text_a}\n'
    enc = tokenizer.encode_plus(prompt, return_tensors='pt', padding='longest')
    
    # Truncate and pad the input
    for key, enc_value in list(enc.items()):
        enc_value = enc_value[:, :max_rem_len]
        enc[key] = torch.cat([enc_value, prompts[key][:enc_value.shape[0]]], dim=1)
    
    seq_len = enc['input_ids'].shape[1]
    enc = {ky: v.to(device) for ky, v in enc.items()}
    
    # Get model prediction
    with torch.no_grad():
        result = model(**enc).logits
    
    result = result[:, -1, class_idx]
    result = F.softmax(result, dim=1)
    preds = torch.argmax(result, dim=-1)
    confidence = result[0][preds].item()
    
    return seq_len, test_example.label, preds.cpu().item(), confidence

# Function to perform two-shot prediction
def two_shot_prediction(test_example, one_shot_example1, one_shot_example2, prompt_prefix, prompt_suffix, tokenizer, max_rem_len, prompts, model, class_idx, classes, device):
    # Construct the prompt
    prompt = f'{prompt_prefix}Review: {one_shot_example1.text_a}\nSentiment: {classes[one_shot_example1.label]}\n\n'
    prompt += f'Review: {one_shot_example2.text_a}\nSentiment: {classes[one_shot_example2.label]}\n\n'
    prompt += f'Review: {test_example.text_a}\n'
    
    enc = tokenizer.encode_plus(prompt, return_tensors='pt', padding='longest')
    
    # Truncate and pad the input
    for key, enc_value in list(enc.items()):
        enc_value = enc_value[:, :max_rem_len]
        enc[key] = torch.cat([enc_value, prompts[key][:enc_value.shape[0]]], dim=1)
    
    seq_len = enc['input_ids'].shape[1]
    enc = {ky: v.to(device) for ky, v in enc.items()}
    
    # Get model prediction
    with torch.no_grad():
        result = model(**enc).logits
    
    result = result[:, -1, class_idx]
    result = F.softmax(result, dim=1)
    preds = torch.argmax(result, dim=-1)
    confidence = result[0][preds].item()
    
    return seq_len, test_example.label, preds.cpu().item(), confidence

# Function to process a batch of predictions
def pred_batch(splt, prompt_prefix, prompt_suffix, tokenizer, indexes, dataset_dict, sbert_model, tree, max_rem_len, prompts, batch_size, model, class_idx, classes, device):
    test_examples = [dataset_dict[splt][i] for i in indexes if i < len(dataset_dict[splt])]

    all_preds_one_shot = []
    all_labels_one_shot = []
    results_one_shot = []
    seq_lens_one_shot = []

    all_preds_two_shot = []
    all_labels_two_shot = []
    results_two_shot = []
    seq_lens_two_shot = []

    for test_example in test_examples:
        # Get top k similar examples
        top_k_examples = get_top_k_examples(test_example, sbert_model, tree, dataset_dict, k=10)
        
        # One-shot predictions
        for one_shot_example in top_k_examples:
            seq_len, labels, preds, confidence = one_shot_prediction(
                test_example, one_shot_example, prompt_prefix, prompt_suffix, tokenizer, max_rem_len, prompts, model, class_idx, classes, device
            )
            all_labels_one_shot.append(labels)
            all_preds_one_shot.append(preds)
            seq_lens_one_shot.append(seq_len)

            results_one_shot.append({
                'test_instance': test_example.text_a,
                'one_shot_example': one_shot_example.text_a,
                'prediction': preds,
                'correct': 1 if preds == labels else 0,
                'predicted_probability': confidence,
                'sequence_length': seq_len
            })
        
        # Two-shot predictions
        for i in range(len(top_k_examples)):
            for j in range(i+1, len(top_k_examples)):
                one_shot_example1 = top_k_examples[i]
                one_shot_example2 = top_k_examples[j]
                seq_len, labels, preds, confidence = two_shot_prediction(
                    test_example, one_shot_example1, one_shot_example2, prompt_prefix, prompt_suffix, tokenizer, max_rem_len, prompts, model, class_idx, classes, device
                )
                all_labels_two_shot.append(labels)
                all_preds_two_shot.append(preds)
                seq_lens_two_shot.append(seq_len)

                results_two_shot.append({
                    'test_instance': test_example.text_a,
                    'one_shot_example1': one_shot_example1.text_a,
                    'one_shot_example2': one_shot_example2.text_a,
                    'prediction': preds,
                    'correct': 1 if preds == labels else 0,
                    'predicted_probability': confidence,
                    'sequence_length': seq_len
                })

    return all_labels_one_shot, all_preds_one_shot, results_one_shot, all_labels_two_shot, all_preds_two_shot, results_two_shot, seq_lens_one_shot, seq_lens_two_shot

# Function to calculate compatibility
def calculate_compatibility(one_shot_results, two_shot_results):
    compatibility_results = []
    for two_shot in two_shot_results:
        test_instance = two_shot['test_instance']
        example1 = two_shot['one_shot_example1']
        example2 = two_shot['one_shot_example2']
        
        # Find individual one-shot predictions
        pred1 = next(r['correct'] for r in one_shot_results if r['test_instance'] == test_instance and r['one_shot_example'] == example1)
        pred2 = next(r['correct'] for r in one_shot_results if r['test_instance'] == test_instance and r['one_shot_example'] == example2)
        
        # Calculate compatibility
        compatibility = 1 if pred1 == 1 and pred2 == 1 and two_shot['correct'] == 1 else 0
        
        compatibility_results.append({
            'test_instance': test_instance,
            'example1': example1,
            'example2': example2,
            'pred1': pred1,
            'pred2': pred2,
            'pair_pred_AB': two_shot['correct'],
            'compatibility': compatibility
        })
    
    return compatibility_results

# Function to generate triplets with multiple negatives
def generate_triplets_with_multiple_negatives(compatibility_results, num_negatives=5):
    triplets = []
    
    # Group compatibility results by test instance
    grouped_results = {}
    all_examples = set()
    for comp in compatibility_results:
        if comp['test_instance'] not in grouped_results:
            grouped_results[comp['test_instance']] = {'compatible': [], 'incompatible': []}
        
        if comp['compatibility'] == 1:
            grouped_results[comp['test_instance']]['compatible'].append((comp['example1'], comp['example2']))
        else:
            grouped_results[comp['test_instance']]['incompatible'].append((comp['example1'], comp['example2']))
        
        all_examples.add(comp['example1'])
        all_examples.add(comp['example2'])
    
    all_examples = list(all_examples)

    for test_instance, results in grouped_results.items():
        for positive in results['compatible']:
            negatives = []
            
            # First, try to get negatives from incompatible examples
            negatives.extend(random.sample(results['incompatible'], min(num_negatives, len(results['incompatible']))))
            
            # If we don't have enough, add random examples from other queries
            while len(negatives) < num_negatives:
                random_example1 = random.choice(all_examples)
                random_example2 = random.choice(all_examples)
                if (random_example1, random_example2) not in results['compatible'] and (random_example1, random_example2) not in negatives:
                    negatives.append((random_example1, random_example2))
            
            triplets.append({
                'anchor': test_instance,
                'positive': positive,
                'negatives': negatives
            })
    
    return triplets

def main():
    # Configurations
    dataset = 'sst2'
    model_name = 'meta-llama/llama-2-7b-hf'
    single_precision = True
    gpu_id = 0
    train_datapath = 'train.tsv'
    test_datapath = 'test.tsv'
    classes = ['negative', 'positive']
    classes_in_data = ['0', '1']
    prompt_prefix = 'Your task is to judge whether the sentiment of a movie review is positive or negative.\n'
    prompt_suffix = 'Sentiment: '
    batch_size = 10
    
    # Seeds and device setup
    random.seed(42)
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.cuda.set_device(gpu_id)
    device = torch.device('cuda:'+str(gpu_id) if torch.cuda.is_available() else 'cpu')
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    
    # Load model and tokenizer
    model, tokenizer = load_model_tokenizer(model_name, single_precision)
    model.to(device)
    model.eval()

    # Get indices of class label words in the vocab
    class_idx = tuple([tokenizer.encode(clas, add_special_tokens=False)[0] for clas in classes])

    # Load dataset
    dataset_dict = load_dataset(train_datapath, test_datapath, classes_in_data)

    # Model for getting similar demonstrations
    sbert_model, tree = get_kdtree(dataset_dict)

    # Prepare prompts
    prompts = tokenizer.batch_encode_plus([prompt_suffix for _ in range(batch_size)], return_tensors='pt', padding='longest', add_special_tokens=False)
    max_rem_len = model.config.max_position_embeddings - prompts['input_ids'].shape[1]

    all_preds_one_shot = list()
    all_labels_one_shot = list()
    all_results_one_shot = list()
    all_seq_lens_one_shot = list()

    all_preds_two_shot = list()
    all_labels_two_shot = list()
    all_results_two_shot = list()
    all_seq_lens_two_shot = list()

    num_examples = len(dataset_dict['test'])
    for start_idx in tqdm(range(0, num_examples, batch_size)):
        end_idx = min(start_idx + batch_size, num_examples)
        indexes = range(start_idx, end_idx)
        labels_one_shot, preds_one_shot, results_one_shot, labels_two_shot, preds_two_shot, results_two_shot, seq_lens_one_shot, seq_lens_two_shot = pred_batch(
            'test', prompt_prefix, prompt_suffix, tokenizer, indexes, dataset_dict, sbert_model, tree, max_rem_len, prompts, batch_size, model, class_idx, classes, device
        )
        
        all_preds_one_shot.extend(preds_one_shot)
        all_labels_one_shot.extend(labels_one_shot)
        all_results_one_shot.extend(results_one_shot)
        all_seq_lens_one_shot.extend(seq_lens_one_shot)

        all_preds_two_shot.extend(preds_two_shot)
        all_labels_two_shot.extend(labels_two_shot)
        all_results_two_shot.extend(results_two_shot)
        all_seq_lens_two_shot.extend(seq_lens_two_shot)

    # One-shot results
    report_one_shot = classification_report(all_labels_one_shot, all_preds_one_shot, digits=4)
    print('One-shot Classification Report:')
    print(report_one_shot)

    results_df_one_shot = pd.DataFrame(all_results_one_shot)
    results_df_one_shot.to_csv('one_shot_results.csv', index=False)

    # Two-shot results
    report_two_shot = classification_report(all_labels_two_shot, all_preds_two_shot, digits=4)
    print('Two-shot Classification Report:')
    print(report_two_shot)

    results_df_two_shot = pd.DataFrame(all_results_two_shot)
    results_df_two_shot.to_csv('two_shot_results.csv', index=False)

    # Calculate sequence length statistics
    all_seq_lens = all_seq_lens_one_shot + all_seq_lens_two_shot
    max_seq_len = max(all_seq_lens)
    avg_seq_len = sum(all_seq_lens) / len(all_seq_lens)
    percentage_greater_than_avg = sum(1 for x in all_seq_lens if x > avg_seq_len) / len(all_seq_lens) * 100

    print(f"Max sequence length: {max_seq_len}")
    print(f"Average sequence length: {avg_seq_len:.2f}")
    print(f"Percentage of sequences greater than average length: {percentage_greater_than_avg:.2f}%")

    # Calculate compatibility
    compatibility_results = calculate_compatibility(all_results_one_shot, all_results_two_shot)

    # Save compatibility results to a CSV file
    compatibility_df = pd.DataFrame(compatibility_results)
    compatibility_df.to_csv('compatibility_results.csv', index=False)
    triplets = generate_triplets_with_multiple_negatives(compatibility_results, num_negatives=5)

    print(f"Generated {len(triplets)} triplets")

    # Save triplets to a JSON file
    with open('triplets.json', 'w') as f:
        json.dump(triplets, f)

    # Print a sample triplet
    if triplets:
        print("Sample triplet:")
        print(json.dumps(triplets[0], indent=2))
    else:
        print("No triplets generated.")

if __name__ == "__main__":
    main()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Length of train set:  6920
Length of test set 1821
Train example at 0th index:  {
  "guid": "train.tsv-0",
  "label": 1,
  "text_a": "a stirring , funny and finally transporting re-imagining of beauty and the beast and 1930s horror films",
  "text_b": ""
}



  0%|          | 0/183 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 183/183 [1:18:54<00:00, 25.87s/it]


One-shot Classification Report:
              precision    recall  f1-score   support

           0     0.5721    0.6352    0.6020      9120
           1     0.5885    0.5234    0.5541      9090

    accuracy                         0.5794     18210
   macro avg     0.5803    0.5793    0.5780     18210
weighted avg     0.5803    0.5794    0.5781     18210

Two-shot Classification Report:
              precision    recall  f1-score   support

           0     0.9224    0.7924    0.8525     41040
           1     0.8175    0.9332    0.8715     40905

    accuracy                         0.8627     81945
   macro avg     0.8700    0.8628    0.8620     81945
weighted avg     0.8701    0.8627    0.8620     81945

Max sequence length: 239
Average sequence length: 119.26
Percentage of sequences greater than average length: 49.47%
Generated 30510 triplets
Sample triplet:
{
  "anchor": "no movement , no yuks , not much of anything .",
  "positive": [
    "a moving , if uneven , success .",
    

In [2]:
with open('triplets.json', 'r') as f:
  data = json.load(f)

data = pd.DataFrame(data)

In [3]:
import tqdm
from torch.utils.data import DataLoader, Dataset
#from datasets import Dataset

from sentence_transformers import InputExample, LoggingHandler, SentenceTransformer, losses, models, util

import json
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

In [4]:
data = pd.DataFrame(data)
data = data.explode('negatives')


data[['positive1', 'positive2']]  = pd.DataFrame(data['positive'].tolist(), index= data.index)
data[['negative1', 'negative2']] = pd.DataFrame(data['negatives'].tolist(), index= data.index)

data = data.drop('positive', axis=1)
data = data.drop('negatives', axis=1)
data = data.drop_duplicates()
data = data.reset_index(drop=True)
data

Unnamed: 0,anchor,positive1,positive2,negative1,negative2
0,"no movement , no yuks , not much of anything .","a moving , if uneven , success .",two big things are missing -- anything approac...,darkly funny and frequently insightful .,"... this story gets sillier , not scarier , as..."
1,"no movement , no yuks , not much of anything .","a moving , if uneven , success .",two big things are missing -- anything approac...,clint eastwood 's blood work is a lot like a w...,reign of fire has the disadvantage of also loo...
2,"no movement , no yuks , not much of anything .","a moving , if uneven , success .",two big things are missing -- anything approac...,not all of the stories work and the ones that ...,none of this so-called satire has any sting to...
3,"no movement , no yuks , not much of anything .","a moving , if uneven , success .",two big things are missing -- anything approac...,the riveting performances by the incredibly fl...,"gets the look and the period trappings right ,..."
4,"no movement , no yuks , not much of anything .","a moving , if uneven , success .",two big things are missing -- anything approac...,whether or not ram dass proves as clear and re...,-lrb- breheny 's -rrb- lensing of the new zeal...
...,...,...,...,...,...
152421,"but here 's the real damn : it is n't funny , ...","it 's never laugh-out-loud funny , but it is f...","it 's funny , as the old saying goes , because...",the problem with all of this : it 's not reall...,"it 's mildly amusing , but i certainly ca n't ..."
152422,"but here 's the real damn : it is n't funny , ...","it 's never laugh-out-loud funny , but it is f...","it 's funny , as the old saying goes , because...","none of this is very original , and it is n't ...",and it 's not that funny -- which is just gene...
152423,"but here 's the real damn : it is n't funny , ...","it 's never laugh-out-loud funny , but it is f...","it 's funny , as the old saying goes , because...",but one thing 's for sure : it never comes clo...,"it 's mildly amusing , but i certainly ca n't ..."
152424,"but here 's the real damn : it is n't funny , ...","it 's never laugh-out-loud funny , but it is f...","it 's funny , as the old saying goes , because...",less funny than it should be and less funny th...,"it 's mildly amusing , but i certainly ca n't ..."


In [5]:
train_batch_size = 64
max_seq_length = 300
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
max_passages = 0
num_epochs = 10
pooling = "mean"
negs_to_use = None
warmup_steps = 1000
lr = 2e-5
num_negs_per_system = 5
use_pre_trained_model = False
use_all_queries = False
ce_score_margin = 3.0
model_save_path = 'output/training_triplets'

In [6]:
model = SentenceTransformer(model_name)
model.max_seq_length = max_seq_length

In [7]:
class MSMARCODataset(Dataset):
    def __init__(self, data, tokenizer):
      self.data = data.reset_index(drop=True)
      self.tokenizer = tokenizer

    def __getitem__(self, item):
      row = self.data.iloc[item]
      query = row.anchor
      pos_text = row.positive1 + self.tokenizer.sep_token + row.positive2
      neg_text = row.negative1 + self.tokenizer.sep_token + row.negative2
      return InputExample(texts=[query, pos_text, neg_text])

    def __len__(self):
        return len(self.data)


train_dataset = MSMARCODataset(data, model.tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    use_amp=True,
    checkpoint_path=model_save_path,
    checkpoint_save_steps=len(train_dataloader),
    optimizer_params={"lr": lr},
)

# Save the model
model.save(model_save_path)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,1.2609
1000,1.0557
1500,0.9376
2000,0.8281
2500,0.7201
3000,0.6236
3500,0.5443
4000,0.472
4500,0.4111
5000,0.361


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [1]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from scipy.spatial import KDTree
from itertools import combinations
from tqdm import tqdm
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn import functional as F
from sklearn.metrics import classification_report

# Function to create KDTree from training dataset
def get_kdtree(dataset):
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    train_sentences = dataset['review'].tolist()
    train_embeddings = sbert_model.encode(train_sentences)
    tree = KDTree(train_embeddings)
    return sbert_model, tree, train_embeddings

# Load the trained SentenceTransformer model
sentence_model = SentenceTransformer('output/training_triplets')

# Load the LLaMA-2 model for two-shot prediction
llama_model_name = 'meta-llama/llama-2-7b-hf'
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name, torch_dtype=torch.float16)
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_tokenizer.add_special_tokens({'pad_token': '<PAD>'})
llama_model.resize_token_embeddings(len(llama_tokenizer))
llama_model.config.pad_token_id = llama_tokenizer.pad_token_id
llama_model.to('cuda')
llama_model.eval()

# Define the column names
column_names = ['review', 'prediction']

# Load the training data and test data with specified column names
train_data = pd.read_csv('train.tsv', sep='\t', names=column_names)
test_data = pd.read_csv('test.tsv', sep='\t', names=column_names)

# Check the columns of the DataFrame
print(test_data.columns)

# Function to generate all possible pairs
def generate_pairs(neighbors):
    return list(combinations(neighbors, 2))

# Function to predict compatibility scores
def predict_compatibility(model, query, pair):
    query_embedding = model.encode(query, convert_to_tensor=True)
    pair_embeddings = model.encode(pair, convert_to_tensor=True)
    cos_sim = util.pytorch_cos_sim(query_embedding, pair_embeddings)
    return cos_sim.mean().item()

# Function for two-shot prediction  
def two_shot_prediction(test_example, example1, example2, prompt_prefix, prompt_suffix, tokenizer, model, class_idx, classes):
    prompt = f'{prompt_prefix}Review: {example1}\nSentiment: {classes[0]}\n\n'
    prompt += f'Review: {example2}\nSentiment: {classes[1]}\n\n'
    prompt += f'Review: {test_example}\n'
    
    enc = tokenizer.encode_plus(prompt, return_tensors='pt', padding='longest')
    enc = {k: v.to('cuda') for k, v in enc.items()}
    
    with torch.no_grad():
        result = model(**enc).logits
    
    result = result[:, -1, class_idx]
    result = F.softmax(result, dim=1)
    pred = torch.argmax(result, dim=-1)
    confidence = result[0][pred].item()
    
    return pred.cpu().item(), confidence

# Main testing process
def test_model(test_data, train_data, sentence_model, llama_model, llama_tokenizer, k=5):
    results = []
    
    print("Encoding test reviews...")
    test_embeddings = sentence_model.encode(test_data['review'].tolist(), show_progress_bar=True)
    
    print("Creating KDTree for training data...")
    train_sbert_model, kd_tree, train_embeddings = get_kdtree(train_data)
    
    prompt_prefix = 'Your task is to judge whether the sentiment of a movie review is positive or negative.\n'
    prompt_suffix = 'Sentiment: '
    classes = ['negative', 'positive']
    class_idx = tuple([llama_tokenizer.encode(clas, add_special_tokens=False)[0] for clas in classes])
    
    print("Processing test examples...")
    all_preds = []
    all_labels = []
    for idx, row in tqdm(test_data.iterrows(), total=len(test_data)):
        query = row['review']
        query_embedding = sentence_model.encode(query)
        _, neighbor_indices = kd_tree.query(query_embedding, k=k)
        neighbors = train_data.iloc[neighbor_indices]['review'].tolist()
        
        pairs = generate_pairs(neighbors)
        scores = []
        for pair in pairs:
            score = predict_compatibility(sentence_model, query, list(pair))
            scores.append((pair, score))
        
        sorted_pairs = sorted(scores, key=lambda x: x[1], reverse=True)
        top_pair, top_score = sorted_pairs[0]
        
        pred, confidence = two_shot_prediction(
    query, 
    top_pair[0], 
    top_pair[1],
    prompt_prefix, 
    prompt_suffix, 
    llama_tokenizer, 
    llama_model, 
    class_idx, 
    classes
        )
        
        results.append({
            'query': query,
            'original_prediction': row['prediction'],
            'top_example_1': top_pair[0],
            'top_example_2': top_pair[1],
            'compatibility_score': top_score,
            'two_shot_prediction': pred,
            'confidence': confidence
        })
        
        all_preds.append(pred)
        all_labels.append(int(row['prediction']))
    
    return pd.DataFrame(results), all_preds, all_labels

# Run the testing process
results_df, all_preds, all_labels = test_model(test_data, train_data, sentence_model, llama_model, llama_tokenizer)

# Save results
results_df.to_csv('test_results_with_predictions.csv', index=False)
print("Testing completed. Results saved to 'test_results_with_predictions.csv'")

# Generate and print classification report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['negative', 'positive']))

# Print some statistics
print("\nTop 5 most compatible pairs:")
print(results_df.nlargest(5, 'compatibility_score')[['query', 'top_example_1', 'top_example_2', 'compatibility_score', 'two_shot_prediction', 'confidence']])

print("\nBottom 5 least compatible pairs:")
print(results_df.nsmallest(5, 'compatibility_score')[['query', 'top_example_1', 'top_example_2', 'compatibility_score', 'two_shot_prediction', 'confidence']])

print(f"\nAverage compatibility score: {results_df['compatibility_score'].mean():.4f}")
print(f"Average confidence: {results_df['confidence'].mean():.4f}")

  from tqdm.autonotebook import tqdm, trange


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Index(['review', 'prediction'], dtype='object')
Encoding test reviews...


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Creating KDTree for training data...
Processing test examples...


  0%|          | 0/1821 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 1821/1821 [05:04<00:00,  5.98it/s]


Testing completed. Results saved to 'test_results_with_predictions.csv'

Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.59      0.71       912
    positive       0.69      0.91      0.79       909

    accuracy                           0.75      1821
   macro avg       0.78      0.75      0.75      1821
weighted avg       0.78      0.75      0.75      1821


Top 5 most compatible pairs:
                                                  query  \
1450                                     what 's next ?   
152                                               no. .   
234       a smart , sweet and playful romantic comedy .   
1551  an instant candidate for worst movie of the ye...   
874                                a very funny movie .   

                                          top_example_1  \
1450                                     what 's next ?   
152                                               no. .   
234   a smart ,

In [8]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from scipy.spatial import KDTree
from itertools import combinations
from tqdm import tqdm
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn import functional as F
from sklearn.metrics import classification_report

# Function to create KDTree from training dataset
def get_kdtree(dataset):
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    train_sentences = dataset['review'].tolist()
    train_embeddings = sbert_model.encode(train_sentences)
    tree = KDTree(train_embeddings)
    return sbert_model, tree, train_embeddings

# Load the trained SentenceTransformer model
sentence_model = SentenceTransformer('output/training_triplets')

# Load the LLaMA-2 model for two-shot prediction
llama_model_name = 'meta-llama/llama-2-7b-hf'
llama_model = AutoModelForCausalLM.from_pretrained(llama_model_name, torch_dtype=torch.float16)
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
llama_tokenizer.add_special_tokens({'pad_token': '<PAD>'})
llama_model.resize_token_embeddings(len(llama_tokenizer))
llama_model.config.pad_token_id = llama_tokenizer.pad_token_id
llama_model.to('cuda')
llama_model.eval()

# Define the column names
column_names = ['review', 'prediction']

# Load the training data and test data with specified column names
train_data = pd.read_csv('train.tsv', sep='\t', names=column_names)
test_data = pd.read_csv('test.tsv', sep='\t', names=column_names)

# Check the columns of the DataFrame
print(test_data.columns)

# Function to generate all possible pairs
def generate_pairs(neighbors):
    return list(combinations(neighbors, 2))

# Function to predict compatibility scores
def predict_compatibility(model, query, pair):
    query_embedding = model.encode(query, convert_to_tensor=True)
    pair_embeddings = model.encode(pair, convert_to_tensor=True)
    cos_sim = util.pytorch_cos_sim(query_embedding, pair_embeddings)
    return cos_sim.mean().item()

# Function for two-shot prediction  
def two_shot_prediction(test_example, example1, sentiment1, example2, sentiment2, prompt_prefix, prompt_suffix, tokenizer, model, class_idx, classes):
    prompt = f'{prompt_prefix}Review: {example1}\nSentiment: {sentiment1}\n\n'
    prompt += f'Review: {example2}\nSentiment: {sentiment2}\n\n'
    prompt += f'Review: {test_example}\n'
    
    enc = tokenizer.encode_plus(prompt, return_tensors='pt', padding='longest')
    enc = {k: v.to('cuda') for k, v in enc.items()}
    
    with torch.no_grad():
        result = model(**enc).logits
    
    result = result[:, -1, class_idx]
    result = F.softmax(result, dim=1)
    pred = torch.argmax(result, dim=-1)
    confidence = result[0][pred].item()
    
    return pred.cpu().item(), confidence

def test_model_our_method(test_data, train_data, sentence_model, llama_model, llama_tokenizer, k=5):
    results = []
    
    print("Encoding test reviews...")
    test_embeddings = sentence_model.encode(test_data['review'].tolist(), show_progress_bar=True)
    
    print("Creating KDTree for training data...")
    train_sbert_model, kd_tree, train_embeddings = get_kdtree(train_data)
    
    prompt_prefix = 'Your task is to judge whether the sentiment of a movie review is positive or negative.\n'
    prompt_suffix = 'Sentiment: '
    classes = ['negative', 'positive']
    class_idx = tuple([llama_tokenizer.encode(clas, add_special_tokens=False)[0] for clas in classes])
    
    print("Processing test examples...")
    all_preds = []
    all_labels = []
    for idx, row in tqdm(test_data.iterrows(), total=len(test_data)):
        query = row['review']
        query_embedding = sentence_model.encode(query)
        _, neighbor_indices = kd_tree.query(query_embedding, k=k)
        neighbors = train_data.iloc[neighbor_indices]['review'].tolist()
        
        pairs = generate_pairs(neighbors)
        scores = []
        for pair in pairs:
            score = predict_compatibility(sentence_model, query, list(pair))
            scores.append((pair, score))
        
        sorted_pairs = sorted(scores, key=lambda x: x[1], reverse=True)
        top_pair, top_score = sorted_pairs[0]
        
        sentiment1 = train_data.loc[train_data['review'] == top_pair[0], 'prediction'].iloc[0]
        sentiment2 = train_data.loc[train_data['review'] == top_pair[1], 'prediction'].iloc[0]
        
        pred, confidence = two_shot_prediction(
    query, 
    top_pair[0], train_data.loc[train_data['review'] == top_pair[0], 'prediction'].iloc[0],
    top_pair[1], train_data.loc[train_data['review'] == top_pair[1], 'prediction'].iloc[0],
    prompt_prefix, 
    prompt_suffix, 
    llama_tokenizer, 
    llama_model, 
    class_idx, 
    classes
)
        
        results.append({
            'query': query,
            'original_prediction': row['prediction'],
            'top_example_1': top_pair[0],
            'top_example_2': top_pair[1],
            'compatibility_score': top_score,
            'two_shot_prediction': pred,
            'confidence': confidence
        })
        
        all_preds.append(pred)
        all_labels.append(int(row['prediction']))
    
    return pd.DataFrame(results), all_preds, all_labels

# Baseline method: random selection of examples
def test_model_baseline(test_data, train_data, llama_model, llama_tokenizer):
    results = []
    
    prompt_prefix = 'Your task is to judge whether the sentiment of a movie review is positive or negative.\n'
    prompt_suffix = 'Sentiment: '
    classes = ['negative', 'positive']
    class_idx = tuple([llama_tokenizer.encode(clas, add_special_tokens=False)[0] for clas in classes])
    
    print("Processing test examples (Baseline)...")
    all_preds = []
    all_labels = []
    for idx, row in tqdm(test_data.iterrows(), total=len(test_data)):
        query = row['review']
        
        # Randomly select two examples from the training set
        random_examples = train_data.sample(n=2)
        example1, sentiment1 = random_examples.iloc[0]['review'], random_examples.iloc[0]['prediction']
        example2, sentiment2 = random_examples.iloc[1]['review'], random_examples.iloc[1]['prediction']
        
        pred, confidence = two_shot_prediction(
            query, 
            example1, sentiment1,
            example2, sentiment2,
            prompt_prefix, 
            prompt_suffix, 
            llama_tokenizer, 
            llama_model, 
            class_idx, 
            classes
        )
        
        results.append({
            'query': query,
            'original_prediction': row['prediction'],
            'random_example_1': example1,
            'random_example_2': example2,
            'two_shot_prediction': pred,
            'confidence': confidence
        })
        
        all_preds.append(pred)
        all_labels.append(int(row['prediction']))
    
    return pd.DataFrame(results), all_preds, all_labels

# Main execution
if __name__ == "__main__":
    # Load data, models, etc. (as in the previous code)
    
    # Run our method
    print("Running our method...")
    results_df_our, all_preds_our, all_labels_our = test_model_our_method(test_data, train_data, sentence_model, llama_model, llama_tokenizer)
    
    # Run baseline method
    print("Running baseline method...")
    results_df_baseline, all_preds_baseline, all_labels_baseline = test_model_baseline(test_data, train_data, llama_model, llama_tokenizer)
    
    # Save results
    results_df_our.to_csv('test_results_our_method.csv', index=False)
    results_df_baseline.to_csv('test_results_baseline.csv', index=False)
    
    # Generate and print classification reports
    print("\nClassification Report (Our Method):")
    report_our = classification_report(all_labels_our, all_preds_our, target_names=['negative', 'positive'])
    print(report_our)
    
    print("\nClassification Report (Baseline):")
    report_baseline = classification_report(all_labels_baseline, all_preds_baseline, target_names=['negative', 'positive'])
    print(report_baseline)
    
    # Calculate and print accuracies
    accuracy_our = sum(p == l for p, l in zip(all_preds_our, all_labels_our)) / len(all_labels_our)
    accuracy_baseline = sum(p == l for p, l in zip(all_preds_baseline, all_labels_baseline)) / len(all_labels_baseline)
    
    print(f"\nAccuracy (Our Method): {accuracy_our:.4f}")
    print(f"Accuracy (Baseline): {accuracy_baseline:.4f}")
    
    # Print some statistics for our method
    print("\nOur Method - Top 5 most compatible pairs:")
    print(results_df_our.nlargest(5, 'compatibility_score')[['query', 'top_example_1', 'top_example_2', 'compatibility_score', 'two_shot_prediction', 'confidence']])
    
    print(f"\nOur Method - Average compatibility score: {results_df_our['compatibility_score'].mean():.4f}")
    print(f"Our Method - Average confidence: {results_df_our['confidence'].mean():.4f}")
    print(f"Baseline - Average confidence: {results_df_baseline['confidence'].mean():.4f}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Index(['review', 'prediction'], dtype='object')
Running our method...
Encoding test reviews...


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Creating KDTree for training data...
Processing test examples...


100%|██████████| 1821/1821 [05:17<00:00,  5.73it/s]


Running baseline method...
Processing test examples (Baseline)...


100%|██████████| 1821/1821 [01:28<00:00, 20.69it/s]



Classification Report (Our Method):
              precision    recall  f1-score   support

    negative       0.87      0.59      0.70       912
    positive       0.69      0.91      0.78       909

    accuracy                           0.75      1821
   macro avg       0.78      0.75      0.74      1821
weighted avg       0.78      0.75      0.74      1821


Classification Report (Baseline):
              precision    recall  f1-score   support

    negative       0.80      0.41      0.55       912
    positive       0.60      0.90      0.72       909

    accuracy                           0.66      1821
   macro avg       0.70      0.66      0.63      1821
weighted avg       0.70      0.66      0.63      1821


Accuracy (Our Method): 0.7501
Accuracy (Baseline): 0.6557

Our Method - Top 5 most compatible pairs:
                     query   top_example_1                      top_example_2  \
1450        what 's next ?  what 's next ?                     what 's next ?   
152       