In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

In [2]:
from datasets import load_dataset

# Fetch the Open-Orca GPT-4 dataset; the 'train' split is consumed below.
dataset = load_dataset(path='Open-Orca/1million-gpt-4')

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/994896 [00:00<?, ? examples/s]

In [3]:
# Predicate used to filter out short responses
def filter_short_responses(example, min_words=100):
    """Return True when the example's response is long enough to keep.

    Args:
        example: Mapping with a 'response' entry holding the response text.
        min_words: Minimum number of whitespace-separated tokens required.
            Defaults to 100, the cutoff that was previously hard-coded.

    Returns:
        bool: True if the response contains at least ``min_words`` tokens.
        A ``None`` (or empty) response counts as zero words, so it is
        filtered out instead of raising ``AttributeError`` on ``.split()``.
    """
    response = example['response'] or ''
    return len(response.split()) >= min_words

# Keep only the training rows whose response passes the length predicate.
filtered_dataset = dataset['train'].filter(function=filter_short_responses)

# Report how many rows survived the filter.
print(f"Number of examples after filtering: {len(filtered_dataset)}")

Filter:   0%|          | 0/994896 [00:00<?, ? examples/s]

Number of examples after filtering: 399348


In [20]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch

# Sentence encoder used to embed each response for similarity comparison.
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Cosine-similarity cutoff passed to deduplicate_batch when it is called below.
threshold = 0.95

# Running collections of accepted examples and their embeddings; both start empty.
unique_responses, unique_embeddings = [], []

def deduplicate_batch(batch, threshold=0.95):
    """Drop near-duplicate responses from one batch of examples.

    Every response is embedded with the module-level ``embedding_model``. An
    example is kept only if its cosine similarity is strictly below
    ``threshold`` against

    * every embedding already stored in the module-level ``unique_embeddings``
      list, and
    * every example kept earlier in this same batch.

    The second check was previously missing, and no check at all ran while
    ``unique_embeddings`` was empty, so the first batch was never deduplicated.

    This function does not modify ``unique_embeddings``; the caller is
    expected to extend the global lists with the returned values.

    Args:
        batch: Iterable of examples, each indexable by ``'response'``.
        threshold: Similarity at or above which two responses are treated as
            duplicates.

    Returns:
        tuple[list, list]: ``(kept_examples, kept_embeddings)`` as parallel
        lists in original batch order; ``kept_embeddings`` holds one tensor
        row per kept example. Both are empty for an empty batch.
    """
    batch = list(batch)
    if not batch:
        # Nothing to encode; avoid calling the model with an empty input.
        return [], []

    texts = [example['response'] for example in batch]
    embeddings = embedding_model.encode(texts, convert_to_tensor=True)

    # Flag the rows that are not near-duplicates of anything accepted so far
    # (one batched similarity call instead of one call per row).
    if len(unique_embeddings) > 0:
        unique_embeddings_tensor = torch.stack(unique_embeddings)
        prior_sims = util.pytorch_cos_sim(embeddings, unique_embeddings_tensor)
        novel = torch.all(prior_sims < threshold, dim=1).tolist()
    else:
        novel = [True] * len(batch)

    # Pairwise similarities inside the batch, computed once up front.
    intra_sims = util.pytorch_cos_sim(embeddings, embeddings)

    # Greedy pass: the first occurrence wins, later near-duplicates are dropped.
    kept_indices = []
    for idx in range(len(batch)):
        if not novel[idx]:
            continue
        if kept_indices and not torch.all(intra_sims[idx, kept_indices] < threshold):
            continue
        kept_indices.append(idx)

    local_unique_responses = [batch[idx] for idx in kept_indices]
    local_unique_embeddings = [embeddings[idx] for idx in kept_indices]
    return local_unique_responses, local_unique_embeddings

# Take the first batch yielded by process_in_batches (requested batch_size=1000).
# NOTE(review): process_in_batches is not defined in the cells visible here —
# confirm it is defined before this cell runs on a fresh kernel.
first_batch = next(process_in_batches(filtered_dataset, batch_size=1000))

# Run the near-duplicate filter over that batch.
local_unique_responses, local_unique_embeddings = deduplicate_batch(first_batch, threshold=threshold)

# Fold this batch's survivors into the running global collections.
unique_responses += local_unique_responses
unique_embeddings += local_unique_embeddings

# Report how many examples remain after deduplication.
print(f"Number of unique examples in the first batch after deduplication: {len(unique_responses)}")

# Spot-check the first five kept examples.
for example in unique_responses[:5]:
    print(f"Source: {example['system_prompt']} {example['question']}")
    print(f"Response: {example['response']}\n")


Number of unique examples in the first batch after deduplication: 1000
Source: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps. Multi-choice question: What is the sentiment of the following tweet?
Tweet: @nikkigreen I told you 
Choose your answer from:
 + negative;
 + positive;
Response: To determine the sentiment of the tweet, we need to analyze it thoroughly.

Tweet: @nikkigreen I told you

Step 1: Identify the words or phrases that carry emotional weight.
In this tweet, there is only one phrase worth examining: "I told you."

Step 2: Determine the sentiment of the identified words or phrases.
"I told you" can carry a variety of sentiments, depending on the context. It could be positive, negative, or neutral.

Step 3: Consider the overall context of the tweet.
Unfortunately, without more context, it is impossible to determine the exact sentiment of t