## Finding the Training Dataset:

Link: https://huggingface.co/datasets/Open-Orca/OpenOrca

Resources:

* https://huggingface.co/docs/hub/datasets-usage
* https://docs.wandb.ai/guides/integrations/huggingface
* https://mlabonne.github.io/blog/notes/Large%20Language%20Models/orca.html

In [1]:
from datasets import load_dataset

# Load OpenOrca by its Hub id; the printed summary in the next cell shows that
# the result is a DatasetDict with a single `train` split.
dataset = load_dataset("Open-Orca/OpenOrca")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the available splits, their column names and their row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'system_prompt', 'question', 'response'],
        num_rows: 4233923
    })
})


## Filter and Truncate Responses Based on Token Count:

In [3]:
from transformers import AutoTokenizer

# GPT-2's tokenizer is used by the cells below to count and truncate response tokens.
# NOTE(review): the fine-tuning cell later loads a different ("mistral") tokenizer,
# so these GPT-2 token counts presumably differ from what the fine-tuned model
# will see -- confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [4]:
MAX_LENGTH = 1024  # Maximum number of response tokens kept per example
MIN_TOKENS = 100   # Responses with fewer tokens than this are discarded


def filter_and_tokenize(data, min_tokens=MIN_TOKENS, max_length=MAX_LENGTH, tok=None):
    """Keep examples with long-enough responses and cap the response length.

    Args:
        data: Iterable of dict-like records, each with a 'response' string.
        min_tokens: Minimum number of tokens a response needs to be kept.
        max_length: Maximum number of tokens kept per response; longer
            responses are cut to their first `max_length` tokens.
        tok: Object exposing `tokenize(text)` and
            `convert_tokens_to_string(tokens)`. Defaults to the notebook-level
            `tokenizer`.

    Returns:
        A list of new dicts, in input order. The input records are never
        modified, and a response is only rewritten when it actually had to be
        truncated, so short-enough text is passed through unchanged.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer defined in an earlier cell

    filtered_data = []
    for item in data:
        tokens = tok.tokenize(item['response'])
        if len(tokens) < min_tokens:
            continue

        # Copy so the caller's record is left untouched.
        kept = dict(item)
        if len(tokens) > max_length:
            kept['response'] = tok.convert_tokens_to_string(tokens[:max_length])
        filtered_data.append(kept)
    return filtered_data

# Keep only the train-split examples whose response has at least 100 tokens,
# cutting longer responses down to MAX_LENGTH tokens. The result is a plain
# Python list of records.
filtered_train = filter_and_tokenize(dataset['train'])

Token indices sequence length is longer than the specified maximum sequence length for this model (1387 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so that Restart & Run All reproduces the same 80/20 split.
train_data, valid_data = train_test_split(filtered_train, test_size=0.2, random_state=42)

print(f"Train data size: {len(train_data)}")
print(f"Validation data size: {len(valid_data)}")

Train data size: 1271780
Validation data size: 317945


## Data Deduplication Using Cosine Similarity:

In [6]:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [7]:
def deduplicate(data, threshold=0.95):
    """Greedily drop near-duplicate responses using bag-of-words cosine similarity.

    Records are visited in order. Each record that has not already been marked
    as a duplicate is kept, and every record whose cosine similarity to it is
    strictly greater than `threshold` is marked as a duplicate of it.

    Args:
        data: Sequence of records (indexable by position), each with a
            'response' string.
        threshold: Cosine-similarity cut-off above which two responses are
            treated as duplicates.

    Returns:
        A list with the kept records, in their original order. An empty input
        yields an empty list.

    Note:
        One sparse row-by-matrix product is still computed per kept record, so
        the worst case remains quadratic in the number of records.
    """
    if len(data) == 0:
        return []

    responses = [item['response'] for item in data]
    counts = CountVectorizer().fit_transform(responses)

    # L2-normalise the rows once: cosine similarity then reduces to a dot
    # product, instead of re-normalising the whole matrix on every iteration.
    unit_rows = normalize(counts, norm='l2', axis=1)
    # Transpose once up front rather than inside the loop.
    unit_cols = unit_rows.T.tocsr()

    n_rows = unit_rows.shape[0]
    is_duplicate = np.zeros(n_rows, dtype=bool)
    unique_data = []

    for idx in range(n_rows):
        if is_duplicate[idx]:
            continue

        similarities = (unit_rows[idx] @ unit_cols).toarray().ravel()
        # Also marks idx itself (similarity ~1) unless its count vector is all zeros.
        is_duplicate |= similarities > threshold
        unique_data.append(data[idx])

    return unique_data

# Remove near-duplicate responses from each split independently.
# NOTE(review): because the split happens before deduplication and the two
# splits are processed separately, a near-duplicate pair with one member in
# train_data and the other in valid_data is not detected here.
unique_train = deduplicate(train_data)
unique_valid = deduplicate(valid_data)

## Plot the Token Distribution Graph:

In [None]:
import matplotlib.pyplot as plt

In [None]:
def plot_token_distribution(data):
    """Draw a 50-bin histogram of how many tokens each response contains.

    Every record's 'response' is tokenized with the notebook-level
    `tokenizer`; the resulting lengths are plotted with pyplot and the figure
    is shown immediately.
    """
    lengths = []
    for record in data:
        lengths.append(len(tokenizer.tokenize(record['response'])))

    plt.hist(lengths, bins=50)
    plt.title('Token Distribution')
    plt.xlabel('Number of Tokens')
    plt.ylabel('Frequency')
    plt.show()

# Histogram of response token counts for the deduplicated training split.
plot_token_distribution(unique_train)

## Publish the Dataset on Hugging Face:

## Fine-tuning Mistral Using the New Dataset:

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

In [None]:
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling

# "mistral" on its own is not a Hub repository id; use the full id of the base
# checkpoint. NOTE(review): access to this repository may require accepting the
# model terms on the Hub -- confirm for your account.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
# NOTE: this rebinds the notebook-level `tokenizer` (GPT-2 until this cell), so
# earlier cells re-run after this one will use the Mistral tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    # Batches have to be padded; fall back to EOS when no pad token is defined.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of responses, truncating each one to MAX_LENGTH tokens.

    Padding is deliberately left to the data collator so that each batch is
    only padded to its own longest sequence.
    """
    return tokenizer(examples['response'], truncation=True, max_length=MAX_LENGTH)


# Train on the filtered, deduplicated splits built above. The raw `dataset`
# only has a "train" split, so indexing it with "validation" would fail.
splits = DatasetDict({
    "train": Dataset.from_list(unique_train),
    "validation": Dataset.from_list(unique_valid),
})
tokenized_datasets = splits.map(
    tokenize_function,
    batched=True,
    # Drop the raw text columns; only the tokenizer outputs are fed to the model.
    remove_columns=splits["train"].column_names,
)

# Causal-LM collator: pads each batch and derives `labels` from `input_ids`,
# which the Trainer needs in order to compute a loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()

## Integrating Weights and Biases for Monitoring:

In [None]:
import wandb

## Pushing the Merged Model to Hugging Face: