<a href="https://colab.research.google.com/github/MohammedFarzin/Fine-tuning-llms/blob/main/Fine_tuning_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load and analyze the dataset


In [1]:
# Install libraries
! pip install transformers faiss-gpu datasets sentence_transformers --progress-bar off

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_

In [None]:
# Read the value stored under the name 'huggingface' via Colab's `userdata` helper.
# NOTE(review): presumably a Hugging Face access token; it is not referenced by
# the cells visible here — confirm it is still needed.
from google.colab import userdata

hf_token = userdata.get('huggingface')

In [None]:
# loading the dataset
from datasets import load_dataset

# Load the dataset identified by "garage-bAInd/Open-Platypus".
dataset = load_dataset("garage-bAInd/Open-Platypus")
# Bare last expression: shows the loaded object's summary as the cell output.
dataset

In [None]:
# Convert the 'train' split to a pandas DataFrame and show it as the cell output.
dataset['train'].to_pandas()

In [None]:
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns

tokenizer = AutoTokenizer.from_pretrained('unsloth/llama-3-8b')

# Count tokens for the 'instruction' and 'output' fields of every training row
# in a single pass over the split, so each row is read only once.
instruction_token_counts = []
output_token_counts = []
for example in dataset['train']:
  instruction_token_counts.append(len(tokenizer.tokenize(example['instruction'])))
  output_token_counts.append(len(tokenizer.tokenize(example['output'])))

# Per-row total: instruction tokens + output tokens.
combined_token_counts = [
  instruction + output
  for instruction, output in zip(instruction_token_counts, output_token_counts)
]

# Preview only the first few totals; displaying the whole list would print one
# number per dataset row into the cell output.
combined_token_counts[:10]

In [None]:
def plot_distribution(token_counts, title):
  """Display a histogram of token counts.

  Args:
    token_counts: sequence of per-example token counts to bin.
    title: text shown as the figure title.

  The figure is 15x6 inches with 50 red, black-edged bins on seaborn's
  "whitegrid" style, and is shown immediately.
  """
  sns.set_style("whitegrid")
  fig, ax = plt.subplots(figsize=(15, 6))
  ax.hist(token_counts, bins=50, color='r', edgecolor='black')
  ax.set_title(title, fontsize=16)
  ax.set_xlabel("Number of tokens", fontsize=14)
  ax.set_ylabel("Number of examples", fontsize=14)
  # Same 12pt tick-label size on both axes.
  ax.tick_params(axis='both', labelsize=12)
  fig.tight_layout()
  plt.show()


# One histogram per token-count series: instruction, output, and their sum.
for counts, heading in (
  (instruction_token_counts, "Distribution of token counts for instruction only"),
  (output_token_counts, "Distribution of token counts for output only"),
  (combined_token_counts, "Distribution of token counts for combined only"),
):
  plot_distribution(counts, heading)

## Filtering out rows

In [None]:
# Rows whose combined (instruction + output) token count exceeds this are dropped.
MAX_COMBINED_TOKENS = 2048

# Row positions (in the split that combined_token_counts was computed from)
# that are within the token budget.
valid_indices = [i for i, count in enumerate(combined_token_counts) if count <= MAX_COMBINED_TOKENS]

# valid_indices are positions in the *unfiltered* split. Only apply them while
# the split still has that length; otherwise re-running this cell would apply
# stale positions to the already-filtered data.
n_rows = len(dataset['train'])
if n_rows == len(combined_token_counts):
  dataset['train'] = dataset['train'].select(valid_indices)
elif n_rows != len(valid_indices):
  raise ValueError(
    "dataset['train'] does not match combined_token_counts; "
    "re-run the token counting cell before filtering."
  )
token_counts = [combined_token_counts[i] for i in valid_indices]

plot_distribution(token_counts, "Distribution of token counts for combined after filtering")

## Deduplication using embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
from datasets import Dataset, DatasetDict
from tqdm.autonotebook import tqdm
import numpy as np

def deduplicate_dataset(dataset: DatasetDict, model: str, threshold: float):
  """Remove near-duplicate rows from ``dataset['train']`` based on their ``output`` text.

  Every ``output`` is embedded with the sentence-transformers model named by
  ``model``; embeddings are L2-normalised so that inner product equals cosine
  similarity. Rows are then visited in order and a row is dropped only when an
  earlier row that was *kept* has similarity above ``threshold`` with it, so
  one representative of each group of near-duplicates always survives.

  Args:
    dataset: mapping with a 'train' split whose rows have an 'output' field.
    model: name or path passed to ``SentenceTransformer``.
    threshold: cosine similarity above which two outputs count as duplicates.

  Returns:
    A ``DatasetDict`` with a single 'train' split containing the kept rows.
  """
  train_split = dataset['train']
  outputs = [example['output'] for example in train_split]
  if not outputs:
    # Nothing to embed or compare; avoids indexing the shape of an empty array.
    return DatasetDict({"train": train_split})

  sentence_model = SentenceTransformer(model)

  print("Converting text to embeddings...")
  embeddings = np.asarray(sentence_model.encode(outputs), dtype=np.float32)
  norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
  # Clamp the norm so an all-zero embedding does not turn into NaNs.
  normalized_embeddings = np.ascontiguousarray(
    embeddings / np.maximum(norms, np.finfo(np.float32).eps)
  )
  index = faiss.IndexFlatIP(normalized_embeddings.shape[1])
  index.add(normalized_embeddings)

  print("Filtering  out near-duplicate.....")
  # Range search returns, for row i, every indexed row whose inner product with
  # it exceeds the radius; the ids for row i are neighbour_ids[lims[i]:lims[i + 1]]
  # (the row itself is among them and is excluded below by the `< i` test).
  lims, _, neighbour_ids = index.range_search(normalized_embeddings, threshold)

  kept = np.zeros(len(outputs), dtype=bool)
  for i in tqdm(range(len(outputs)), desc="Filtering"):
    neighbours = neighbour_ids[int(lims[i]):int(lims[i + 1])]
    earlier = neighbours[neighbours < i]
    # Keep the row unless an earlier, already-kept row is a near-duplicate of it.
    kept[i] = not kept[earlier].any()

  to_keep = np.flatnonzero(kept).tolist()
  return DatasetDict({"train": train_split.select(to_keep)})

# Deduplicate the training split using embedding similarity.
deduped_dataset = deduplicate_dataset(
  dataset,
  model="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
  threshold=0.95,
)




