<a href="https://colab.research.google.com/github/JaswanthMannem/PreTraing/blob/main/2.Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install accelerate==0.26.1 datasets==2.16.1 fasttext==0.9.2 jupyter==1.0.0 pandas==2.2.0 pyarrow==15.0.0 sentencepiece==0.1.99 torch==2.1.2 torchaudio==2.1.2 torchvision==0.16.2 tqdm==4.66.1 transformers==4.37.2

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data preparation


In [3]:
import datasets
pretraining_dataset = datasets.load_dataset(
    "upstage/Pretraining_Dataset",
    split="train"
)

Downloading data:   0%|          | 0.00/150M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
print(pretraining_dataset)

Dataset({
    features: ['text', 'meta'],
    num_rows: 60000
})


In [5]:
pretraining_dataset = pretraining_dataset.select_columns(["text"])

In [6]:
pretraining_dataset[0]["text"][:500]

'In 1793 Zaman Shah, a grandson of Ahmad Shah Durrani, won a brief war of succession to become ruler of Afghanistan. The support of Painda Khan, chief of the Baraksai branch of the Durrani tribe, was decisive in his victory. In the next fifty year., the brothers of Zaman shah and the sons of Painda Khan were to dominate the affairs of Afghanistan. The Durrani tribe was very large with several branches and numerous clans. 1 Abmad Shah and his successors belonged to the Sadozai clan, but other clan'

In [7]:
instruction_dataset = datasets.load_dataset(
    "c-s-ale/alpaca-gpt4-data",
    split="train"
)

Downloading readme:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/43.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [8]:
print(instruction_dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 52002
})


In [9]:
instruction_dataset[1]

{'instruction': 'What are the three primary colors?',
 'input': '',
 'output': 'The three primary colors are red, blue, and yellow. These colors are called primary because they cannot be created by mixing other colors and all other colors can be made by combining them in various proportions. In the additive color system, used for light, the primary colors are red, green, and blue (RGB).'}

In [10]:
import os
import requests

In [11]:
code_dir="./code"
if not os.path.exists(code_dir):
  os.makedirs(code_dir)

In [12]:
urls = [
    "https://raw.githubusercontent.com/TheAlgorithms/Python/master/searches/double_linear_search_recursion.py",
    "https://raw.githubusercontent.com/KosingZhu/tensorflow/master/tensorflow/python/tools/module_util.py",
    "https://raw.githubusercontent.com/EricRemmerswaal/tensorflow/master/tensorflow/python/distribute/distribute_coordinator_context.py",
    "https://raw.githubusercontent.com/computationalartist/tensorflow/master/tensorflow/python/ops/numpy_ops/integration_test/benchmarks/numpy_mlp.py",
    "https://raw.githubusercontent.com/Van-an/tensorflow/master/tensorflow/python/distribute/coordinator/values.py",
    "https://raw.githubusercontent.com/nkgwer/tensorflow/master/tensorflow/lite/tools/visualize.py",
    "https://raw.githubusercontent.com/gitblazer/youtube-dl/master/youtube_dl/version.py",
    "https://raw.githubusercontent.com/Joshua-Barawa/My-Photos/master/venv/lib/python3.8/site-packages/django/contrib/messages/__init__.py",
    "https://raw.githubusercontent.com/PaliC/pytorch/master/test/fx/test_subgraph_rewriter.py"
]

In [13]:
for url in urls:
  print(f"Working on {url}")
  response = requests.get(url)
  file_name = os.path.basename(url)
  file_path = os.path.join(code_dir, file_name)
  with open(file_path, "wb") as file:
    file.write(response.content)

Working on https://raw.githubusercontent.com/TheAlgorithms/Python/master/searches/double_linear_search_recursion.py
Working on https://raw.githubusercontent.com/KosingZhu/tensorflow/master/tensorflow/python/tools/module_util.py
Working on https://raw.githubusercontent.com/EricRemmerswaal/tensorflow/master/tensorflow/python/distribute/distribute_coordinator_context.py
Working on https://raw.githubusercontent.com/computationalartist/tensorflow/master/tensorflow/python/ops/numpy_ops/integration_test/benchmarks/numpy_mlp.py
Working on https://raw.githubusercontent.com/Van-an/tensorflow/master/tensorflow/python/distribute/coordinator/values.py
Working on https://raw.githubusercontent.com/nkgwer/tensorflow/master/tensorflow/lite/tools/visualize.py
Working on https://raw.githubusercontent.com/gitblazer/youtube-dl/master/youtube_dl/version.py
Working on https://raw.githubusercontent.com/Joshua-Barawa/My-Photos/master/venv/lib/python3.8/site-packages/django/contrib/messages/__init__.py
Working 

In [14]:
files = os.listdir(code_dir)

In [15]:
for file in files:
  print(file)

module_util.py
__init__.py
double_linear_search_recursion.py
distribute_coordinator_context.py
version.py
visualize.py
numpy_mlp.py
test_subgraph_rewriter.py
values.py


In [16]:
code_dataset=[]
for file in files:
  code_dataset.append(
      {
          "text":open(os.path.join(code_dir,file),"r").read()
      }
  )

In [None]:
code_dataset

In [18]:
code_dataset = datasets.Dataset.from_list(code_dataset)
print(code_dataset)

Dataset({
    features: ['text'],
    num_rows: 9
})


In [19]:
dataset = datasets.concatenate_datasets([pretraining_dataset,code_dataset])
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 60009
})


# Data cleaning

In [20]:
dataset.num_rows

60009

## Remove examples that are too short

In [21]:
import heapq

In [22]:
def paragraph_lenght_filter(x):
  lines=x["text"].split("\n")
  if (len(lines)<3 or min(heapq.nlargest(3,[len(line) for line in lines])))<3:
    return False
  return True

In [23]:
dataset = dataset.filter(
    paragraph_lenght_filter,
    load_from_cache_file=False
)

Filter:   0%|          | 0/60009 [00:00<?, ? examples/s]

In [24]:
dataset.num_rows

52357

## Remove repeated text within training examples

In [25]:
def find_duplicates(paragraph):
  unique_x=set()
  duplicate_chars=0
  duplicate_elements=0
  for element in paragraph:
    if element in unique_x:
      duplicate_chars+=len(element)
      duplicate_elements+=1
    else:
      unique_x.add(element)
  return duplicate_elements,duplicate_chars

In [26]:
import re
def paragraph_repetition_filter(x):
  text=x["text"]
  paragraph = re.compile(r"\n{2,}").split(text.strip())
  paragraph_duplicates, character_duplicates = find_duplicates(paragraph)
  if paragraph_duplicates/len(paragraph) > 0.3:
    return False
  if character_duplicates/len(text) > 0.2:
    return False
  return True

In [27]:
dataset =dataset.filter(
    paragraph_repetition_filter,
    load_from_cache_file=False
)

Filter:   0%|          | 0/52357 [00:00<?, ? examples/s]

In [28]:
dataset.num_rows

52327

## Deduplication

In [29]:
def deduplication(ds):
  def dedup_func(x):
    text=x["text"]
    if text in unique_text:
      return False
    else:
      unique_text.add(text)
      return True

  unique_text=set()

  ds=ds.filter(dedup_func,load_from_cache_file=False)
  return ds

In [30]:
dataset = deduplication(dataset)

Filter:   0%|          | 0/52327 [00:00<?, ? examples/s]

In [31]:
dataset.num_rows

43598

## Quality filter - Language

In [32]:
import urllib
from fasttext.FastText import _FastText

def english_language_filter(ds):
  model = _FastText("L2_language_model.bin")

  def is_english(x):
    language, score = model.predict(x["text"].replace("\n",""))
    language = language[0].split("__")[2]
    return score >0.4 and language == "en"

  ds = ds.filter(is_english,load_from_cache_file=False,num_proc=1)
  return ds

dataset = english_language_filter(dataset)



Filter:   0%|          | 0/43598 [00:00<?, ? examples/s]

In [33]:
dataset.num_rows

40474

In [35]:
file_path = "preprocessed_dataset.parquet"
dataset.to_parquet(file_path)

Creating parquet from Arrow format:   0%|          | 0/41 [00:00<?, ?ba/s]

197101804