In [3]:
!pip install transformers datasets torch

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Using cached torch-2.3.1-cp312-cp312-win_amd64.whl.metadata (26 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.9.5-cp312-cp312-win_amd64.whl.metadata (7.7 kB)
Collecting networkx (from torch)
  Using cached networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting mkl<=2021.4.0,>=2021.1

In [4]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
   ---------------------------------------- 0.0/314.1 kB ? eta -:--:--
   ------- -------------------------------- 61.4/314.1 kB 1.6 MB/s eta 0:00:01
   --------------------------- ------------ 215.0/314.1 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 314.1/314.1 kB 2.8 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.32.1


In [3]:
from google.colab import files
uploaded = files.upload()

Saving guvi_data.txt to guvi_data.txt


In [4]:
import os
import re
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            line = line.lower()

            # Tokenize the line
            token_ids = tokenizer.encode(line, add_special_tokens=False)

            # Convert token IDs back to tokens
            tokenized_line = tokenizer.convert_ids_to_tokens(token_ids)

            # Remove special tokens
            tokenized_line = [token for token in tokenized_line if token not in tokenizer.all_special_tokens]

            # Convert tokens to text using the tokenizer's decode method
            processed_line = tokenizer.decode(token_ids, clean_up_tokenization_spaces=True)

            processed_line = re.sub(r'[^a-zA-Z0-9\s,():%&]', '', processed_line)
            processed_line = re.sub(r'\bgu vi\b', 'guvi', processed_line)

            # Remove extra spaces
            processed_line = re.sub(r'\s+', ' ', processed_line)
            processed_line = re.sub(r'^\s+|\s+$', '', processed_line)
            # Write the processed line to the output file
            f.write(processed_line + "\n")

input_file = "/content/guvi_data.txt"  # Make sure this path is correct
output_file = "processed_data.txt"
preprocess_data(input_file, output_file)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
#from datasets import load_dataset
import torch

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Load dataset
def load_dataset(file_path, tokenizer, block_size=128):
  """Loads a dataset from a text file, tokenizes it, and returns a tokenized dataset.

  Args:
    file_path: Path to the text file.
    tokenizer: Tokenizer to use for tokenization.
    block_size: Maximum block size for tokenization.

  Returns:
    A list of tokenized inputs.
  """

  with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

  # Add padding token if necessary
  if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

  tokenized_data = []
  for line in lines:
     # Skip empty lines to avoid potential errors
    if line.strip():
      tokenized_inputs = tokenizer(
        line,
        padding="max_length",
        max_length=block_size,
        truncation=True,
        return_tensors="pt"
      )
      # Check if any token IDs are out of vocabulary and handle them
      # Option 1: Replace unknown tokens with a special token (e.g., [UNK])
      tokenized_inputs['input_ids'] = torch.where(
          tokenized_inputs['input_ids'] >= tokenizer.vocab_size,
          tokenizer.unk_token_id,
          tokenized_inputs['input_ids']
      )

      # Append the tokenized inputs to the list
      tokenized_data.append(tokenized_inputs) # Add this line to append data

  return tokenized_data


input_file = "processed_data.txt"  # Ensure this file is correctly specified
train_dataset = load_dataset(input_file, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step,Training Loss
500,0.8054
1000,0.5572


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [7]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.18.1%2Bcu118-cp312-cp312-win_amd64.whl (4.9 MB)
     ---------------------------------------- 0.0/4.9 MB ? eta -:--:--
     ---- ----------------------------------- 0.6/4.9 MB 11.5 MB/s eta 0:00:01
     ------- -------------------------------- 0.9/4.9 MB 9.2 MB/s eta 0:00:01
     --------- ------------------------------ 1.2/4.9 MB 8.4 MB/s eta 0:00:01
     ----------- ---------------------------- 1.5/4.9 MB 8.4 MB/s eta 0:00:01
     --------------- ------------------------ 1.9/4.9 MB 7.9 MB/s eta 0:00:01
     ----------------- ---------------------- 2.2/4.9 MB 7.7 MB/s eta 0:00:01
     -------------------- ------------------- 2.5/4.9 MB 7.6 MB/s eta 0:00:01
     ---------------------- ----------------- 2.8/4.9 MB 7.8 MB/s eta 0:00:01
     ------------------------- -------------- 3.1/4.9 MB 6.9 MB/s eta 0:00:01
     ------------------------

  You can safely remove it manually.


     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:26
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:26
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:27
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:26
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:25
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:24
     ------------------------------- -------- 2.1/2.7 GB 4.0 MB/s eta 0:02:24
     ------------------------------- -------- 2.1/2.7 GB 4.1 MB/

In [1]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name_or_path = "./fine_tuned_model"
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# Set the pad_token to eos_token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    # Tokenize the input text with padding
    inputs = tokenizer(seed_text, return_tensors='pt', padding=True, truncation=True)

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.90,
            pad_token_id=tokenizer.eos_token_id  # Ensure padding token is set to eos_token_id
        )

    # Decode the generated text
    generated_texts = []
    for i in range(num_return_sequences):
        generated_text = tokenizer.decode(output[i], skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts

# Test the model
seed_text = input("Enter text: ")
generated_texts = generate_text(model, tokenizer, seed_text, max_length=50, temperature=1.0, num_return_sequences=1)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}")

  from .autonotebook import tqdm as notebook_tqdm


Generated Text 1:
Guvi is an educational platform that is offering online classes in various languages like tamil, telugu, hindi, kannada, bengali, hindi, bengali, hindi english, hindi manga, and more

