In [None]:
# Mount Google Drive at /content/drive so that paths under it can be used by
# the cells that follow.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install transformers datasets huggingface_hub

In [None]:
import os
from getpass import getpass

import torch
from datasets import load_dataset
from huggingface_hub import login
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Authenticate with the Hugging Face Hub without storing the credential in the
# notebook: take the token from the HF_TOKEN environment variable when it is
# set, otherwise ask for it interactively (getpass does not echo the input).
# NOTE(review): any token that was previously hard-coded in this cell should be
# treated as leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
class UltraChatData(Dataset):
    """Training strings built from a prompt/messages style chat dataset.

    Every row of the selected split is expected to provide a ``'prompt'``
    string and a ``'messages'`` list whose items expose a ``'content'`` string.
    Messages are consumed in consecutive pairs and each pair becomes one
    training string of the form::

        <startofstring>{prompt} {messages[j]}<bot>{messages[j+1]}<endofstring>

    Args:
        path: Dataset identifier passed to ``load_dataset``.
        tokenzier: Callable tokenizer applied lazily in ``__getitem__``
            (parameter name kept as-is for backward compatibility).
        max_length: Length every example is padded / truncated to.
        split: Name of the split to read; defaults to ``"train_gen"``.
        max_prompt_chars: Maximum number of prompt characters kept in each
            training string (``None`` keeps the whole prompt).
    """

    def __init__(self, path:str, tokenzier, max_length=256,
                 split="train_gen", max_prompt_chars=100) -> None:
        self.data = load_dataset(path)
        print("Dataset is downloaded")
        self.tokenizer = tokenzier
        self.max_length = max_length
        self.X = []
        for row in self.data[split]:
            self.X.extend(self._format_row(row, max_prompt_chars))

    @staticmethod
    def _format_row(row, max_prompt_chars=100):
        """Return the training strings for one dataset row without modifying it."""
        prompt = row['prompt'][:max_prompt_chars]
        messages = row['messages']
        # Only complete pairs are used; a trailing unpaired message is dropped.
        paired_count = len(messages) - len(messages) % 2
        return [
            "<startofstring>" + prompt + " " + messages[j]['content']
            + "<bot>" + messages[j + 1]['content'] + "<endofstring>"
            for j in range(0, paired_count, 2)
        ]

    def __len__(self):
        """Number of training strings."""
        return len(self.X)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return ``(input_ids, attention_mask)``.

        The encoding is kept in a local variable instead of being stored on the
        instance, so retrieving an item leaves the dataset object unchanged.
        The tokenizer is called on a single string with ``return_tensors="pt"``;
        its outputs are returned exactly as produced (presumably with a leading
        dimension of size 1 — verify against the tokenizer in use).
        """
        encoded = self.tokenizer(self.X[idx],
                                 truncation=True,
                                 padding='max_length',
                                 max_length=self.max_length,
                                 return_tensors="pt")
        return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Run configuration: checkpoint directory to load from, optimizer learning
# rate, and the device tensors and the model are moved to.
model_name = '/content/drive/MyDrive/ChatbotAI/model_B'
learning_rate = 1e-5
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Model is on {device}")

Model is on cuda


In [None]:
# Load the tokenizer from the checkpoint directory and register the markers
# used in the training strings: "<pad>", "<startofstring>" and "<endofstring>"
# as the pad / bos / eos special tokens, plus "<bot>" as an added token.
print("Building Tokenizer .....")
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>"
})
# Last expression of the cell, so its return value is shown as the cell output.
tokenizer.add_tokens(["<bot>"])
# tokenizer.save_pretrained("/content/drive/MyDrive/ChatbotAI/model_save/token/")

Building Tokenizer .....


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0

In [None]:
# Build the dataset of formatted training strings and wrap it in a DataLoader
# that yields batches of 25 examples.
print("Creating Dataset .....", end="")
data = UltraChatData("HuggingFaceH4/ultrachat_200k", tokenizer)
print("Done")
print("Creating Dataloader .....", end="")
# shuffle=True: examples are appended conversation by conversation, so without
# shuffling every batch would consist of neighbouring turns that share a prompt.
mydataloader = DataLoader(data, batch_size=25, shuffle=True)
print("Done")

Creating Dataset .....

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.44k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/81.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

Dataset is downloaded
Done
Creating Dataloader .....Done


In [None]:
# Display one formatted training string (index 5) to check the
# <startofstring> ... <bot> ... <endofstring> layout by eye.
data.X[5]

'<startofstring>How do people in Hinduism practice daily devotion, and what rituals are involved in the process? Which deity do Hindus worship the most, and why?<bot>:There are many deities that Hindus worship, and the answer to which deity is worshipped the most may vary depending on the geographic, cultural, and philosophical differences within Hinduism. However, Lord Vishnu, Lord Shiva and Goddess Durga are among the most popular and widely worshipped deities in Hinduism.\n\nLord Vishnu is considered one of the most important deities in Hinduism and is often referred to as the preserver of the universe. He is known for his kind and compassionate nature and is worshipped for his ability to maintain and restore balance and harmony in the world.\n\nLord Shiva is known as the destroyer and transformer of the universe. He is also seen as the embodiment of compassion and is worshipped for his spiritual and meditative qualities. Lord Shiva is closely associated with the practice of yoga an

In [None]:
# Load the checkpoint, size the embedding matrix to the tokenizer's vocabulary
# (which includes the added marker tokens), move the model to the selected
# device, and create the optimizer over its parameters.
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(new_num_tokens=len(tokenizer))
model = model.to(device)
optim = AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
def train(chatdata, model, optim, epochs):
    """Fine-tune ``model`` as a language model on the batches of ``chatdata``.

    Args:
        chatdata: Iterable yielding ``(input_ids, attention_mask)`` tensor
            pairs of identical shape.
        model: Model whose call accepts ``attention_mask`` and ``labels``
            keyword arguments and returns an object with a ``.loss`` attribute.
        optim: Optimizer responsible for the model's parameters.
        epochs: Number of full passes over ``chatdata``.

    Returns:
        None. The model's parameters are updated in place.

    Notes:
        Batches are moved to the notebook-level ``device``.
    """
    for epoch in range(epochs):
        for input_ids, attention_mask in tqdm(chatdata, desc=f"Epoch: {epoch}"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # Positions where attention_mask is 0 are padding. Their label is
            # set to -100, the value the loss ignores, so the model is not
            # trained to predict the padding token after every example.
            labels = input_ids.masked_fill(attention_mask == 0, -100)
            optim.zero_grad()
            loss = model(input_ids,
                         attention_mask=attention_mask,
                         labels=labels).loss
            loss.backward()
            optim.step()
        # model.save_pretrained("/content/drive/MyDrive/ChatbotAI/model_save/model/")

In [None]:
def infer(inp):
    """Generate a reply for ``inp`` with the notebook-level model and tokenizer.

    The text is wrapped as ``<startofstring>{inp}<bot>``, tokenized, moved to
    the notebook-level ``device`` and passed to ``model.generate``. The first
    returned sequence is decoded with special tokens skipped and returned as a
    string.

    NOTE(review): ``no_repeat_ngram_size=1`` appears to forbid any token from
    occurring twice in the generated sequence — confirm this is intended.
    """
    prompt = "<startofstring>" + inp + "<bot>"
    encoded = tokenizer(prompt, return_tensors='pt')
    prompt_ids = encoded['input_ids'].to(device)
    prompt_mask = encoded['attention_mask'].to(device)

    # Sampling / beam-search settings plus the token ids used for control tokens.
    generation_settings = dict(
        top_p=0.9,
        do_sample=True,
        temperature=0.8,
        max_length=1024,
        num_beams=12,
        no_repeat_ngram_size=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
    )
    sequences = model.generate(prompt_ids,
                               attention_mask=prompt_mask,
                               **generation_settings)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
# Switch the model to training mode and run 5 epochs over `mydataloader`
# using the `train` function defined above.
print("Training model .....", end="")
model.train()
train(mydataloader, model, optim, 5)
print("Done")

Training model .....

Epoch: 0: 100%|██████████| 429/429 [12:10<00:00,  1.70s/it]
Epoch: 1: 100%|██████████| 429/429 [12:13<00:00,  1.71s/it]
Epoch: 2: 100%|██████████| 429/429 [12:13<00:00,  1.71s/it]
Epoch: 3: 100%|██████████| 429/429 [12:13<00:00,  1.71s/it]
Epoch: 4: 100%|██████████| 429/429 [12:13<00:00,  1.71s/it]

Done





In [None]:
# Interactive chat: switch the model to evaluation mode, then read lines from
# standard input and print the generated reply for each one until the user
# types "exit".
model.eval()
while True:
    txt = input()
    if txt == "exit":
        break
    print(infer(txt))

How do people in Hinduism practice daily devotion, and what rituals are involved in the process?
How do people in Hinduism practice daily devotion, and what rituals are involved in the process? <bot> :I don't have access to real-time data or personal experiences. But I can provide you with an example of a specific ritual that has been practiced by Hindus during their religious festivals such as Independence Day ( कार्), Mahabharata's Last Bhagavad Gurdwaja Pranj Mantri Jyoti Marathasamir Dhruvatracharya Yojana Durga Puja Prakashtava Puri Maitreya Tiwari Chaudhary Srivatsubhai Mukherjee, which involves reciting mantras every day for at least three days each month while abstaining from consuming any intoxicants like khatum Kaushikrabha etc.? Here is some information on how these practices may be incorporated into modern Indian culture through various social media platforms/institutions including Facebook & Twitter!
Do you know how to cook?
Do you know how to cook? <bot> :Sure, here are s

In [None]:
# Write the fine-tuned model and the tokenizer files to Google Drive.
# NOTE(review): this is the same directory `model_name` points to, i.e. the
# checkpoint this notebook loads from, so the saved files presumably replace
# the starting checkpoint — confirm that is intended.
model.save_pretrained("/content/drive/MyDrive/ChatbotAI/model_B")
tokenizer.save_pretrained("/content/drive/MyDrive/ChatbotAI/model_B")

('/content/drive/MyDrive/ChatbotAI/model_B/tokenizer_config.json',
 '/content/drive/MyDrive/ChatbotAI/model_B/special_tokens_map.json',
 '/content/drive/MyDrive/ChatbotAI/model_B/vocab.json',
 '/content/drive/MyDrive/ChatbotAI/model_B/merges.txt',
 '/content/drive/MyDrive/ChatbotAI/model_B/added_tokens.json')