In [2]:
import datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

In [3]:
text = "I am Hamza Ali. I am from Pakistan"
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
encoded_text = tokenizer(text)["input_ids"]
encoded_text

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[42, 717, 5516, 4019, 14355, 15, 309, 717, 432, 13115]

In [4]:
decode_text = tokenizer.decode(encoded_text)
decode_text




'I am Hamza Ali. I am from Pakistan'

In [5]:
sentence1 = "I am a Muslim"
sentence2 = "I pray 5 times a day"
sentence3 = "I always speaks truth and eat health"
sentence4 = "I get up early in the morning and sleep to bed at 12PM"

In [6]:
tokenizer.pad_token = tokenizer.eos_token
list1 = [sentence1, sentence2, sentence3, sentence4]
for i in range(len(list1)):
    encoded_sentences = tokenizer(list1[i], padding=True)
    print(encoded_sentences)

{'input_ids': [42, 717, 247, 8797], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [42, 12518, 608, 2069, 247, 1388], 'attention_mask': [1, 1, 1, 1, 1, 1]}
{'input_ids': [42, 1900, 16544, 5083, 285, 6008, 1786], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [42, 755, 598, 2393, 275, 253, 4131, 285, 4600, 281, 3722, 387, 1249, 9122], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
tokenizer.pad_token = tokenizer.eos_token
list1 = [sentence1, sentence2, sentence3, sentence4]
for i in range(len(list1)):
    encoded_sentences = tokenizer(list1[i], padding=True, max_length=5, truncation=True)
    print(encoded_sentences)

{'input_ids': [42, 717, 247, 8797], 'attention_mask': [1, 1, 1, 1]}
{'input_ids': [42, 12518, 608, 2069, 247], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [42, 1900, 16544, 5083, 285], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [42, 755, 598, 2393, 275], 'attention_mask': [1, 1, 1, 1, 1]}


### This is the instruction dataset

In [8]:
filename = "lamini_docs.jsonl"
instruction_dataset_df = pd.read_json(filename, lines=True)
examples = instruction_dataset_df.to_dict()

if "question" in examples and "answer" in examples:
  text = examples["question"][0] + examples["answer"][0]
elif "instruction" in examples and "response" in examples:
  text = examples["instruction"][0] + examples["response"][0]
elif "input" in examples and "output" in examples:
  text = examples["input"][0] + examples["output"][0]
else:
  text = examples["text"][0]

prompt_template = """### Question:
{question}

### Answer:"""

num_examples = len(examples["question"])
finetuning_dataset = []
for i in range(num_examples):
  question = examples["question"][i]
  answer = examples["answer"][i]
  text_with_prompt_template = prompt_template.format(question=question)
  finetuning_dataset.append({"question": text_with_prompt_template, "answer": answer})

from pprint import pprint
print("One datapoint in the finetuning dataset:")
pprint(finetuning_dataset[0])

One datapoint in the finetuning dataset:
{'answer': 'Lamini has documentation on Getting Started, Authentication, '
           'Question Answer Model, Python Library, Batching, Error Handling, '
           'Advanced topics, and class documentation on LLM Engine available '
           'at https://lamini-ai.github.io/.',
 'question': '### Question:\n'
             'What are the different types of documents available in the '
             'repository (e.g., installation guide, API documentation, '
             "developer's guide)?\n"
             '\n'
             '### Answer:'}


In [15]:
tokenxed_1 = tokenizer(finetuning_dataset[0]['question'] + finetuning_dataset[0]['answer'] , return_tensors="np", padding=True)
tokenxed_1["input_ids"]

array([[ 4118, 19782,    27,   187,  1276,   403,   253,  1027,  3510,
          273,  7177,  2130,   275,   253, 18491,   313,    70,    15,
           72,   904, 12692,  7102,    13,  8990, 10097,    13, 13722,
          434,  7102,  6177,   187,   187,  4118, 37741,    27,    45,
         4988,    74,   556, 10097,   327, 27669, 11075,   264,    13,
         5271, 23058,    13, 19782, 37741, 10031,    13, 13814, 11397,
           13,   378, 16464,    13, 11759, 10535,  1981,    13, 21798,
        12989,    13,   285,   966, 10097,   327, 21708,    46, 10797,
         2130,   387,  5987,  1358,    77,  4988,    74,    14,  2284,
           15,  7280,    15,   900, 14206]])

In [17]:
max_length = 2048
max_length = min(
    tokenxed_1["input_ids"].shape[1],
    max_length,
)

In [19]:
tokenxed_1 = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=max_length
)

In [20]:
print(tokenxed_1["input_ids"])

[[ 1276   403   253  1027  3510   273  7177  2130   275   253 18491   313
     70    15    72   904 12692  7102    13  8990 10097    13 13722   434
   7102  6177    45  4988    74   556 10097   327 27669 11075   264    13
   5271 23058    13 19782 37741 10031    13 13814 11397    13   378 16464
     13 11759 10535  1981    13 21798 12989    13   285   966 10097   327
  21708    46 10797  2130   387  5987  1358    77  4988    74    14  2284
     15  7280    15   900 14206]]
