In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# Dataset with expanded Q&A pairs (add as many as possible)
data = {
    "Question": [
        "What is the name of the company?",
        "Where is Zamu AI based?",
        "What services does Zamu AI provide?",
        "What is the focus area of Zamu AI?",
        "Where is the headquarters of Zamu AI located?",
        "Does Zamu AI offer services related to language models?",
        "Can Zamu AI help with data gathering and training models?",
        "What solutions does Zamu AI provide for AI development?",
        "Is Zamu AI a remote-based company?",
        "What is LangChain, and does Zamu AI work with it?"
    ],
    "Answer": [
        "The name of the company is Zamu AI.",
        "Zamu AI is a remote-based company.",
        "Zamu AI provides AI-related solutions, data gathering, model training, LLM services, and LangChain solutions.",
        "Zamu AI focuses on providing AI solutions, including data gathering and training models.",
        "The headquarters of Zamu AI is located in Peshawar, Pakistan.",
        "Yes, Zamu AI offers services related to large language models (LLMs) and LangChain solutions.",
        "Yes, Zamu AI assists with data gathering and model training services.",
        "Zamu AI provides a range of AI development solutions, including data gathering, training models, and large language models.",
        "Yes, Zamu AI is a remote-based company.",
        "LangChain is a framework for building applications with large language models, and Zamu AI offers solutions that involve LangChain."
    ]
}
# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(data)

# Load model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"Question: {q}\nAnswer: {a}" for q, a in zip(examples["Question"], examples["Answer"])]
    model_inputs = tokenizer(inputs, padding="max_length", max_length=128, truncation=True)
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Fine-tuning settings with lower learning rate and more epochs
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=10,  # Increase number of epochs
    learning_rate=5e-5,   # Lower learning rate
    warmup_steps=100,     # Warmup steps to help with convergence
    save_steps=500,
    save_total_limit=2,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")




Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/vocab.json',
 'fine_tuned_model/merges.txt',
 'fine_tuned_model/added_tokens.json',
 'fine_tuned_model/tokenizer.json')

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Define an expanded and varied Q&A dataset with more explicit and detailed answers
data = {
    "Question": [
        "What is the name of the company?",
        "Where is Zamu AI based?",
        "What services does Zamu AI provide?",
        "What are the main offerings of Zamu AI?",
        "What is the focus area of Zamu AI?",
        "Where is the headquarters of Zamu AI located?",
        "Is Zamu AI a remote-based company?",
        "Does Zamu AI offer services related to large language models?",
        "Can Zamu AI assist with data gathering and training models?",
        "What solutions does Zamu AI provide for AI development?",
        "Who is the target audience for Zamu AI?",
        "What are the primary technologies used by Zamu AI?",
        "What industries does Zamu AI serve?",
        "Can Zamu AI help with digital transformation projects?",
        "What is LangChain, and does Zamu AI work with it?",
    ],
    "Answer": [
        "The name of the company is Zamu AI.",
        "Zamu AI is based in Peshawar, Pakistan, and operates remotely.",
        "Zamu AI provides services including AI model training, data gathering, LLM solutions, and LangChain integration.",
        "The main offerings of Zamu AI are AI solution development, data gathering, and training models for various applications.",
        "Zamu AI focuses on providing advanced AI solutions, data management, and large language models for clients.",
        "The headquarters of Zamu AI is located in Peshawar, Pakistan.",
        "Yes, Zamu AI is a remote-based company, allowing employees to work from anywhere.",
        "Yes, Zamu AI offers services related to large language models (LLMs) and custom model training.",
        "Yes, Zamu AI helps with data gathering and model training as part of its services.",
        "Zamu AI provides end-to-end solutions for AI development, including data preparation, model training, and deployment.",
        "Zamu AI primarily targets businesses seeking AI solutions for operational improvement.",
        "Zamu AI uses technologies such as NLP, deep learning, and large language models to power its solutions.",
        "Zamu AI serves industries like healthcare, finance, retail, and more, providing AI-driven insights.",
        "Yes, Zamu AI supports digital transformation projects using cutting-edge AI technologies.",
        "LangChain is a framework for large language model applications, and Zamu AI utilizes it in developing custom AI solutions.",
    ]
}

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict(data)

# Load Flan-T5 model and tokenizer
model_name = "google/flan-t5-base"  # Use Flan-T5-base for better instruction-following capabilities
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Preprocess the dataset
def preprocess_function(examples):
    inputs = [f"Question: {q} Answer:" for q in examples["Question"]]
    targets = examples["Answer"]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=10,       # Increase for better learning
    learning_rate=1e-5,        # Lower learning rate for gradual fine-tuning
    warmup_steps=100,
    save_steps=500,
    save_total_limit=2,
    report_to="none"  # Disable any external logging
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("fine_tuned_model_flan_t5")
tokenizer.save_pretrained("fine_tuned_model_flan_t5")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Step,Training Loss


('fine_tuned_model_flan_t5/tokenizer_config.json',
 'fine_tuned_model_flan_t5/special_tokens_map.json',
 'fine_tuned_model_flan_t5/spiece.model',
 'fine_tuned_model_flan_t5/added_tokens.json',
 'fine_tuned_model_flan_t5/tokenizer.json')

In [15]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load a BERT model fine-tuned on question answering (SQuAD)
model_name = "deepset/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Example questions and contexts
qa_pairs = [
    {
        "question": "What services does Zamu AI provide?",
        "context": "Zamu AI provides AI-related solutions, data gathering, model training, and large language model (LLM) services. It also offers LangChain solutions."
    },
    {
        "question": "Where is Zamu AI based?",
        "context": "Zamu AI is based in Peshawar, Pakistan, and operates as a remote-based company."
    },
    {
        "question": "Can Zamu AI help with large language models?",
        "context": "Yes, Zamu AI provides services related to large language models, including model development, training, and deployment."
    },
    {
        "question": "What is LangChain and how is it used in AI?",
        "context": "LangChain is a framework for building applications with large language models, often used in NLP tasks. Zamu AI uses LangChain for custom AI solution development."
    }
]

# Set up the question answering pipeline with GPU acceleration
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)  # Use GPU

# Test the model on queries
for qa in qa_pairs:
    response = qa_pipeline(question=qa["question"], context=qa["context"])
    print(f"Query: {qa['question']}")
    print(f"Response: {response['answer']}\n")


Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Query: What services does Zamu AI provide?
Response: data gathering, model training, and large language model (LLM) services

Query: Where is Zamu AI based?
Response: Peshawar, Pakistan

Query: Can Zamu AI help with large language models?
Response: Zamu AI provides services

Query: What is LangChain and how is it used in AI?
Response: a framework for building applications with large language models



In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned T5 model
model_path = "fine_tuned_model_t5"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Test the model with sample queries
queries = [
    "What services does Zamu AI provide?",
    "Where is Zamu AI based?",
    "Can Zamu AI help with large language models?",
    "What is LangChain and how is it used in AI?",
]

for query in queries:
    inputs = tokenizer(f"question: {query} answer:", return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=50)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Query: {query}")
    print(f"Response: {response}\n")


Query: What services does Zamu AI provide?
Response: a response:

Query: Where is Zamu AI based?
Response: True

Query: Can Zamu AI help with large language models?
Response: True:

Query: What is LangChain and how is it used in AI?
Response: True:



In [16]:
import os
from datetime import datetime

# Directory names
model_directories = ["fine_tuned_model", "fine_tuned_model_t5", "fine_tuned_model_flan_t5"]

# Print last modified time of each model directory
for directory in model_directories:
    timestamp = os.path.getmtime(directory)
    last_modified = datetime.fromtimestamp(timestamp)
    print(f"{directory} last modified: {last_modified}")


fine_tuned_model last modified: 2024-11-03 07:48:43.362411
fine_tuned_model_t5 last modified: 2024-11-03 07:53:19.966198
fine_tuned_model_flan_t5 last modified: 2024-11-03 08:08:37.509880


In [17]:
import shutil

# Zip the latest model directory
shutil.make_archive("fine_tuned_model_flan_t5", 'zip', "fine_tuned_model_flan_t5")


'/kaggle/working/fine_tuned_model_flan_t5.zip'