<a href="https://colab.research.google.com/github/Morioh/Chatbot/blob/main/Grants_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chatbot using a Transformer Model

In [1]:
!pip install transformers
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import json

# Load the intents file
with open('/content/intents.json', 'r') as f:
    intents = json.load(f)


In [None]:
from transformers import (
    GPT2Tokenizer,
    GPT2ForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import torch

# --- Step 1: Dataset Preparation ---
examples = [
    "Who is eligible for the grant?",
    "Can I apply for the grant if I have a scholarship?",
    "Are previous recipients eligible for the grant?",
    "How do I apply for the grant?",
    "What documents do I need to submit?",
    "What happens after I apply?",
    "Can I use the grant for cash?",
    "What happens if I don’t meet the conditions?",
    "Who gets priority for the grant?",
    "Do you prioritize new applicants?"
]
labels = [
    0, 0, 0,  # Eligibility
    1, 1, 1,  # Application Process
    2, 2,     # Grant Conditions
    3, 3      # Prioritization
]

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    examples, labels, test_size=0.2, random_state=42
)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

# --- Step 2: Load Model and Tokenizer ---
model_name = "gpt2"
num_labels = len(set(labels))

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Assign a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load GPT-2 model for sequence classification
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Update model configuration to include the padding token ID
model.config.pad_token_id = tokenizer.pad_token_id

# Resize model embeddings to match the updated tokenizer
model.resize_token_embeddings(len(tokenizer))

# --- Step 3: Tokenize the Dataset ---
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# --- Step 4: Define Metrics for Quantitative Evaluation ---
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# --- Step 5: Set Up Training Arguments ---
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",  # Disable WandB integration
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if using a GPU
)

# --- Step 6: Train the Model ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# --- Step 7: Evaluate on Validation Set ---
# Generate predictions for validation set
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
outputs = model(**val_encodings)
predictions = torch.argmax(outputs.logits, dim=1).numpy()

# Ensure all expected classes are included in the evaluation
expected_labels = [0, 1, 2, 3]  # Full set of labels
label_names = ["Eligibility", "Application", "Conditions", "Prioritization"]

# Dynamically adjust target names based on present labels
unique_labels = sorted(set(val_labels))
adjusted_target_names = [label_names[label] for label in unique_labels]

# Compute metrics
print("\n--- Validation Metrics ---")
print(classification_report(val_labels, predictions, labels=unique_labels, target_names=adjusted_target_names))

# --- Step 8: Qualitative Evaluation ---
responses = {
    0: "Students on full scholarships are ineligible for this grant. Priority will be given to new candidates, but previous recipients are welcome to apply.",
    1: "To apply, submit all required documents and information through the Financial Aid Office. Ensure your submissions are accurate and verifiable, as unverified information may weaken your application.",
    2: "The grant is not redeemable in cash. ALU reserves the right to withdraw the grant if the conditions are not met.",
    3: "Priority is given to new applicants with the most demonstrated financial need. However, previous recipients may still apply."
}

def match_intent(user_input):
    inputs = tokenizer(user_input, truncation=True, padding=True, max_length=128, return_tensors="pt")
    outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return predicted_label

print("\n--- Qualitative Evaluation ---")
test_cases = [
    {"input": "Who gets priority for the grant?", "expected_label": 3},
    {"input": "How do I apply for the grant?", "expected_label": 1},
    {"input": "Am I eligible to apply?", "expected_label": 0},
]

for case in test_cases:
    predicted_label = match_intent(case["input"])
    print(f"Input: {case['input']}")
    print(f"Predicted Label: {predicted_label}, Expected Label: {case['expected_label']}")
    print(f"Response: {responses[predicted_label]}")
    print()


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.235141,0.5,0.5,0.5,0.5
2,No log,1.35953,0.5,0.333333,0.25,0.5
3,No log,1.489877,0.5,0.333333,0.25,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
def get_response(user_input):
    # Tokenize the input
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Get model predictions
    outputs = model(**inputs)
    predicted_label = outputs.logits.argmax(dim=1).item()
    # Map predicted label to intent
    intent = list(label_map.keys())[list(label_map.values()).index(predicted_label)]
    # Get the response from the intents JSON
    for item in intents["intents"]:
        if item["intent"] == intent:
            return item["response"]

# Test the chatbot
user_input = "Who gets priority for the grant?"
response = get_response(user_input)
print(f"Chatbot: {response}")


Chatbot: To apply, submit all required documents and information through the Financial Aid Office. Ensure your submissions are accurate and verifiable, as unverified information may weaken your application.


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize training examples
vectorizer = TfidfVectorizer()
vectorized_examples = vectorizer.fit_transform(examples)

def match_intent(user_input):
    user_vector = vectorizer.transform([user_input])
    similarities = cosine_similarity(user_vector, vectorized_examples)
    best_match_idx = similarities.argmax()
    return labels[best_match_idx]

# Example usage
user_input = "How do I apply for the grant?"
predicted_label = match_intent(user_input)
print(f"Predicted Label: {predicted_label}")


Predicted Label: 1


In [12]:
responses = {
    0: "Students on full scholarships are ineligible for this grant. Priority will be given to new candidates, but previous recipients are welcome to apply.",
    1: "To apply, submit all required documents and information through the Financial Aid Office. Ensure your submissions are accurate and verifiable, as unverified information may weaken your application.",
    2: "The grant is not redeemable in cash. ALU reserves the right to withdraw the grant if the conditions are not met.",
    3: "Priority is given to new applicants with the most demonstrated financial need. However, previous recipients may still apply."
}

print("Chatbot is ready! Type 'exit' to quit.")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    predicted_label = match_intent(user_input)
    print(f"Chatbot: {responses[predicted_label]}")


Chatbot is ready! Type 'exit' to quit.
You: How do I apply for the grant?
Chatbot: To apply, submit all required documents and information through the Financial Aid Office. Ensure your submissions are accurate and verifiable, as unverified information may weaken your application.
You: How is the grant awarded?
Chatbot: Students on full scholarships are ineligible for this grant. Priority will be given to new candidates, but previous recipients are welcome to apply.
You: How long is the grant valid?
Chatbot: Students on full scholarships are ineligible for this grant. Priority will be given to new candidates, but previous recipients are welcome to apply.
You: Who gets priority for the grant?
Chatbot: Priority is given to new applicants with the most demonstrated financial need. However, previous recipients may still apply.
You: exit
Chatbot: Goodbye!
