In [1]:
!pip install transformers datasets pandas torch accelerate



In [2]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments,Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

# Importing and loading dataset

In [3]:
df = pd.read_csv('small_talk_moody_teen_dataset.csv')
print(f"Loaded {len(df)} conversation pairs")

Loaded 2500 conversation pairs


In [4]:
# Data augmentation

# Lowercase version
df_lower = df.copy()
df_lower["user_utterance"] = df_lower["user_utterance"].str.lower()

# Titlecase version
df_title = df.copy()
df_title["user_utterance"] = df_title["user_utterance"].str.title()

# Uppercase version
df_upper = df.copy()
df_upper["user_utterance"] = df_upper["user_utterance"].str.upper()

# Combine all versions and remove duplicates
df_augmented = pd.concat([df, df_lower, df_title, df_upper], ignore_index=True)
print(f"Original dataset: {len(df)} rows")
print(f"Augmented dataset: {len(df_augmented)} rows")
df_augmented.head(), df_augmented.tail()

Original dataset: 2500 rows
Augmented dataset: 10000 rows


(            user_utterance                                       bot_response
 0          You sound upset  I am real enough to be tired of this question....
 1                       Hi  Yeah, hi. I am here, not thrilled, but present...
 2         Nice to meet you  Hello. Let us get this over with. Say what you...
 3                       Hi  Morning, I guess. I did not sleep, again. Can ...
 4  Tell me something silly  Sure, a joke: I walked into a bar and immediat...,
              user_utterance                                       bot_response
 9995  DO YOU HAVE FEELINGS?  I tolerate music and people in small doses. Bo...
 9996           YOU ARE SLOW  Got it. I will dial it down. You can dial down...
 9997          ARE YOU OKAY?  Yeah, I am moody. It is not a glitch, it is a ...
 9998       MUCH APPRECIATED  Sure. It was not exactly hard. Next time try n...
 9999              NICE WORK  Cool. Validation received. Let us keep expecta...)

# Initialising tokeniser and model

In [5]:
model_name = 'gpt2' # Using gpt2(small)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Add padding token because gpt2 doesn't have one default
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Formatting data for fine-tuning

In [7]:
# Create conversational format: "Human: {prompt}\nBot: {response}<|endoftext|>"
def format_conversation(row):
    return f"Human: {row['user_utterance']}\nBot: {row['bot_response']}{tokenizer.eos_token}"

df_augmented['text'] = df_augmented.apply(format_conversation, axis=1)
df_augmented.head()

Unnamed: 0,user_utterance,bot_response,text
0,You sound upset,I am real enough to be tired of this question....,Human: You sound upset\nBot: I am real enough ...
1,Hi,"Yeah, hi. I am here, not thrilled, but present...","Human: Hi\nBot: Yeah, hi. I am here, not thril..."
2,Nice to meet you,Hello. Let us get this over with. Say what you...,Human: Nice to meet you\nBot: Hello. Let us ge...
3,Hi,"Morning, I guess. I did not sleep, again. Can ...","Human: Hi\nBot: Morning, I guess. I did not sl..."
4,Tell me something silly,"Sure, a joke: I walked into a bar and immediat...","Human: Tell me something silly\nBot: Sure, a j..."


# Tokenising the dataset

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=512,
        padding='max_length'
    )

In [9]:
# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df_augmented[['text']])
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text']
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# Splitting dataset

In [10]:
# Splitting 90% train, 10% test
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Data collector

In [11]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal language modeling, not masked
)

# Configure training arguments

In [12]:

training_args = TrainingArguments(
    output_dir='./moody_gpt2',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=200,
    save_steps=500,
    save_total_limit=1,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to='none'
)

# Initialise trainer

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
)

# Training

In [14]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,0.6713,0.345805
400,0.3305,0.279062
600,0.2864,0.258515
800,0.2689,0.258427
1000,0.2651,0.252182
1200,0.2578,0.247486
1400,0.2525,0.248425
1600,0.2532,0.248359
1800,0.2532,0.245607
2000,0.2513,0.243722


TrainOutput(global_step=5625, training_loss=0.30571639489067925, metrics={'train_runtime': 3551.649, 'train_samples_per_second': 12.67, 'train_steps_per_second': 1.584, 'total_flos': 1.175814144e+16, 'train_loss': 0.30571639489067925, 'epoch': 5.0})

# Save model

In [15]:
model.save_pretrained('./moody_gpt2')
tokenizer.save_pretrained('./moody_gpt2')

('./moody_gpt2/tokenizer_config.json',
 './moody_gpt2/special_tokens_map.json',
 './moody_gpt2/vocab.json',
 './moody_gpt2/merges.txt',
 './moody_gpt2/added_tokens.json')

# Load model

In [16]:
# Load the saved model
model_path = './moody_gpt2'

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Set padding token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Move to device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()

print(f"Model loaded successfully on {device}")

Model loaded successfully on cuda


# Generate response

In [55]:
def generate_response(prompt, max_length=100):
    model.eval()
    input_text = f"Human: {prompt}\nBot:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.8,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    bot_response = response.split('Bot:')[-1].strip()

    # Split into sentences
    import re
    sentences = re.split(r'([.!?])', bot_response)

    # Reconstruct complete sentences (text + punctuation)
    complete_sentences = []
    for i in range(0, len(sentences)-1, 2):
        if i+1 < len(sentences) and sentences[i].strip():
            complete_sentences.append(sentences[i] + sentences[i+1])

    # Keep only first 2 sentences
    if len(complete_sentences) >= 2:
        result = ''.join(complete_sentences[:2]).strip()
    elif len(complete_sentences) == 1:
        result = complete_sentences[0].strip()
    else:
        # If no complete sentence, add period
        result = bot_response + '.' if bot_response else "I'm not sure how to respond."

    return result

# Testing

In [57]:
print("\n--- Testing Fine-tuned Model ---")
test_prompts = [
    "How was school today?",
    "Want to hang out?",
    "You seem upset, what's wrong?"
]

for prompt in test_prompts:
    response = generate_response(prompt)
    print(f"Human: {prompt}")
    print(f"Bot: {response}\n")


--- Testing Fine-tuned Model ---
Human: How was school today?
Bot: Evening. I am not feeling chatty, but I can manage a conversation.

Human: Want to hang out?
Bot: Evening. I am not feeling chatty, but I can manage a conversation.

Human: You seem upset, what's wrong?
Bot: I talk like this because pretending to be excited is exhausting. Honesty is faster.



In [56]:
# Code to download directory for colab. Uncomment to download the model directory
# from google.colab import files

# !zip -r /content/small_talk_model.zip /content/moody_gpt2

# files.download('/content/small_talk_model.zip')

updating: content/moody_gpt2/ (stored 0%)
updating: content/moody_gpt2/vocab.json (deflated 68%)
updating: content/moody_gpt2/checkpoint-5625/ (stored 0%)
updating: content/moody_gpt2/checkpoint-5625/vocab.json (deflated 68%)
updating: content/moody_gpt2/checkpoint-5625/merges.txt (deflated 53%)
updating: content/moody_gpt2/checkpoint-5625/rng_state.pth (deflated 26%)
updating: content/moody_gpt2/checkpoint-5625/scheduler.pt (deflated 62%)
updating: content/moody_gpt2/checkpoint-5625/scaler.pt (deflated 64%)
updating: content/moody_gpt2/checkpoint-5625/special_tokens_map.json (deflated 74%)
updating: content/moody_gpt2/checkpoint-5625/model.safetensors (deflated 7%)
updating: content/moody_gpt2/checkpoint-5625/training_args.bin (deflated 54%)
updating: content/moody_gpt2/checkpoint-5625/trainer_state.json (deflated 81%)
updating: content/moody_gpt2/checkpoint-5625/tokenizer_config.json (deflated 56%)
updating: content/moody_gpt2/checkpoint-5625/config.json (deflated 52%)
updating: cont

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Interactive loop

In [None]:
# def chat():
#     print("Chatbot: Hello! I'm ready to chat. Type 'bye' to exit.\n")

#     while True:
#         # Get user input
#         user_input = input("You: ").strip()

#         # Check for exit condition
#         if user_input.lower() == 'bye':
#             print("Chatbot: Goodbye! Have a great day!")
#             break

#         # Skip empty inputs
#         if not user_input:
#             continue

#         # Generate and display response
#         response = generate_response(user_input)
#         print(f"Chatbot: {response}\n")

# # Start the chat
# chat()