# Language Understanding

In [None]:
# Practice
# https://www.kaggle.com/datasets/kreeshrajani/3k-conversations-dataset-for-chatbot?select=Conversation.csv

In [1]:
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Extract and Load the Dataset
def load_and_inspect_dataset(zip_file_path, csv_file_name):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall("extracted_files")
    
    csv_file_path = f"extracted_files/{csv_file_name}"
    df = pd.read_csv(csv_file_path)
    
    # Display basic information
    print("Dataset Info:")
    print(df.info())
    print("\nFirst 5 Rows:")
    print(df.head())
    
    return df

zip_file_path = "Climate Change FAQs.zip"
csv_file_name = "climate_change_faqs.csv"
df = load_and_inspect_dataset(zip_file_path, csv_file_name)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   source     376 non-null    object
 1   faq        376 non-null    object
 2   text_type  376 non-null    object
dtypes: object(3)
memory usage: 8.9+ KB
None

First 5 Rows:
                                              source  \
0  https://www.ipcc.ch/site/assets/uploads/2020/0...   
1  https://www.ipcc.ch/site/assets/uploads/2020/0...   
2  https://www.ipcc.ch/site/assets/uploads/2020/0...   
3  https://www.ipcc.ch/site/assets/uploads/2020/0...   
4  https://www.ipcc.ch/site/assets/uploads/2020/0...   

                                                 faq text_type  
0  If Understanding of the Climate System Has Inc...         q  
1  The models used to calculate the IPCC’s temper...         a  
2               How Do We Know the World Has Warmed?         q  
3  Evidence for a warming worl

In [3]:
# Step 2: Preprocess the Data
def preprocess_data(df):
    import re

    # Clean text data
    def clean_text(text):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", text)  # Remove unwanted characters
        return text.strip()

    df['faq'] = df['faq'].apply(clean_text)

    # Split the data into questions and answers
    questions = df[df['text_type'] == 'q']['faq'].tolist()
    answers = df[df['text_type'] == 'a']['faq'].tolist()

    # Validate lengths
    if len(questions) != len(answers):
        raise ValueError("Mismatch between the number of questions and answers.")
    
    # Log sample data
    print("\nSample Preprocessed Questions:")
    print(questions[:3])
    print("\nSample Preprocessed Answers:")
    print(answers[:3])

    return questions, answers

questions, answers = preprocess_data(df)


Sample Preprocessed Questions:
['if understanding of the climate system has increased, why hasn t the range of temperature projections been reduced?', 'how do we know the world has warmed?', 'have there been any changes in climate extremes?']

Sample Preprocessed Answers:
['the models used to calculate the ipcc s temperature projections agree on the direction of future global change, but the projected size of those changes cannot be precisely predicted. future greenhouse gas ghg emission rates could take any one of many possible trajectories, and some underlying physical processes are not yet completely understood, making them difficult to model. those uncertainties, combined with natural year to year climate variability, produce an uncertainty range in temperature projections. the uncertainty range around projected ghg and aerosol precursor emissions which depend on projections of future social and economic conditions cannot be materially reduced. nevertheless, improved understanding

In [4]:
# Step 3: Prepare Training and Testing Data
def prepare_data(questions, answers, test_size=0.2, random_state=42):
    pairs = list(zip(questions, answers))
    train_pairs, test_pairs = train_test_split(pairs, test_size=test_size, random_state=random_state)
    
    # Log data statistics
    print(f"\nTraining Samples: {len(train_pairs)}, Testing Samples: {len(test_pairs)}")
    print("\nSample Training Pair:")
    print(train_pairs[0])
    
    return train_pairs, test_pairs

train_pairs, test_pairs = prepare_data(questions, answers)# Step 4: Load Pre-Trained Chatbot Model
def load_chatbot_model(model_name="microsoft/DialoGPT-small"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # Set pad_token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    
    print(f"Model and tokenizer loaded successfully from {model_name}!")
    return tokenizer, model

tokenizer, model = load_chatbot_model()


Training Samples: 150, Testing Samples: 38

Sample Training Pair:
('why does the temperature record shown on your vital signs page begin at 1880?', 'three of the world s most complete temperature tracking records from nasa s goddard institute for space studies, the national oceanic and atmospheric administration s national climactic data center and the uk meteorological office s hadley centre begin in 1880. prior to 1880, temperature measurements were made with instruments like thermometers. the oldest continuous temperature record is the central england temperature data series, which began in 1659, and the hadley centre has some measurements beginning in 1850, but there are too few data before 1880 for scientists to estimate average temperatures for the entire planet.')
Model and tokenizer loaded successfully from microsoft/DialoGPT-small!


In [5]:
# Step 4: Load Pre-Trained Chatbot Model
def load_chatbot_model(model_name="microsoft/DialoGPT-small"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # Set pad_token if not set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token
    
    print(f"Model and tokenizer loaded successfully from {model_name}!")
    return tokenizer, model

tokenizer, model = load_chatbot_model()

Model and tokenizer loaded successfully from microsoft/DialoGPT-small!


In [9]:
# Ensure tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 1: Create the ConversationDataset
class ConversationDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_length=128):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        question, answer = self.pairs[idx]
        input_ids = self.tokenizer.encode(
            question, truncation=True, max_length=self.max_length, return_tensors="pt"
        ).squeeze()
        target_ids = self.tokenizer.encode(
            answer, truncation=True, max_length=self.max_length, return_tensors="pt"
        ).squeeze()
        return {"input_ids": input_ids, "target_ids": target_ids}

# Create train_dataset
train_dataset = ConversationDataset(train_pairs, tokenizer)

# Step 2: Define collate_fn

def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    target_ids = [item["target_ids"] for item in batch]

    # Calculate max length for padding
    max_len = max(max(seq.size(0) for seq in input_ids), max(seq.size(0) for seq in target_ids))

    # Pad sequences to the same length
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    target_ids_padded = pad_sequence(target_ids, batch_first=True, padding_value=tokenizer.pad_token_id)

    # Create attention masks (optional for further use)
    input_attention_mask = (input_ids_padded != tokenizer.pad_token_id).long()
    target_attention_mask = (target_ids_padded != tokenizer.pad_token_id).long()

    return {
        "input_ids": input_ids_padded,
        "target_ids": target_ids_padded,
        "input_attention_mask": input_attention_mask,
        "target_attention_mask": target_attention_mask,
    }

# Step 3: Create train_loader
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn
)

# Step 4: Debugging - Verify batch shapes
print("\nDebugging Train Loader:")
for batch in train_loader:
    print("Batch Input IDs Shape:", batch["input_ids"].shape)
    print("Batch Target IDs Shape:", batch["target_ids"].shape)
    break


Debugging Train Loader:
Batch Input IDs Shape: torch.Size([8, 16])
Batch Target IDs Shape: torch.Size([8, 128])


In [10]:
def fine_tune_model(model, train_loader, tokenizer, epochs=3, lr=5e-5, gradient_accumulation_steps=2):
    """
    Fine-tune the model using the train_loader and handle padding in target_ids.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        print(f"\nEpoch {epoch + 1}/{epochs}")

        for step, batch in enumerate(tqdm(train_loader, desc="Training")):
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)

            # Mask padding tokens in the loss
            outputs = model(
                input_ids=input_ids,
                labels=target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)  # Ignore padding tokens
            )
            loss = outputs.loss / gradient_accumulation_steps
            running_loss += loss.item()

            loss.backward()

            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_loader):
                optimizer.step()
                optimizer.zero_grad()

        print(f"Epoch {epoch + 1} Loss: {running_loss / len(train_loader):.4f}")

    # Save the fine-tuned model
    model.save_pretrained("climate_chatbot")
    tokenizer.save_pretrained("climate_chatbot")
    print("\nFine-tuning complete! Model saved as 'climate_chatbot'.")


In [11]:
# Verify that padding tokens are being ignored in the target_ids
for batch in train_loader:
    print("Batch Input IDs Shape:", batch["input_ids"].shape)
    print("Batch Target IDs Shape:", batch["target_ids"].shape)
    print("Target IDs with Padding Masked:")
    print(batch["target_ids"].masked_fill(batch["target_ids"] == tokenizer.pad_token_id, -100))
    break


Batch Input IDs Shape: torch.Size([8, 24])
Batch Target IDs Shape: torch.Size([8, 128])
Target IDs with Padding Masked:
tensor([[10734,  4073,  8971,  ...,  1245,   286,  4258],
        [  368,  7717,  7313,  ...,   416,   262,  7791],
        [ 8117,   389,   734,  ...,  2158,    11,  4258],
        ...,
        [20123,   278,   257,  ...,  6884,   284,   307],
        [ 1169, 45764,   286,  ...,   262,  1109,   326],
        [11110,  1112,  3148,  ...,  -100,  -100,  -100]])


In [12]:
# Validate the Fine-Tuned Model

# Step 7: Validate the Fine-Tuned Model
def test_model(model, tokenizer, test_pairs):
    """
    Test the fine-tuned chatbot model on a few test samples.

    Args:
        model: The fine-tuned model.
        tokenizer: The tokenizer used during training.
        test_pairs: List of (question, answer) pairs from the test set.
    """
    model.eval()  # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    print("\nTesting the fine-tuned model with sample questions:")
    for i, (question, expected_answer) in enumerate(test_pairs[:5]):  # Test with 5 samples
        # Tokenize and encode the question
        input_ids = tokenizer.encode(question, return_tensors="pt").to(device)
        
        # Generate an answer
        output_ids = model.generate(
            input_ids,
            max_length=50,  # Limit the length of the generated answer
            num_beams=5,    # Use beam search for better results
            early_stopping=True  # Stop when the end token is generated
        )
        
        # Decode the generated answer
        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Print results for evaluation
        print(f"Sample {i + 1}:")
        print(f"Question: {question}")
        print(f"Expected Answer: {expected_answer}")
        print(f"Model's Answer: {predicted_answer}")
        print("-" * 50)

In [13]:
# Test the model on the test set
test_model(model, tokenizer, test_pairs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Testing the fine-tuned model with sample questions:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 1:
Question: are humans definitely causing global warming?
Expected Answer: just as the world s most respected scientific bodies have confirmed that world is getting hotter, they have also stated that there is strong evidence that humans are driving the warming. the 2005 joint statement from the national academies of brazil, canada, china, france, germany, india, italy, japan, russia, the uk and the us said it is likely that most of the warming in recent decades can be attributed to human activities. countless more recent statements and reports from the world s leading scientific bodies have said the same thing. for example, a 2010 summary of climate science by the royal society stated that there is strong evidence that the warming of the earth over the last half century has been caused largely by human activity, such as the burning of fossil fuels and changes in land use, including agriculture and deforestation. the idea that humans could change the planet s climate may be coun

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 2:
Question: haven t we had global cooling lately?
Expected Answer: the planet did cool slightly from the 1940s to the 1970s, mainly in the northern hemisphere and most likely a result of the post war boom in industrial aerosol pollutants that bounce sunlight away from the earth. despite a flurry of 1970s media reports on an imminent ice age, there was never anything approaching a scientific consensus on the likelihood of further cooling, and it appears that greenhouse warming has long since eclipsed the mid century cool spell. after temperatures reached a new global high in 1998, the following decade saw smaller ups and downs. the absence of another show stopping record led many sceptics and pundits to claim that global warming stopped in 1998. in truth, however, nobody expects the global average temperature to rise smoothly from one year to the next. just as any april in london, new york or beijing will see a few cold snaps, we can expect long term warming to be punctuated by 

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 3:
Question: climate is always changing. how do we determine the causes of observed changes?
Expected Answer: the causes of observed long term changes in climate on time scales longer than a decade are assessed by determining whether the expected fingerprints of different causes of climate change are present in the historical record. these fingerprints are derived from computer model simulations of the different patterns of climate change caused by individual climate forcings. on multi decade time scales, these forcings include processes such as greenhouse gas increases or changes in solar brightness. by comparing the simulated fingerprint patterns with observed climate changes, we can determine whether observed changes are best explained by those fingerprint patterns, or by natural variability, which occurs without any forcing. the fingerprint of human caused greenhouse gas increases is clearly apparent in the pattern of observed 20th century climate change. the observed change

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample 4:
Question: how important is water vapour to climate change?
Expected Answer: as the largest contributor to the natural greenhouse effect, water vapour plays an essential role in the earth s climate. however, the amount of water vapour in the atmosphere is controlled mostly by air temperature, rather than by emissions. for that reason, scientists consider it a feedback agent, rather than a forcing to climate change. anthropogenic emissions of water vapour through irrigation or power plant cooling have a negligible impact on the global climate. water vapour is the primary greenhouse gas in the earth s atmosphere. the contribution of water vapour to the natural greenhouse effect relative to that of carbon dioxide co2 depends on the accounting method, but can be considered to be approximately two to three times greater. additional water vapour is injected into the atmosphere from anthropogenic activities, mostly through increased evaporation from irrigated crops, but also through 

In [14]:
# Build a Real-Time Chatbot Interface

# Step 8: Build a Real-Time Chatbot Interface
def chatbot_response(question, model, tokenizer, max_length=128, num_beams=5):
    """
    Generate a response to a user's question using the fine-tuned model.

    Args:
        question (str): User input/question.
        model: The fine-tuned model.
        tokenizer: The tokenizer used with the model.
        max_length (int): Maximum length of the generated response.
        num_beams (int): Number of beams for beam search.

    Returns:
        str: The chatbot's response.
    """
    model.eval()  # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_ids = tokenizer.encode(question, return_tensors="pt").to(device)
    
    # Generate response
    output_ids = model.generate(
        input_ids, 
        max_length=max_length, 
        num_beams=num_beams, 
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Chatbot Interaction Loop
print("Chatbot is ready! Type 'exit' or 'quit' to end the session.")
while True:
    try:
        user_input = input("You: ").strip()  # Trim any leading/trailing spaces
        if user_input.lower() in ["exit", "quit"]:
            print("Chatbot: Goodbye! Have a great day!")
            break
        response = chatbot_response(user_input, model, tokenizer)
        print(f"Chatbot: {response}")
    except Exception as e:
        print(f"An error occurred: {e}")
        break

Chatbot is ready! Type 'exit' or 'quit' to end the session.


You:  What can we talk about?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: What can we talk about? This is the first I'm hearing of it.


You:  are humans causing climate change?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: are humans causing climate change?


You:  are humans causing global warming?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Chatbot: are humans causing global warming?


You:  quit


Chatbot: Goodbye! Have a great day!


In [17]:
# Evaluate Model Performance

# Ensure required NLTK resources are downloaded
nltk.download('punkt')

# BLEU evaluation function
def compute_bleu_score(pairs, model, tokenizer, device, max_length=128):
    model.eval()
    bleu_scores = []
    smooth_fn = SmoothingFunction().method4

    for question, reference in tqdm(pairs, desc="Evaluating"):
        input_ids = tokenizer.encode(question, return_tensors="pt").to(device)
        output_ids = model.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        reference_tokens = nltk.word_tokenize(reference)
        predicted_tokens = nltk.word_tokenize(predicted_answer)
        bleu = sentence_bleu([reference_tokens], predicted_tokens, smoothing_function=smooth_fn)
        bleu_scores.append(bleu)

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    return avg_bleu

# Define device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run BLEU evaluation
average_bleu = compute_bleu_score(test_pairs, model, tokenizer, device)
print(f"Average BLEU Score: {average_bleu:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\milto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Evaluating:   0%|          | 0/38 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating:   3%|▎         | 1/38 [00:01<00:37,  1.02s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating:   5%|▌         | 2/38 [00:01<00:27,  1.33it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
S

Average BLEU Score: 0.0050
Average BLEU Score: 0.0050





In [18]:
# Save Model and Artifacts for Deployment

# Save model and tokenizer
model.save_pretrained("climate_chatbot")
tokenizer.save_pretrained("climate_chatbot")
print("Fine-tuned model saved successfully!")

# Export to Hugging Face format for deployment
from transformers import pipeline
chatbot_pipeline = pipeline("text-generation", model="climate_chatbot", tokenizer="climate_chatbot")
chatbot_pipeline.save_pretrained("climate_chatbot_pipeline")
print("Chatbot pipeline saved successfully!")

Fine-tuned model saved successfully!


Device set to use cpu


Chatbot pipeline saved successfully!


In [None]:
# Next Steps:

# Deploy:
# Use the saved pipeline in a web application, API, or cloud-based service for real-world interaction.
# Hugging Face's transformers library makes deploying in a Flask or FastAPI server straightforward.

# Iterate on Fine-Tuning:
# Improve performance by testing hyperparameter changes or adding more relevant training data.

# Optimize Responses:
# Integrate response filtering or re-ranking to improve user interactions.