In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import html
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import unicodedata
import re

class QABertModel(torch.nn.Module):
    """Encoder that turns tokenized text into one fixed-size embedding.

    Wraps a pretrained Hugging Face transformer and returns the final hidden
    state of the first token (the [CLS] position for BERT-style checkpoints)
    as the sentence representation.

    Args:
        model_name: Hub identifier or local path of the pretrained checkpoint.
    """

    def __init__(self, model_name='distilbert-base-uncased'):
        super().__init__()
        # Local import: keeps this class independent of the file's import list.
        from transformers import AutoModel

        # AutoModel selects the architecture that matches the checkpoint.
        # The previous hard-coded BertModel did not match the DistilBERT
        # default checkpoint named in the signature.
        self.bert = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return the first-token hidden state for every sequence in the batch.

        Args:
            input_ids: Token ids passed straight through to the encoder.
            attention_mask: Attention mask passed straight through to the encoder.

        Returns:
            ``last_hidden_state[:, 0, :]`` of the encoder output.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Use [CLS] token embedding as the sentence representation
        return outputs.last_hidden_state[:, 0, :]


class HTMLStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text nodes.

    Usage: ``feed()`` the markup, call ``close()`` to flush any text the
    parser is still buffering, then read the result with ``get_data()``.
    Calling ``reset()`` empties the collected text so an instance can be
    reused for another document.
    """

    def __init__(self):
        # HTMLParser.__init__ calls reset(), which creates self.text below.
        super().__init__(convert_charrefs=True)

    def reset(self):
        """Reset parser state and drop all text collected so far."""
        super().reset()
        self.text = []

    def handle_data(self, d):
        """Collect one run of character data found between tags."""
        self.text.append(d)

    def get_data(self):
        """Return the collected text runs joined by single spaces."""
        return ' '.join(self.text)

def preprocess_text(text):
    """
    Clean a raw (possibly HTML) string into lowercase plain text.

    Steps, in order: lowercase, decode HTML entities, strip tags, NFKD
    normalisation, URL removal, contraction expansion, removal of every
    character other than a-z, 0-9, whitespace and ``.,!?-``, and whitespace
    collapsing.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Decode HTML entities
    text = html.unescape(text)

    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Additional HTML cleaning using custom stripper
    stripper = HTMLStripper()
    stripper.feed(text)
    # close() flushes text the parser is still holding back; without it,
    # trailing text containing an unterminated '&' (e.g. "at&t") is lost.
    stripper.close()
    text = stripper.get_data()

    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)

    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # Map typographic apostrophes to ASCII so the contraction table matches.
    text = text.replace('\u2019', "'").replace('\u2018', "'")

    # Expand contractions BEFORE special characters are stripped: the
    # character filter below deletes apostrophes, after which none of these
    # patterns could match. Specific forms come first so "won't"/"can't" are
    # handled before the generic "n't" rule.
    contractions = {
        "won't": "will not",
        "can't": "cannot",
        "n't": " not",
        "'re": " are",
        "'s": " is",
        "'d": " would",
        "'ll": " will",
        "'t": " not",
        "'ve": " have",
        "'m": " am"
    }

    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^a-z0-9\s.,!?-]', '', text)

    # Collapse whitespace runs; this also trims both ends.
    return ' '.join(text.split())

def load_and_preprocess_data(file_path, max_rows=20):
    """
    Load the Q&A CSV and clean its question and answer columns.

    Args:
        file_path: Path of the CSV file to read.
        max_rows: Keep only the first ``max_rows`` rows of the file. Pass
            ``None`` to use every row. Defaults to 20, the cap that was
            previously hard-coded.

    Returns:
        A DataFrame whose 'Question Body' and 'Answer Body' columns have been
        passed through ``preprocess_text``, with rows dropped where either
        column ended up empty.

    Raises:
        ValueError: If 'Question Body' or 'Answer Body' is missing.
    """
    import csv

    print("Loading and preprocessing data...")

    # Read the data
    df = pd.read_csv(file_path, on_bad_lines='skip', quoting=csv.QUOTE_NONNUMERIC)
    if max_rows is not None:
        # copy() so the column assignments below write to an independent
        # frame rather than to a slice of the frame that was read.
        df = df.iloc[:max_rows].copy()
    print(f"Loaded DataFrame columns: {df.columns}")

    # Make sure question and answer columns exist
    required_columns = ['Question Body', 'Answer Body']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # progress_apply only exists after tqdm.pandas() has run; register it
    # here so this function does not depend on the caller having done so.
    tqdm.pandas()

    # Preprocess questions and answers
    print("Preprocessing questions...")
    df['Question Body'] = df['Question Body'].progress_apply(preprocess_text)

    print("Preprocessing answers...")
    df['Answer Body'] = df['Answer Body'].progress_apply(preprocess_text)

    # Remove rows where question or answer is empty after preprocessing
    df = df.dropna(subset=['Question Body', 'Answer Body'])
    df = df[df['Question Body'].str.strip() != '']
    df = df[df['Answer Body'].str.strip() != '']

    print(f"Dataset size after preprocessing: {len(df)} rows")

    return df

def _qa_batch_loss(model, batch, device):
    """Embed one batch of questions and answers and return their cosine loss."""
    question_embeddings = model(
        input_ids=batch['question_input_ids'].to(device),
        attention_mask=batch['question_attention_mask'].to(device),
    )
    answer_embeddings = model(
        input_ids=batch['answer_input_ids'].to(device),
        attention_mask=batch['answer_attention_mask'].to(device),
    )
    return cosine_similarity_loss(question_embeddings, answer_embeddings)


def train_model(model, train_dataloader, val_dataloader, device, epochs=3):
    """
    Train model using cosine similarity loss between questions and answers

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one embedding per sequence.
        train_dataloader: Yields dict batches with the keys
            'question_input_ids', 'question_attention_mask',
            'answer_input_ids' and 'answer_attention_mask'.
        val_dataloader: Same batch format; evaluated after every epoch
            without gradient tracking.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_dataloader``.

    Returns:
        Dict with per-epoch average losses under 'train_loss' and 'val_loss'.
        An epoch over an empty dataloader records ``nan`` instead of raising
        ZeroDivisionError.
    """
    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    # weight_decay=0.0 keeps training free of weight decay, as before
    # (torch's own default would apply decay).
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

    # Calculate total training steps
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    history = {'train_loss': [], 'val_loss': []}

    # Training loop
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        model.train()
        total_train_loss = 0.0

        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()

            loss = _qa_batch_loss(model, batch, device)
            total_train_loss += loss.item()

            loss.backward()
            # Clip the global gradient norm to 1.0 before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        num_train_batches = len(train_dataloader)
        avg_train_loss = (
            total_train_loss / num_train_batches if num_train_batches else float('nan')
        )
        print(f'Average training loss: {avg_train_loss}')

        # Validation
        model.eval()
        total_val_loss = 0.0

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                total_val_loss += _qa_batch_loss(model, batch, device).item()

        num_val_batches = len(val_dataloader)
        avg_val_loss = (
            total_val_loss / num_val_batches if num_val_batches else float('nan')
        )
        print(f'Average validation loss: {avg_val_loss}')

        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)

    return history


def evaluate_model(model, test_dataloader, device):
    """
    Summarise question/answer cosine similarity over a dataloader.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one embedding per sequence.
        test_dataloader: Yields dict batches with the keys
            'question_input_ids', 'question_attention_mask',
            'answer_input_ids' and 'answer_attention_mask'.
        device: Device the batch tensors are moved to.

    Returns:
        Dict of plain floats: 'mean_similarity', 'median_similarity' and
        'std_similarity'. All three are ``nan`` when the dataloader yields
        no batches.
    """
    model.eval()
    batch_similarities = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            question_embeddings = model(
                input_ids=batch['question_input_ids'].to(device),
                attention_mask=batch['question_attention_mask'].to(device),
            )

            answer_embeddings = model(
                input_ids=batch['answer_input_ids'].to(device),
                attention_mask=batch['answer_attention_mask'].to(device),
            )

            # Cosine similarity of each pair: dot product of unit vectors.
            q_norm = F.normalize(question_embeddings, p=2, dim=1)
            a_norm = F.normalize(answer_embeddings, p=2, dim=1)
            similarities = torch.sum(q_norm * a_norm, dim=1)
            batch_similarities.append(similarities.cpu().numpy())

    if not batch_similarities:
        # Nothing to aggregate; report nan without numpy's empty-input warnings.
        nan = float('nan')
        return {
            'mean_similarity': nan,
            'median_similarity': nan,
            'std_similarity': nan
        }

    all_similarities = np.concatenate(batch_similarities)

    # Cast to built-in floats so the results format and serialise cleanly.
    return {
        'mean_similarity': float(np.mean(all_similarities)),
        'median_similarity': float(np.median(all_similarities)),
        'std_similarity': float(np.std(all_similarities))
    }

def cosine_similarity_loss(question_embeddings, answer_embeddings):
    """
    Loss that is low when each question embedding points the same way as
    its paired answer embedding.

    Both inputs are L2-normalised along dim 1, the row-wise dot product
    gives one cosine similarity per pair, and the result is one minus the
    mean of those similarities.
    """
    unit_questions = F.normalize(question_embeddings, p=2, dim=1)
    unit_answers = F.normalize(answer_embeddings, p=2, dim=1)

    # Dot product of unit vectors == cosine similarity, one value per row.
    per_pair_cosine = (unit_questions * unit_answers).sum(dim=1)

    return 1 - per_pair_cosine.mean()

class QADataset(Dataset):
    """Dataset of (question, answer) text pairs, tokenized on access.

    Args:
        questions: Indexable collection of question texts.
        answers: Indexable collection of answer texts, aligned with
            ``questions`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=..., padding='max_length', return_tensors='pt')`` and
            returning a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def _encode(self, text):
        """Tokenize one text and return ``(input_ids, attention_mask)``."""
        encoding = self.tokenizer(
            str(text),
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # squeeze(0) removes only the leading batch dimension added by
        # return_tensors='pt'; a bare squeeze() would also drop the sequence
        # dimension whenever it has size 1.
        return (
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
        )

    def __getitem__(self, idx):
        # Tokenize question and answer separately
        question_ids, question_mask = self._encode(self.questions[idx])
        answer_ids, answer_mask = self._encode(self.answers[idx])

        return {
            'question_input_ids': question_ids,
            'question_attention_mask': question_mask,
            'answer_input_ids': answer_ids,
            'answer_attention_mask': answer_mask,
        }

# NOTE: cosine_similarity_loss, QABertModel, train_model, and evaluate_model are defined above.

def save_evaluation_results(results, filename='evaluation_results.txt'):
    """
    Write a small plain-text report of the evaluation metrics.

    The file starts with a two-line title block, followed by one
    ``name: value`` line per entry of ``results`` with the value shown to
    four decimal places. An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'w') as report:
        report.writelines(["Evaluation Results\n", "=================\n\n"])
        report.writelines(
            f"{metric}: {value:.4f}\n" for metric, value in results.items()
        )

def main():
    """Run the whole pipeline: load, split, train, evaluate, save.

    Side effects: prints progress to stdout and writes
    'evaluation_results.txt' (via ``save_evaluation_results``),
    'qa_model.pt', the 'qa_model' tokenizer directory and
    'preprocessing_stats.txt' in the working directory.

    Any exception is reported on stdout and then re-raised.
    """
    # Enable progress bar for pandas operations
    tqdm.pandas()

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # One checkpoint name for both tokenizer and encoder so they cannot drift.
    model_name = 'bert-base-uncased'
    batch_size = 8

    try:
        # Load and preprocess data
        df = load_and_preprocess_data("/content/VIsa_Questions_Stack_Exchange_V2.csv")

        # Compute the statistics once; they are printed now and saved later.
        avg_question_len = df['Question Body'].str.len().mean()
        avg_answer_len = df['Answer Body'].str.len().mean()

        print("\nPreprocessing Statistics:")
        print(f"Average question length: {avg_question_len:.2f} characters")
        print(f"Average answer length: {avg_answer_len:.2f} characters")

        # Split data into train, validation, and test sets (70/15/15)
        train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
        val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

        print("\nDataset splits:")
        print(f"Training set: {len(train_df)} samples")
        print(f"Validation set: {len(val_df)} samples")
        print(f"Test set: {len(test_df)} samples")

        # Initialize tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = QABertModel(model_name)
        model.to(device)

        def make_loader(frame, shuffle):
            """Wrap one split in a QADataset and a DataLoader."""
            dataset = QADataset(
                frame['Question Body'].values,
                frame['Answer Body'].values,
                tokenizer
            )
            return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

        # Only the training split is shuffled.
        train_dataloader = make_loader(train_df, shuffle=True)
        val_dataloader = make_loader(val_df, shuffle=False)
        test_dataloader = make_loader(test_df, shuffle=False)

        # Train the model
        train_model(model, train_dataloader, val_dataloader, device)

        # Evaluate model
        print("\nEvaluating model on test set...")
        evaluation_scores = evaluate_model(model, test_dataloader, device)

        # Save evaluation results
        save_evaluation_results(evaluation_scores)

        # Save the model
        torch.save(model.state_dict(), 'qa_model.pt')
        tokenizer.save_pretrained('qa_model')

        # Save preprocessing statistics
        with open('preprocessing_stats.txt', 'w') as f:
            f.write("Preprocessing Statistics\n")
            f.write("======================\n\n")
            # df is already filtered at this point, so this is the size
            # after preprocessing, not the size of the original file.
            f.write(f"Dataset size after preprocessing: {len(df)}\n")
            f.write(f"Average question length: {avg_question_len:.2f} characters\n")
            f.write(f"Average answer length: {avg_answer_len:.2f} characters\n")
            f.write("\nDataset splits:\n")
            f.write(f"Training set: {len(train_df)} samples\n")
            f.write(f"Validation set: {len(val_df)} samples\n")
            f.write(f"Test set: {len(test_df)} samples\n")

    except Exception as e:
        # Top-level boundary: report, then re-raise so the failure stays visible.
        print(f"An error occurred: {str(e)}")
        raise

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Using device: cpu
Loading and preprocessing data...
Loaded DataFrame columns: Index(['QScore', 'Post Link', 'Title', 'Tags', 'Question Body', 'Questioner',
       'AScore', 'Answer Body', 'AnswerDate'],
      dtype='object')
Preprocessing questions...


100%|██████████| 20/20 [00:00<00:00, 610.11it/s]


Preprocessing answers...


100%|██████████| 20/20 [00:00<00:00, 883.78it/s]

Dataset size after preprocessing: 20 rows

Preprocessing Statistics:
Average question length: 523.75 characters
Average answer length: 741.80 characters

Dataset splits:
Training set: 14 samples
Validation set: 3 samples
Test set: 3 samples



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1/3
----------


  0%|          | 0/2 [00:00<?, ?it/s]