In [1]:
!pip install nltk



In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import nltk

In [3]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
data = pd.read_csv("questions.csv")
N=len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
N_train = 300000
N_test  = 10*1024
data_train = data[:N_train]
data_test  = data[N_train:N_train+N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del(data) # remove to free memory

Train set: 300000 Test set: 10240


In [6]:
td_index = (data_train['is_duplicate'] == 1).to_numpy()
td_index = [i for i, x in enumerate(td_index) if x]
print('number of duplicate questions: ', len(td_index))
print('indexes of first ten duplicate questions:', td_index[:10])

number of duplicate questions:  111486
indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [7]:
print(data_train['question1'][5])  #  Example of question duplicates (first one in data)
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


In [8]:
Q1_train_words = np.array(data_train['question1'][td_index])
Q2_train_words = np.array(data_train['question2'][td_index])

Q1_test_words = np.array(data_test['question1'])
Q2_test_words = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

In [9]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train_words[0])
print('Question 2: ', Q2_train_words[0], '\n')
print('Question 1: ', Q1_train_words[5])
print('Question 2: ', Q2_train_words[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test_words[0])
print('Question 2: ', Q2_test_words[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



In [10]:
#create arrays
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

In [11]:
# Building the vocabulary with the train set         (this might take a minute)
from collections import defaultdict

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

for idx in range(len(Q1_train_words)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  36268


In [12]:
print(vocab['<PAD>'])
print(vocab['Astrology'])
print(vocab['Astronomy'])  #not in vocabulary, returns 0

1
2
0


In [13]:
for idx in range(len(Q1_test_words)):
    Q1_test[idx] = nltk.word_tokenize(Q1_test_words[idx])
    Q2_test[idx] = nltk.word_tokenize(Q2_test_words[idx])

In [14]:
print('Train set has reduced to: ', len(Q1_train) )
print('Test set length: ', len(Q1_test) )

Train set has reduced to:  111486
Test set length:  10240


## Converting a question to a tensor

In [15]:
# Converting questions to array of integers
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]


for i in range(len(Q1_test)):
    Q1_test[i] = [vocab[word] for word in Q1_test[i]]
    Q2_test[i] = [vocab[word] for word in Q2_test[i]]

In [16]:
print('first question in the train set:\n')
print(Q1_train_words[0], '\n')
print('encoded version:')
print(Q1_train[0],'\n')

print('first question in the test set:\n')
print(Q1_test_words[0], '\n')
print('encoded version:')
print(Q1_test[0])

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
[32, 38, 4, 107, 65, 1015, 65, 11509, 21]


In [17]:
# Splitting the data
cut_off = int(len(Q1_train)*.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off: ], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


In [18]:
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

In [36]:
from torch.utils.data import Dataset, DataLoader
import torch

class QuestionPairDataset(Dataset):
    """Dataset for question pairs using Siamese network"""
    def __init__(self, Q1, Q2):
        self.Q1 = Q1
        self.Q2 = Q2

    def __len__(self):
        return len(self.Q1)

    def __getitem__(self, idx):
        return self.Q1[idx], self.Q2[idx]

def collate_fn(batch, pad):
    """Custom collate function for padding sequences in a batch"""
    # Separate Q1 and Q2 from the batch
    Q1_batch, Q2_batch = zip(*batch)

    # Get max length in current batch
    max_len = max(
        max(len(q) for q in Q1_batch),
        max(len(q) for q in Q2_batch)
    )

    # Pad to nearest power of 2
    max_len = 2**int(np.ceil(np.log2(max_len)))

    # Pad sequences
    Q1_padded = [q + [pad] * (max_len - len(q)) for q in Q1_batch]
    Q2_padded = [q + [pad] * (max_len - len(q)) for q in Q2_batch]

    # Convert to tensors
    Q1_tensor = torch.tensor(Q1_padded)
    Q2_tensor = torch.tensor(Q2_padded)

    return Q1_tensor, Q2_tensor

def create_dataloaders(Q1_train, Q2_train, Q1_val, Q2_val, batch_size=32, pad_token=1, num_workers=4):
    """
    Create train and validation dataloaders

    Args:
        Q1_train (list): Training questions from first set
        Q2_train (list): Training questions from second set
        Q1_val (list): Validation questions from first set
        Q2_val (list): Validation questions from second set
        batch_size (int): Batch size for training
        pad_token (int): Token used for padding
        num_workers (int): Number of workers for data loading

    Returns:
        tuple: (train_loader, val_loader)
    """
    # Create datasets
    train_dataset = QuestionPairDataset(Q1_train, Q2_train)
    val_dataset = QuestionPairDataset(Q1_val, Q2_val)

    # Create dataloaders
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=lambda x: collate_fn(x, pad=pad_token)
    )

    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        collate_fn=lambda x: collate_fn(x, pad=pad_token)
    )

    return train_loader, val_loader

In [32]:
class SiameseNetwork(nn.Module):
    def __init__(self, vocab_size, d_model=128):
        super(SiameseNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.lstm = nn.LSTM(d_model, d_model, batch_first=True)

    def normalize(self, x):
        return x / torch.sqrt(torch.sum(x * x, dim=-1, keepdim=True) + 1e-8)

    def forward_one(self, x):
        # Embedding layer
        x = self.embedding(x)

        # LSTM layer
        lstm_out, _ = self.lstm(x)

        # Mean over sequence length
        x = torch.mean(lstm_out, dim=1)

        # Normalize
        x = self.normalize(x)

        return x

    def forward(self, x1, x2):
        # Process both inputs through the same network
        output1 = self.forward_one(x1)
        output2 = self.forward_one(x2)
        return output1, output2

In [21]:
class TripletLoss(nn.Module):
    def __init__(self, margin=0.25):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, v1, v2):
        # Calculate pairwise cosine similarities
        scores = torch.matmul(v1, v2.t())

        batch_size = scores.size(0)

        # Get positive pairs (diagonal elements)
        positive = torch.diagonal(scores)

        # Calculate negative scores
        negative_without_positive = scores - 2.0 * torch.eye(batch_size, device=scores.device)
        closest_negative = torch.max(negative_without_positive, dim=1)[0]

        # Calculate mean negative
        negative_zero_on_duplicate = scores * (1.0 - torch.eye(batch_size, device=scores.device))
        mean_negative = torch.sum(negative_zero_on_duplicate, dim=1) / (batch_size - 1)

        # Calculate triplet losses
        triplet_loss1 = torch.clamp(self.margin - positive + closest_negative, min=0.0)
        triplet_loss2 = torch.clamp(self.margin - positive + mean_negative, min=0.0)

        return torch.mean(triplet_loss1 + triplet_loss2)


In [22]:
!pip install rich



In [23]:
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
from rich.progress import (
    Progress,
    TextColumn,
    BarColumn,
    TaskProgressColumn,
    TimeRemainingColumn,
    MofNCompleteColumn
)
from rich.console import Console
from rich.live import Live
from rich.table import Table
from datetime import datetime

def train_model(model, loss_fn, train_loader, val_loader, lr=0.01, epochs=10, device='cuda'):
    console = Console()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model = model.to(device)
    loss_fn = loss_fn.to(device)

    # Create progress columns
    progress_columns = [
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        MofNCompleteColumn(),
        TaskProgressColumn(),
        TimeRemainingColumn(),
    ]

    # Function to create a results table
    def create_results_table(epoch, train_loss, val_loss):
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", justify="right", style="green")

        table.add_row(
            "Epoch",
            f"{epoch+1}/{epochs}"
        )
        table.add_row(
            "Training Loss",
            f"{train_loss:.4f}"
        )
        table.add_row(
            "Validation Loss",
            f"{val_loss:.4f}"
        )
        return table

    console.print(f"\n[bold cyan]Starting training at {datetime.now().strftime('%H:%M:%S')}[/bold cyan]\n")

    for epoch in range(epochs):
        with Progress(*progress_columns) as progress:
            # Training phase
            model.train()
            train_loss = 0
            train_task = progress.add_task(
                f"[cyan]Epoch {epoch+1}/{epochs} - Training",
                total=len(train_loader)
            )

            for batch_idx, (q1, q2) in enumerate(train_loader):
                q1, q2 = q1.to(device), q2.to(device)

                optimizer.zero_grad()
                v1, v2 = model(q1, q2)
                loss = loss_fn(v1, v2)

                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                progress.update(train_task, advance=1)

            train_loss = train_loss / len(train_loader)

            # Validation phase
            model.eval()
            val_loss = 0
            val_task = progress.add_task(
                "[yellow]Validation",
                total=len(val_loader)
            )

            with torch.no_grad():
                for q1, q2 in val_loader:
                    q1, q2 = q1.to(device), q2.to(device)
                    v1, v2 = model(q1, q2)
                    val_loss += loss_fn(v1, v2).item()
                    progress.update(val_task, advance=1)

            val_loss = val_loss / len(val_loader)

        # Print results table
        results_table = create_results_table(epoch, train_loss, val_loss)
        console.print(results_table)
        console.print("\n")

    console.print(f"[bold cyan]Training completed at {datetime.now().strftime('%H:%M:%S')}[/bold cyan]\n")

    return model

In [39]:
# 1. First, prepare your data
train_loader, val_loader = create_dataloaders(
    Q1_train=train_Q1,
    Q2_train=train_Q2,
    Q1_val=val_Q1,
    Q2_val=val_Q2,
    batch_size=256,  # Adjust based on your GPU memory
    pad_token=1,
    num_workers=4    # Adjust based on CPU cores
)

# 2. Initialize your model
vocab_size = len(vocab)  # Your vocabulary size
d_model = 128           # Embedding dimension
model = SiameseNetwork(vocab_size=vocab_size, d_model=d_model)

# 3. Initialize loss function
loss_fn = TripletLoss(margin=0.25)

# 4. Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 5. Train the model
trained_model = train_model(
    model=model,
    loss_fn=loss_fn,
    train_loader=train_loader,
    val_loader=val_loader,
    lr=0.01,
    epochs=10,
    device=device
)

# 6. Optional: Save the trained model
torch.save(trained_model.state_dict(), 'siamese_model.pth')

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

In [48]:
def classify(test_Q1, test_Q2, y, threshold, model, batch_size=64, device='cuda'):
    """
    Function to test the accuracy of the model.

    Args:
        test_Q1 (list of list): List of tokenized Q1 questions.
        test_Q2 (list of list): List of tokenized Q2 questions.
        y (list or numpy.ndarray): Array of actual target labels.
        threshold (float): Desired threshold for cosine similarity.
        model (torch.nn.Module): The trained Siamese model.
        batch_size (int, optional): Size of the batches. Defaults to 64.
        device (str, optional): Device to run the model on. Defaults to 'cuda'.

    Returns:
        float: Accuracy of the model.
    """
    model.eval()
    model = model.to(device)

    # Create test dataset and dataloader using existing create_dataloaders function
    test_loader, _ = create_dataloaders(
        Q1_train=test_Q1,
        Q2_train=test_Q2,
        Q1_val=[],  # No validation set needed for testing
        Q2_val=[],
        batch_size=batch_size,
        pad_token=1,  # Assuming 1 is the pad token as in previous code
        num_workers=0  # No additional workers needed for testing
    )

    total_correct = 0
    total_samples = len(test_Q1)

    with torch.no_grad():
        for batch_idx, (q1, q2) in enumerate(test_loader):
            q1, q2 = q1.to(device), q2.to(device)

            # Get the embeddings using forward_one method
            v1, v2 = model(q1, q2)

            # Compute cosine similarity
            cos_sim = torch.nn.functional.cosine_similarity(v1, v2)

            # Get the actual labels for this batch
            start_idx = batch_idx * batch_size
            end_idx = min(start_idx + batch_size, total_samples)
            batch_labels = torch.tensor(y[start_idx:end_idx], dtype=torch.float32).to(device)

            # Apply threshold
            predictions = (cos_sim > threshold).float()

            # Compare with ground truth
            total_correct += torch.sum((predictions == batch_labels).float()).item()

    accuracy = total_correct / total_samples
    return accuracy

# Example Usage
accuracy = classify(Q1_test, Q2_test, y_test, threshold=0.7, model=trained_model, batch_size=512, device=device)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5415
