<a href="https://colab.research.google.com/github/KaiTheWizard/ParaPhraseDetectProject/blob/main/Copy_of_Welcome_to_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and Loading Libraries

In [1]:
!pip install datasets transformers scikit-learn torch pandas



In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Data Loading and Preprocessing


In [3]:
# Load QQP dataset
dataset = load_dataset("glue", "qqp")

In [4]:
dataset.shape

{'train': (363846, 4), 'validation': (40430, 4), 'test': (390965, 4)}

## Displaying the first and last 5 samples of the training data

In [7]:
dataset['train'].to_pandas().head()

Unnamed: 0,question1,question2,label,idx
0,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0,0
1,How do I control my horny emotions?,How do you control your horniness?,1,1
2,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0,2
3,What can one do after MBBS?,What do i do after my MBBS ?,1,3
4,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0,4


In [8]:
dataset['train'].to_pandas().tail()

Unnamed: 0,question1,question2,label,idx
363841,How do I make money flying my drone?,How can I use a dji phantom to make money,1,363841
363842,What can you do with an economics degree?,What jobs can you get with an economics degree?,1,363842
363843,What type of current does a battery produce?,How does a generator work and produce current?,0,363843
363844,Grammar: What is difference between schedule a...,How do I understand the difference between the...,0,363844
363845,What is the easiest way to earn money using in...,How can I earn money online easily?,1,363845


## Checking for missing, null and duplicate values

In [9]:
dataset['train'].to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363846 entries, 0 to 363845
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   question1  363846 non-null  object
 1   question2  363846 non-null  object
 2   label      363846 non-null  int64 
 3   idx        363846 non-null  int32 
dtypes: int32(1), int64(1), object(2)
memory usage: 9.7+ MB


In [10]:
dataset['train'].to_pandas().isnull().sum()

question1    0
question2    0
label        0
idx          0
dtype: int64

In [11]:
dataset['train'].to_pandas().duplicated().sum()

0

## Counting Total number of unique words in the QQP data

In [12]:
from transformers import DistilBertTokenizer  # importing tokenizer

# Load tokenizer and dataset
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Collect unique tokens from all splits
unique_tokens = set()
splits = ['train', 'validation', 'test']

for split in splits:
    for example in dataset[split]:
        if example.get('question1'):
            unique_tokens.update(tokenizer.tokenize(example['question1']))
        if example.get('question2'):
            unique_tokens.update(tokenizer.tokenize(example['question2']))

# Save unique tokens to a file
with open("unique_tokens.txt", "w") as file:
    for token in sorted(unique_tokens):
        file.write(f"{token}\n")

print(f"Total unique tokens: {len(unique_tokens)}")

Total unique tokens: 27010


## Calculating the optimal no. of embedding dimensions

In [13]:
# Calculate optimal embedding dimensions
nw = len(unique_tokens)
dimensions = min(10 * np.ceil(np.log10(nw)), 300)
print(f"Total unique tokens: {nw}")
print(f"Optimal number of dimensions per token: {int(dimensions)}")

Total unique tokens: 27010
Optimal number of dimensions per token: 50


## Spliting data

In [14]:
train_df = dataset['train'].to_pandas()
validation_df = dataset['validation'].to_pandas()
test_df = dataset['test'].to_pandas()

In [15]:
print(f"Shape of training data is : {train_df.shape}")
print(f"Shape of testing data is : {test_df.shape}")
print(f"Shape of validation data is : {validation_df.shape}")

Shape of training data is : (363846, 4)
Shape of testing data is : (390965, 4)
Shape of validation data is : (40430, 4)


# Embedding Generation

In [38]:
# Models for embeddings
models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "bert-base-uncased",
    "roberta-base",
]

In [17]:
import torch.nn.functional as F

def get_embeddings(text, model_name, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state[:, 0, :]  # Extract CLS token embedding

    # Project to 50 dimensions if needed
    if embedding.shape[1] != 50:
        embedding = F.adaptive_avg_pool1d(embedding.unsqueeze(0), 50).squeeze(0)

    return embedding.numpy().flatten()

In [29]:
import random

# Set seed for reproducibility
random.seed(42)

# Initialize embedding data dictionary for all splits
embedding_data = {
    "train": {},
    "test": {},
    "validation": {}
}

# Loop through models
for model_name in models:
    print(f"\nProcessing embeddings for model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    for split in ['train', 'test', 'validation']:
        if split not in dataset:
            print(f"Skipping {split} split as it's not in the dataset.")
            continue

        print(f"  - Processing {split} split")

        q1_list = dataset[split]['question1']
        q2_list = dataset[split]['question2']

        # Randomly sample 1000 indices (or less if not enough data)
        sample_size = min(1000, len(q1_list))
        indices = random.sample(range(len(q1_list)), sample_size)

        sampled_q1 = [q1_list[i] for i in indices]
        sampled_q2 = [q2_list[i] for i in indices]

        # Generate embeddings for sampled questions
        q1_embeddings = np.array([get_embeddings(q, model_name, tokenizer, model) for q in sampled_q1])
        q2_embeddings = np.array([get_embeddings(q, model_name, tokenizer, model) for q in sampled_q2])

        # Save to disk
        model_name_safe = model_name.replace("/", "_")
        np.save(f"{model_name_safe}_{split}_q1.npy", q1_embeddings)
        np.save(f"{model_name_safe}_{split}_q2.npy", q2_embeddings)

        # Store in dict
        embedding_data[split][model_name] = (q1_embeddings, q2_embeddings)


Processing embeddings for model: sentence-transformers/all-MiniLM-L6-v2
  - Processing train split
  - Processing test split
  - Processing validation split

Processing embeddings for model: sentence-transformers/paraphrase-MiniLM-L6-v2


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  - Processing train split
  - Processing test split
  - Processing validation split

Processing embeddings for model: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  - Processing train split
  - Processing test split
  - Processing validation split

Processing embeddings for model: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  - Processing train split
  - Processing test split
  - Processing validation split

Processing embeddings for model: microsoft/deberta-v3-base


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [31]:
import random

# Set same seed for reproducibility
random.seed(42)

# Initialize label data dictionary
label_data = {
    "train": None,
    "test": None,
    "validation": None
}

# Function to sample labels
def sample_labels_from_dataset(dataset, sample_size=1000):
    sampled_labels_per_split = {}

    for split in ['train', 'test', 'validation']:
        if split not in dataset:
            print(f"Skipping {split} split as it's not in the dataset.")
            continue

        labels = dataset[split]['label']
        sample_size_actual = min(sample_size, len(labels))

        # Resample using same seed to get same indices
        indices = random.sample(range(len(labels)), sample_size_actual)
        sampled_labels = [labels[i] for i in indices]

        sampled_labels_per_split[split] = sampled_labels

    return sampled_labels_per_split

# Run it and store
label_data = sample_labels_from_dataset(dataset)

In [None]:
label_data

In [30]:
embedding_data

{'train': {'sentence-transformers/all-MiniLM-L6-v2': (array([[-0.02449382,  0.00483341, -0.14260648, ..., -0.12394938,
            0.1475454 , -0.01830261],
          [-0.03890347, -0.01542168, -0.0395979 , ...,  0.05827474,
           -0.01583578,  0.0274667 ],
          [ 0.03526154,  0.23093592,  0.16593309, ..., -0.04491486,
           -0.14556651, -0.07633272],
          ...,
          [ 0.06438376, -0.1007113 ,  0.13367853, ...,  0.00330999,
            0.1301008 , -0.00107935],
          [-0.07746263, -0.13325864, -0.01516933, ...,  0.00631686,
           -0.13136286, -0.10768058],
          [-0.00115653,  0.04987596,  0.15073262, ..., -0.036385  ,
            0.04860617, -0.00865657]], dtype=float32),
   array([[ 9.83612612e-04, -4.09793202e-03, -1.78865224e-01, ...,
           -1.29865631e-01,  1.36774704e-01, -9.39817354e-03],
          [ 3.87135595e-02, -5.84291667e-02, -1.63736448e-04, ...,
            9.72618163e-02,  6.76678121e-02, -5.06729260e-03],
          [-1.5415369

# Model Development

 ### using Base ensmble model as Decision tree

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def train_and_evaluate_from_embeddings(embedding_data, label_data):
    for model_name in embedding_data['train']:
        print(f"\n--- Evaluating model: {model_name} ---")

        # Get embeddings
        q1_train, q2_train = embedding_data['train'][model_name]
        q1_test, q2_test = embedding_data['test'][model_name]

        # Combine embeddings (concatenate q1 and q2)
        X_train = np.concatenate([q1_train, q2_train], axis=1)
        X_test = np.concatenate([q1_test, q2_test], axis=1)

        # Get labels for this split
        y_train = label_data['train']
        y_test = label_data['test']

        # Safety check in case label and embedding lengths don't match
        if len(X_train) != len(y_train) or len(X_test) != len(y_test):
            print(f"Mismatch in data and label sizes for model {model_name}. Skipping.")
            continue

        # Train classifier
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = clf.predict(X_test)
        print("Accuracy:", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred))


#### compute similarity instead of using embeddings directly

In [40]:
# --- More complex similarity features ---
def compute_advanced_similarity_features(q1_emb, q2_emb):
    features = []
    for q1, q2 in zip(q1_emb, q2_emb):
        cos_sim = cosine_similarity(q1.reshape(1, -1), q2.reshape(1, -1))[0][0]
        euc_dist = euclidean_distances(q1.reshape(1, -1), q2.reshape(1, -1))[0][0]
        dot_prod = np.dot(q1, q2)
        l1_dist = np.sum(np.abs(q1 - q2))
        abs_diff = np.abs(q1 - q2)
        sq_diff = (q1 - q2) ** 2

        combined = np.concatenate((
            [cos_sim, euc_dist, dot_prod, l1_dist],
            abs_diff,
            sq_diff
        ))
        features.append(combined)
    return np.array(features)


In [41]:
# --- Build feature set from all models (train split) ---
feature_list = []
for model_name in models:
    q1_emb, q2_emb = embedding_data['train'][model_name]
    features = compute_advanced_similarity_features(q1_emb, q2_emb)
    feature_list.append(features)

# Combine features from all models
X = np.hstack(feature_list)

# Use the pre-sampled aligned labels
y = label_data['train']


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split into train/test subsets from constructed X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       120
           1       0.76      0.69      0.72        80

    accuracy                           0.79       200
   macro avg       0.78      0.77      0.78       200
weighted avg       0.79      0.79      0.79       200



In [43]:
# Save model
with open("ensemble_model.pkl", "wb") as f:
    pickle.dump(clf, f)

#### Use embeddings directly

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# --- Combine raw embeddings from all models (concatenation) ---
full_embeddings = []
for model_name in models:
    q1_emb, q2_emb = embedding_data['train'][model_name]  
    combined = np.hstack((q1_emb, q2_emb))  # Direct concatenation
    full_embeddings.append(combined)

# Stack all model embeddings into one feature vector
X = np.hstack(full_embeddings)

# Use aligned labels
y = label_data['train']

# --- Train & evaluate ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.605
              precision    recall  f1-score   support

           0       0.61      0.97      0.75       120
           1       0.57      0.05      0.09        80

    accuracy                           0.60       200
   macro avg       0.59      0.51      0.42       200
weighted avg       0.59      0.60      0.49       200



In [None]:
X.shape

In [None]:
# Save model
with open("ensemble_model_concat.pkl", "wb") as f:
    pickle.dump(clf, f)

#### using CCA

In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cross_decomposition import CCA
import numpy as np
import pickle

# --- Apply CCA to reduce redundancy between q1 and q2 embeddings across all models ---
full_embeddings_q1 = []
full_embeddings_q2 = []

for model_name in models:
    q1_emb, q2_emb = embedding_data['train'][model_name]  # ✅ Access correct split
    full_embeddings_q1.append(q1_emb)
    full_embeddings_q2.append(q2_emb)

X_q1 = np.hstack(full_embeddings_q1)
X_q2 = np.hstack(full_embeddings_q2)

# Determine safe upper bound for CCA components
max_components = min(X_q1.shape[0], X_q1.shape[1], X_q2.shape[1])
n_components = min(200, max_components)  # You can adjust the target value here
cca = CCA(n_components=n_components, max_iter=1000)  # or higher if needed
X_q1_cca, X_q2_cca = cca.fit_transform(X_q1, X_q2)


# Final feature vector
X = np.hstack((X_q1_cca, X_q2_cca))

# Use pre-sampled, aligned labels
y = label_data['train']

# --- Train & evaluate model ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model to file
with open("ensemble_model_CCA.pkl", "wb") as f:
    pickle.dump(clf, f)
    
with open("cca_transform.pkl", "wb") as f:
    pickle.dump(cca, f)


Accuracy: 0.595
              precision    recall  f1-score   support

           0       0.60      0.99      0.75       120
           1       0.00      0.00      0.00        80

    accuracy                           0.59       200
   macro avg       0.30      0.50      0.37       200
weighted avg       0.36      0.59      0.45       200



In [None]:
X_q1_cca.shape


In [None]:
# Save the transformed embeddings as NumPy files
np.save("X_q1_cca.npy", X_q1_cca)
np.save("X_q2_cca.npy", X_q2_cca)
np.save("X_cca.npy", X)  # Final combined feature vector
np.save("y_labels.npy", y)  # Labels

### using Base ensmble model as Logistic Regression

# Model Testing