<a href="https://colab.research.google.com/github/KaiTheWizard/ParaPhraseDetectProject/blob/main/Copy_of_Welcome_to_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and Loading Libraries

In [None]:
!pip install datasets transformers scikit-learn torch pandas

In [None]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Data Loading and Preprocessing


In [None]:
# Load QQP dataset
# Only the "train" split of GLUE/QQP is fetched here and converted to a pandas
# DataFrame for the inspection cells that follow.
# NOTE(review): `dataset` is rebound by a later `load_dataset("glue", "qqp")`
# call, so this DataFrame is only what `dataset` refers to until that cell runs.
dataset = load_dataset("glue", "qqp", split="train").to_pandas()

In [None]:
dataset.shape

## Displaying the first and last 5 samples of the training data

In [None]:
dataset.head()

In [None]:
dataset.tail()

## Checking for missing, null and duplicate values

In [None]:
dataset.info()

In [None]:
dataset.isnull().sum()

In [None]:
dataset.duplicated().sum()

## Counting the total number of unique tokens in the QQP data

In [None]:
from transformers import DistilBertTokenizer  # importing tokenizer
from datasets import load_dataset

# Load tokenizer and dataset
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# NOTE: this rebinds `dataset` from the train-split DataFrame loaded earlier to
# the multi-split object returned when no `split=` is given; from here on it is
# indexed by split name ('train', 'validation', 'test'), not by column name.
dataset = load_dataset("glue", "qqp")  # Make sure this is included

# Collect unique tokens from all splits
unique_tokens = set()
splits = ['train', 'validation', 'test']
question_columns = ('question1', 'question2')

for split in splits:
    for example in dataset[split]:
        for column in question_columns:
            text = example.get(column)
            # Skip missing / empty questions instead of tokenizing them.
            if text:
                unique_tokens.update(tokenizer.tokenize(text))

# Save unique tokens to a file, one per line in sorted order.
# An explicit UTF-8 encoding keeps the write independent of the platform's
# default locale encoding (tokens are not guaranteed to be ASCII).
with open("unique_tokens.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{token}\n" for token in sorted(unique_tokens))

print(f"Total unique tokens: {len(unique_tokens)}")

Total unique tokens: 25677


## Calculating the optimal no. of embedding dimensions

In [None]:
# Heuristic embedding size: 10 dimensions per order of magnitude of the
# vocabulary size (rounded up), never more than 300.
nw = len(unique_tokens)
vocab_magnitude = np.ceil(np.log10(nw))
dimensions = min(10 * vocab_magnitude, 300)

print(f"Total unique tokens: {nw}")
print(f"Optimal number of dimensions per token: {int(dimensions)}")

## Splitting data

In [None]:
# Materialise each QQP split as its own pandas DataFrame.
train_df, validation_df, test_df = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Report (rows, columns) for every split, in the order train / test / validation.
for split_label, split_frame in (
    ("training", train_df),
    ("testing", test_df),
    ("validation", validation_df),
):
    print(f"Shape of {split_label} data is : {split_frame.shape}")

# Embedding Generation

In [None]:
# Models for embeddings
# Each entry is a model identifier that the embedding loop passes to
# AutoTokenizer.from_pretrained / AutoModel.from_pretrained; the same string is
# also used as the key into `embedding_data`.
# NOTE(review): the embedding helper takes the position-0 hidden state from every
# model; confirm that is the intended pooling for the sentence-transformers
# checkpoints as well.
models = [
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-v3-base"
]

In [None]:
import torch.nn.functional as F

def get_embeddings(text, model_name, tokenizer, model, target_dim=50):
    """Return a fixed-size embedding for ``text`` as a flat NumPy vector.

    The text is tokenized (truncated to 128 tokens), run through ``model``
    without gradient tracking, and the hidden state at sequence position 0 is
    taken as the embedding. If its width differs from ``target_dim`` it is
    resized with adaptive average pooling over the hidden dimension — this is
    plain averaging of neighbouring hidden units, not a learned projection.

    Args:
        text: Input passed straight to ``tokenizer``.
        model_name: Unused by this function; kept so existing callers that pass
            it positionally keep working.
        tokenizer: Callable returning the keyword inputs for ``model`` when
            called with ``return_tensors="pt"``.
        model: Callable whose output exposes ``last_hidden_state`` indexed as
            ``[batch, position, hidden]``.
        target_dim: Width of the returned embedding per input row (default 50,
            the value that was previously hard-coded).

    Returns:
        1-D ``numpy.ndarray``: the pooled position-0 embeddings, flattened.

    Raises:
        ValueError: If ``target_dim`` is smaller than 1.
    """
    if target_dim < 1:
        raise ValueError(f"target_dim must be >= 1, got {target_dim}")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state[:, 0, :]  # Position-0 (CLS-style) token embedding

    # Resize to `target_dim` if needed. adaptive_avg_pool1d pools over the last
    # axis of a 3-D tensor, hence the temporary leading axis.
    if embedding.shape[1] != target_dim:
        embedding = F.adaptive_avg_pool1d(embedding.unsqueeze(0), target_dim).squeeze(0)

    # detach/cpu make the conversion safe even if the model lives on an
    # accelerator or the tensor is attached to a graph.
    return embedding.detach().cpu().numpy().flatten()

In [None]:
# Compute embeddings
# Number of leading training pairs to embed. The label vectors built further
# down are sliced to the first 1000 rows, so the embeddings must cover exactly
# those rows (embedding the full split one question at a time is also very slow).
N_PAIRS = 1000

# `dataset` is indexed by split name at this point, so the question text is read
# from the training DataFrame instead of `dataset['question1']`.
q1_texts = train_df['question1'].iloc[:N_PAIRS].tolist()
q2_texts = train_df['question2'].iloc[:N_PAIRS].tolist()

embedding_data = {}
for model_name in models:
    print("\n")
    print(f"Processing {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    q1_embeddings = np.array([get_embeddings(q, model_name, tokenizer, model) for q in q1_texts])
    q2_embeddings = np.array([get_embeddings(q, model_name, tokenizer, model) for q in q2_texts])

    # '/' in hub ids would otherwise be read as a directory separator.
    file_stem = model_name.replace('/', '_')
    np.save(f"{file_stem}_q1.npy", q1_embeddings)
    np.save(f"{file_stem}_q2.npy", q2_embeddings)

    embedding_data[model_name] = (q1_embeddings, q2_embeddings)

Processing sentence-transformers/all-MiniLM-L6-v2...


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Processing sentence-transformers/paraphrase-MiniLM-L6-v2...


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Processing bert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing microsoft/deberta-v3-base...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

# Model Development

### Using a decision-tree ensemble (Random Forest) as the base model

In [None]:
def train_and_evaluate(X, y, model_name="Model"):
    """Fit a random forest on an 80/20 split of ``(X, y)`` and report hold-out metrics.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels aligned with the rows of ``X``.
        model_name: Label printed in the report header.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_fit, features_eval, labels_fit, labels_eval = split

    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(features_fit, labels_fit)
    predictions = forest.predict(features_eval)

    # Header first, then accuracy and the per-class report on the held-out rows.
    print(f"--- {model_name} ---")
    print("Accuracy:", accuracy_score(labels_eval, predictions))
    print(classification_report(labels_eval, predictions))
    return forest

#### Compute similarity features instead of using the embeddings directly

In [None]:
# --- More complex similarity features ---
def compute_advanced_similarity_features(q1_emb, q2_emb):
    """Build pairwise similarity features for two aligned embedding matrices.

    For each row pair ``(q1, q2)`` the output row is laid out as::

        [cosine similarity, Euclidean distance, dot product, L1 distance,
         |q1 - q2| (d values), (q1 - q2) ** 2 (d values)]

    All quantities are computed for the whole batch at once with NumPy rather
    than one pair at a time.

    Args:
        q1_emb: Array-like of shape ``(n, d)``.
        q2_emb: Array-like of shape ``(n, d)``, row-aligned with ``q1_emb``.

    Returns:
        ``numpy.ndarray`` of shape ``(n, 4 + 2 * d)``; ``(0, 4 + 2 * d)`` when
        there are no rows.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ (a length
            mismatch would otherwise silently drop trailing rows).
    """
    q1 = np.asarray(q1_emb)
    q2 = np.asarray(q2_emb)
    if q1.ndim != 2 or q1.shape != q2.shape:
        raise ValueError(
            f"expected two 2-D arrays of identical shape, got {q1.shape} and {q2.shape}"
        )

    # Work in floating point (keeps float32 inputs as float32, promotes ints).
    dtype = np.result_type(q1.dtype, q2.dtype, np.float32)
    q1 = q1.astype(dtype, copy=False)
    q2 = q2.astype(dtype, copy=False)

    diff = q1 - q2
    abs_diff = np.abs(diff)
    sq_diff = diff ** 2

    dot_prod = np.einsum("ij,ij->i", q1, q2)  # row-wise dot product
    norm_prod = np.linalg.norm(q1, axis=1) * np.linalg.norm(q2, axis=1)
    # A zero vector has no direction; report similarity 0 instead of dividing by 0.
    cos_sim = np.divide(dot_prod, norm_prod, out=np.zeros_like(dot_prod), where=norm_prod > 0)
    euc_dist = np.sqrt(sq_diff.sum(axis=1))
    l1_dist = abs_diff.sum(axis=1)

    return np.column_stack((cos_sim, euc_dist, dot_prod, l1_dist, abs_diff, sq_diff))


In [None]:
# --- Build feature set from all models ---
feature_list = []
for model_name in models:
    q1_emb, q2_emb = embedding_data[model_name]
    features = compute_advanced_similarity_features(q1_emb, q2_emb)
    feature_list.append(features)

# One row per question pair; each model's feature block sits side by side.
X = np.hstack(feature_list)

# `dataset` is indexed by split name here, so `dataset['label']` is not
# available; take the labels from the training DataFrame and keep exactly as
# many as there are embedded pairs so X and y stay row-aligned.
y = train_df['label'].to_numpy()[:len(X)]
assert len(y) == len(X), "fewer labels than embedded question pairs"

In [None]:
# Train ensemble model
# 20% of the rows are held out for evaluation; the fixed random_state values make
# both the split and the forest repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Evaluate
# Predictions are made on the held-out rows only (X_test), then compared with
# y_test: overall accuracy first, followed by per-class precision/recall/F1.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.78
              precision    recall  f1-score   support

           0       0.81      0.87      0.84       133
           1       0.70      0.60      0.65        67

    accuracy                           0.78       200
   macro avg       0.76      0.73      0.74       200
weighted avg       0.77      0.78      0.78       200



In [None]:
# Save model
# Serialises whatever estimator `clf` is bound to when this cell runs into
# ensemble_model.pkl (binary mode is required by pickle).
# NOTE: only unpickle this file from a trusted source — loading a pickle can
# execute arbitrary code.
with open("ensemble_model.pkl", "wb") as f:
    pickle.dump(clf, f)

#### Use embeddings directly

In [None]:
# For every model, place the question-1 and question-2 embeddings side by side.
full_embeddings = [np.hstack(embedding_data[model_name]) for model_name in models]

# Final feature vector (High Dimensional): all per-model blocks joined column-wise.
X = np.hstack(full_embeddings)

# Train Random Forest on full embeddings, holding out 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate model on the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.69      0.96      0.81       133
           1       0.67      0.15      0.24        67

    accuracy                           0.69       200
   macro avg       0.68      0.56      0.52       200
weighted avg       0.68      0.69      0.62       200



In [None]:
X.shape

(1000, 500)

In [None]:
# Save model
# Serialises whatever estimator `clf` is bound to when this cell runs into
# ensemble_model_concat.pkl (binary mode is required by pickle).
# NOTE: only unpickle this file from a trusted source — loading a pickle can
# execute arbitrary code.
with open("ensemble_model_concat.pkl", "wb") as f:
    pickle.dump(clf, f)

#### using CCA

In [None]:
from sklearn.cross_decomposition import CCA

In [None]:
# Apply Canonical Correlation Analysis (CCA) to reduce redundancy
# Number of canonical dimensions kept per question (can be tuned). 50 matches
# both the stated intent of this cell and the recorded X_q1_cca shape.
CCA_COMPONENTS = 50

full_embeddings_q1 = [embedding_data[model_name][0] for model_name in models]
full_embeddings_q2 = [embedding_data[model_name][1] for model_name in models]

X_q1 = np.hstack(full_embeddings_q1)
X_q2 = np.hstack(full_embeddings_q2)

# `dataset` is indexed by split name here, so labels come from the training
# DataFrame, trimmed to the number of embedded pairs to stay row-aligned.
y = train_df['label'].to_numpy()[:len(X_q1)]
assert len(y) == len(X_q1), "fewer labels than embedded question pairs"

# Split row indices BEFORE fitting CCA so the transform is learned from the
# training rows only; fitting it on every row would leak held-out data into the
# features used for evaluation.
train_idx, test_idx = train_test_split(np.arange(len(y)), test_size=0.2, random_state=42)

cca = CCA(n_components=CCA_COMPONENTS)
cca.fit(X_q1[train_idx], X_q2[train_idx])
X_q1_cca, X_q2_cca = cca.transform(X_q1, X_q2)

X = np.hstack((X_q1_cca, X_q2_cca))  # Final feature vector after CCA
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Train ensemble model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model
with open("ensemble_model_CCA.pkl", "wb") as f:
    pickle.dump(clf, f)



Accuracy: 0.675
              precision    recall  f1-score   support

           0       0.67      1.00      0.80       133
           1       1.00      0.03      0.06        67

    accuracy                           0.68       200
   macro avg       0.84      0.51      0.43       200
weighted avg       0.78      0.68      0.55       200



In [None]:
X_q1_cca.shape


(1000, 50)

In [None]:
# Persist the CCA outputs as .npy files: the two per-question projections, the
# combined feature matrix, and the labels.
cca_artifacts = {
    "X_q1_cca.npy": X_q1_cca,
    "X_q2_cca.npy": X_q2_cca,
    "X_cca.npy": X,        # Final combined feature vector
    "y_labels.npy": y,     # Labels
}
for artifact_path, artifact in cca_artifacts.items():
    np.save(artifact_path, artifact)

### Using Logistic Regression as the base ensemble model

# Model Testing