<a href="https://colab.research.google.com/github/Geethss/Precog-recruitment-task/blob/main/Sentence_Similarity_and_Bonus_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_md
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from datasets import load_dataset

nlp = spacy.load("en_core_web_lg")
dataset = load_dataset("paws-x", "en")

#extracting data from the huggingface dataset
train_features = [(example["sentence1"], example["sentence2"]) for example in dataset["train"]]
train_labels = [example["label"] for example in dataset["train"]]

dev_features = [(example["sentence1"], example["sentence2"]) for example in dataset["validation"]]
dev_labels = [example["label"] for example in dataset["validation"]]

test_features = [(example["sentence1"], example["sentence2"]) for example in dataset["test"]]
test_labels = [example["label"] for example in dataset["test"]]

#function to calculate the average similarity score
def calculate_similarity_score(sent1, sent2):
    doc1 = nlp(sent1)
    doc2 = nlp(sent2)
    weighted_avg_similarity = doc1.similarity(doc2) * len(doc1) * len(doc2)
     return weighted_avg_similarity


train_similarity_scores = [calculate_similarity_score(sent1, sent2) for sent1, sent2 in train_features]
train_sentences = [f"{sent1} {sent2}" for sent1, sent2 in train_features]

#training logistic regression model
model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=500))
model.fit(train_sentences, train_labels)

dev_similarity_scores = [calculate_similarity_score(sent1, sent2) for sent1, sent2 in dev_features]

#Combining original sentences to make predictions on the dev set
dev_sentences = [f"{sent1} {sent2}" for sent1, sent2 in dev_features]
dev_predictions = model.predict(dev_sentences)

dev_accuracy = accuracy_score(dev_labels, dev_predictions)
print(f"Dev Accuracy: {dev_accuracy:.4f}")

test_similarity_scores = [calculate_similarity_score(sent1, sent2) for sent1, sent2 in test_features]

#Combining original sentences to make predictions on the test set
test_sentences = [f"{sent1} {sent2}" for sent1, sent2 in test_features]
test_predictions = model.predict(test_sentences)

#printing the final result
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy:.4f}")


2024-01-02 08:57:48.551248: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-02 08:57:48.551306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-02 08:57:48.553190: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now lo

Bonus Task

In [None]:
!pip install transformers datasets

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
from datasets import load_dataset

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dataset = load_dataset("paws-x", "en")

class SentenceSimilarityDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pair = self.data[idx]
        encoding = self.tokenizer(pair["sentence1"], pair["sentence2"], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(pair["label"])
        }

train_dataset = SentenceSimilarityDataset(dataset["train"], tokenizer)
dev_dataset = SentenceSimilarityDataset(dataset["validation"], tokenizer)
test_dataset = SentenceSimilarityDataset(dataset["test"], tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

#Training configuration
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

#Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    #Validation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dev_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    dev_accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch + 1}/{num_epochs} - Dev Accuracy: {dev_accuracy:.4f}")

#Testing
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Dev Accuracy: 0.8980
Epoch 2/3 - Dev Accuracy: 0.9145
Epoch 3/3 - Dev Accuracy: 0.9170
Test Accuracy: 0.9275


In [1]:
!pip install transformers datasets

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.nn import functional as F
from sklearn.metrics import accuracy_score
from datasets import load_dataset

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # binary classification

dataset = load_dataset("PiC/phrase_similarity")

class PhraseSimilarityDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pair = self.data[idx]
        encoding = self.tokenizer(pair["phrase1"], pair["phrase2"], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        label = 1 if pair["label"] == "positive" else 0
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }

train_dataset = PhraseSimilarityDataset(dataset["train"], tokenizer)
dev_dataset = PhraseSimilarityDataset(dataset["validation"], tokenizer)
test_dataset = PhraseSimilarityDataset(dataset["test"], tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

#Training configuration
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

#Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = F.cross_entropy(outputs.logits, labels)
        loss.backward()
        optimizer.step()

    #Validation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dev_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    dev_accuracy = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch + 1}/{num_epochs} - Dev Accuracy: {dev_accuracy:.4f}")

#Testing
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")






Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Epoch 1/3 - Dev Accuracy: 1.0000
Epoch 2/3 - Dev Accuracy: 1.0000
Epoch 3/3 - Dev Accuracy: 1.0000
Test Accuracy: 1.0000
