#### Task1

#### Import Libraries and Pretrained Model


In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from datasets import load_dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
import pandas as pd
import plotly.express as px
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


#### Load Datasets

In [None]:

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

dataset = load_dataset("bookcorpus", split="train[:1%]")  

print(dataset[0])  


{'text': 'usually , he would be tearing around the living room , playing with his toys .'}


#### Tokenization

In [None]:
def encode_sentences(sentences, tokenizer, model):
    """
    Tokenizes sentences and returns sentence embeddings using a pretrained transformer model.
    """
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over tokens from last hidden layer
    embeddings = outputs.last_hidden_state.mean(dim=1)  

    return embeddings
sample_sentences = dataset['text'][:20]  
embeddings = encode_sentences(sample_sentences, tokenizer, model)
embeddings_np = embeddings.numpy()
print(f"Embedding for sentence: {sample_sentences[0]}")
print(embeddings_np[0]) 


Embedding for sentence: usually , he would be tearing around the living room , playing with his toys .
[ 2.30240554e-01  2.17634931e-01 -2.30480507e-02  3.15085292e-01
  2.91219145e-01 -3.75583172e-01 -1.76689520e-01  3.25360209e-01
 -2.34617725e-01 -3.77027169e-02 -1.33780628e-01 -1.96146011e-01
 -1.82651266e-01  2.03661680e-01 -1.52843311e-01  1.86633348e-01
  3.19304615e-01  1.92644313e-01 -1.05207711e-01  2.99984425e-01
  1.69351678e-02 -1.06273144e-01 -2.86487937e-01  1.42757997e-01
  2.25143611e-01 -1.27832353e-01 -1.50291428e-01 -9.98070315e-02
 -5.64656481e-02 -2.04160735e-01  7.02160671e-02  1.08491760e-02
  2.62546688e-01 -1.45309508e-01 -4.52080965e-02 -1.71361640e-01
 -5.42785525e-02 -1.01067863e-01 -3.12309414e-01  3.60523053e-02
 -2.09291637e-01 -1.54354811e-01 -1.27532691e-01  8.93664062e-02
  4.62390408e-02 -1.75261810e-01  3.67236674e-01 -1.76905230e-01
 -2.52659922e-03 -2.78656870e-01  3.67837995e-02  3.37552100e-01
  5.42776436e-02  1.19919397e-01 -2.39945296e-02  2.

In [None]:
# Cosine Similarity Calculation
sentence1 = "I love programming."
sentence2 = "I enjoy coding."
sentence3 = "The weather is nice today."
embeddings1 = encode_sentences([sentence1], tokenizer, model)
embeddings2 = encode_sentences([sentence2], tokenizer, model)
embeddings3 = encode_sentences([sentence3], tokenizer, model)
similarity_1_2 = cosine_similarity(embeddings1, embeddings2)
similarity_1_3 = cosine_similarity(embeddings1, embeddings3)
print(f"Cosine similarity between Sentence 1 and Sentence 2: {similarity_1_2[0][0]}")
print(f"Cosine similarity between Sentence 1 and Sentence 3: {similarity_1_3[0][0]}")


Cosine similarity between Sentence 1 and Sentence 2: 0.9201480746269226
Cosine similarity between Sentence 1 and Sentence 3: 0.731374204158783


In [None]:
# Calculation of cosine similarities between query and corpus
queries = ["I love programming.", "The weather is great.","Who is the author of this book?", "Why are you here?","Just Kidding"]
corpus =  ["I enjoy outdoor activities.", "It's sunny outside today.", "I like solving problems.","Coding is my passion."]
query_embeddings = encode_sentences(queries, tokenizer, model)
corpus_embeddings = encode_sentences(corpus, tokenizer, model)
similarities = cosine_similarity(query_embeddings, corpus_embeddings)

for i, query in enumerate(queries):
    most_similar_idx = similarities[i].argmax()
    print(f"Query: '{query}'\nMost similar sentence: '{corpus[most_similar_idx]}'\nSimilarity: {similarities[i][most_similar_idx]:.4f}\n")


Query: 'I love programming.'
Most similar sentence: 'Coding is my passion.'
Similarity: 0.8707

Query: 'The weather is great.'
Most similar sentence: 'It's sunny outside today.'
Similarity: 0.8411

Query: 'Who is the author of this book?'
Most similar sentence: 'It's sunny outside today.'
Similarity: 0.6429

Query: 'Why are you here?'
Most similar sentence: 'I like solving problems.'
Similarity: 0.7068

Query: 'Just Kidding'
Most similar sentence: 'Coding is my passion.'
Similarity: 0.6990



#### Infering Model on different DataSet

In [22]:
model_name = "distilbert-base-uncased"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def encode_sentences(sentences, tokenizer, model):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt", max_length=128)
    with torch.no_grad():  
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  
    return embeddings
sts_dataset = load_dataset("glue", "stsb")

sentence_pairs = sts_dataset['test'][:10] 

embeddings_1 = encode_sentences(sentence_pairs['sentence1'], tokenizer, model)
embeddings_2 = encode_sentences(sentence_pairs['sentence2'], tokenizer, model)

cos_similarities = cosine_similarity(embeddings_1, embeddings_2)
ground_truth = sentence_pairs['label']  
cos_similarities_flat = cos_similarities.diagonal()

ground_truth_flat = ground_truth[:len(cos_similarities_flat)]
mse = mean_squared_error(ground_truth_flat, cos_similarities_flat)
print(f"Mean Squared Error on STS-B: {mse}")


Mean Squared Error on STS-B: 3.8571770870588424


#### Sentence Embedding

In [23]:
# Reduce dimensionality to 2D for visualization using PCA
print("Original shape of embeddings:", embeddings_np.shape)
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings_np)
df_embeddings = pd.DataFrame(reduced_embeddings, columns=['PC1', 'PC2'])
df_embeddings['Sentence'] = [f"Sentence {i+1}" for i in range(len(sample_sentences))]
fig = px.scatter(df_embeddings, x='PC1', y='PC2', text='Sentence', title="Sentence Embeddings - PCA Projection")
fig.show()

Original shape of embeddings: (20, 768)


#### Questions from Requirement

In [8]:
# Checking if The model is encoding input sentences into fixed-length embeddings
for i, embedding in enumerate(embeddings_np):
    print(f"Sentence {i+1} embedding shape: {embedding.shape}")


Sentence 1 embedding shape: (768,)
Sentence 2 embedding shape: (768,)
Sentence 3 embedding shape: (768,)
Sentence 4 embedding shape: (768,)
Sentence 5 embedding shape: (768,)
Sentence 6 embedding shape: (768,)
Sentence 7 embedding shape: (768,)
Sentence 8 embedding shape: (768,)
Sentence 9 embedding shape: (768,)
Sentence 10 embedding shape: (768,)
Sentence 11 embedding shape: (768,)
Sentence 12 embedding shape: (768,)
Sentence 13 embedding shape: (768,)
Sentence 14 embedding shape: (768,)
Sentence 15 embedding shape: (768,)
Sentence 16 embedding shape: (768,)
Sentence 17 embedding shape: (768,)
Sentence 18 embedding shape: (768,)
Sentence 19 embedding shape: (768,)
Sentence 20 embedding shape: (768,)


In [None]:
'''
There are several important decisions made about the model's design beyond the main transformer part. These include:

a. Pooling Strategy (Mean Pooling): After sentences pass through the transformer model, the code takes the embeddings from the last hidden state. It combines these embeddings using mean pooling, which means calculating the average of all token embeddings for a sentence. This decision is crucial for turning token-level embeddings into a fixed-size sentence embedding. By averaging, it simplifies complex sentence data into a single, manageable size without losing important information.

b. Dimensionality Reduction (PCA): The code also uses Principal Component Analysis (PCA) to reduce the sentence embeddings to two dimensions, aiding visualization. This step is separate from the transformer's design and is done to help visualize complex, high-dimensional sentence embeddings in a simpler form. PCA is a widely-used method for reducing dimensions that keeps essential information, making it easier to interpret the data visually.

c. Sentence Encoding Length (max_length): In the function called encode_sentences, there's a decision to limit sentence length to a maximum of 128 tokens. This choice affects how the tokenizer handles longer sentences, as it may cut off parts of the sentence beyond this limit. The max_length value can be adjusted depending on the specific needs of the application, allowing flexibility in how much sentence data is kept for processing. This choice helps in balancing between processing efficiency and retaining sentence details.
'''

"\nThere are a few choices made regarding the model architecture outside of the transformer backbone. These include:\na. Pooling Strategy (Mean Pooling):After passing the sentences through the transformer model, the code extracts the embeddings from the last hidden state.\nThe embeddings are then aggregated using mean pooling over the tokens (i.e., taking the average of all token embeddings for a given sentence).\nThis is an important choice for how to convert the token-level embeddings into a fixed-size sentence embedding. \nb. Dimensionality Reduction (PCA): The code also applies Principal Component Analysis (PCA) to reduce the dimensionality of the sentence embeddings to 2D \nfor visualization purposes. This choice of dimensionality reduction technique is independent of the transformer model's architecture and is done to visualize \nthe high-dimensional sentence embeddings. PCA is a common technique for dimensionality reduction that projects high-dimensional data into a lower-dimens

#### Task2

##### Task A

In [None]:
dataset = load_dataset("imdb")
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()
train_df = train_df.rename(columns={"label": "target", "text": "text"})
test_df = test_df.rename(columns={"label": "target", "text": "text"})
train_df["target"] = train_df["target"].map({0: 0, 1: 1})
test_df["target"] = test_df["target"].map({0: 0, 1: 1})
print(train_df.head())


                                                text  target
0  I rented I AM CURIOUS-YELLOW from my video sto...       0
1  "I Am Curious: Yellow" is a risible and preten...       0
2  If only to avoid making this type of film in t...       0
3  This film was probably inspired by Godard's Ma...       0
4  Oh, brother...after hearing about this ridicul...       0


In [None]:

class MultiTaskSentenceTransformer(nn.Module):
    def __init__(self, base_model, num_classes=2):
        super(MultiTaskSentenceTransformer, self).__init__()
        self.encoder = base_model  # DistilBERT
        self.classifier = nn.Linear(base_model.config.hidden_size, num_classes)  

    def forward(self, input_ids, attention_mask, task="embedding"):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding = outputs.last_hidden_state.mean(dim=1)  #

        if task == "classification":
            logits = self.classifier(sentence_embedding)  
            return logits
        elif task == "embedding":
            return sentence_embedding  

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
base_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

model = MultiTaskSentenceTransformer(base_model)


In [None]:

class IMDBDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding="max_length", truncation=True, 
                                  max_length=self.max_length, return_tensors="pt")
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return input_ids, attention_mask, label
 
train_dataset = IMDBDataset(train_df["text"].tolist(), train_df["target"].tolist(), tokenizer)
test_dataset = IMDBDataset(test_df["text"].tolist(), test_df["target"].tolist(), tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [None]:
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


MultiTaskSentenceTransformer(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
scaler = GradScaler()
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

# Training Loop
num_epochs = 3
gradient_accumulation_steps = 2  
for epoch in range(num_epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for step, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
        input_ids, attention_mask, labels = input_ids.to(device, non_blocking=True), \
                                            attention_mask.to(device, non_blocking=True), \
                                            labels.to(device, non_blocking=True)

        optimizer.zero_grad()
        with autocast():  
            logits = model(input_ids, attention_mask, task="classification")
            loss = criterion(logits, labels) / gradient_accumulation_steps  
        scaler.scale(loss).backward()

        if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
            scaler.step(optimizer)
            scaler.update()
        with torch.no_grad():
            _, preds = torch.max(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        total_loss += loss.item()

    accuracy = correct / total
    print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Accuracy={accuracy:.4f}")
torch.cuda.empty_cache()


Using device: cuda



`torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.


`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.



Epoch 1: Loss=122.1516, Accuracy=0.8638
Epoch 2: Loss=83.5314, Accuracy=0.9150
Epoch 3: Loss=63.6314, Accuracy=0.9400


In [None]:
def evaluate_model(model, dataloader):
    model.eval()
    all_targets = []
    all_predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            logits = model(input_ids, attention_mask, task="classification")
            _, preds = torch.max(logits, dim=1)

            all_targets.extend(labels.cpu().numpy())
            all_predictions.extend(preds.cpu().numpy())

    accuracy = accuracy_score(all_targets, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(all_targets, all_predictions, average="weighted")

    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = evaluate_model(model, test_dataloader)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9065
Precision: 0.9074
Recall: 0.9065
F1-Score: 0.9065


In [None]:
def predict_sentiment(model, sentences):
    model.eval()
    inputs = tokenizer(sentences, padding=True, truncation=True, max_length=256, return_tensors="pt")
    input_ids, attention_mask = inputs["input_ids"].to(device), inputs["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask, task="classification")
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
    
    sentiment_labels = {0: "Negative", 1: "Positive"}
    for sentence, pred in zip(sentences, predictions):
        print(f"Sentence: '{sentence}' → Sentiment: {sentiment_labels[pred]}")


test_sentences = [
    "This movie was absolutely fantastic! I loved every second of it.",
    "I hate this film. It was the worst experience ever!",
    "The acting was decent, but the storyline was just average.",
    "The meeting was not attended by the manager.",
    "The movie is not exceptional, but can be seen a few times "
]

predict_sentiment(model, test_sentences)

Sentence: 'This movie was absolutely fantastic! I loved every second of it.' → Sentiment: Positive
Sentence: 'I hate this film. It was the worst experience ever!' → Sentiment: Negative
Sentence: 'The acting was decent, but the storyline was just average.' → Sentiment: Negative
Sentence: 'The meeting was not attended by the manager.' → Sentiment: Negative
Sentence: 'The movie is not exceptional, but can be seen a few times ' → Sentiment: Positive


#### Task B

In [None]:
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Hawking was a theoretical physicist at the University of Cambridge."

ner_results = nlp_ner(text)

print("Named Entity Recognition results:")
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Confidence: {entity['score']:.4f}")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Named Entity Recognition results:
Entity: Hawk, Label: I-PER, Confidence: 0.9982
Entity: ##ing, Label: I-PER, Confidence: 0.9963
Entity: University, Label: I-ORG, Confidence: 0.9775
Entity: of, Label: I-ORG, Confidence: 0.9815
Entity: Cambridge, Label: I-ORG, Confidence: 0.9907


In [None]:
from transformers import BertTokenizer, BertForTokenClassification, DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import pipeline

model_name_ner = "dbmdz/bert-large-cased-finetuned-conll03-english"  
tokenizer = BertTokenizer.from_pretrained(model_name_ner)
model_ner = BertForTokenClassification.from_pretrained(model_name_ner)

nlp_ner = pipeline("ner", model=model_ner, tokenizer=tokenizer)

model_name_sentiment = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_model = DistilBertForSequenceClassification.from_pretrained(model_name_sentiment)

sentiment_tokenizer = DistilBertTokenizer.from_pretrained(model_name_sentiment)

sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

text = "Hawking was a theoretical physicist at the University of Cambridge. I love studying AI!"

ner_results = nlp_ner(text)

sentiment_results = sentiment_pipeline(text)

print("Named Entity Recognition results:")
for entity in ner_results:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}, Confidence: {entity['score']:.4f}")

print("\nSentiment Analysis results:")
for sentiment in sentiment_results:
    print(f"Label: {sentiment['label']}, Confidence: {sentiment['score']:.4f}")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Device set to use cuda:0


Named Entity Recognition results:
Entity: Hawk, Label: I-PER, Confidence: 0.9977
Entity: ##ing, Label: I-PER, Confidence: 0.9946
Entity: University, Label: I-ORG, Confidence: 0.9566
Entity: of, Label: I-ORG, Confidence: 0.9778
Entity: Cambridge, Label: I-ORG, Confidence: 0.9764
Entity: AI, Label: I-MISC, Confidence: 0.9890

Sentiment Analysis results:
Label: POSITIVE, Confidence: 0.9988


In [None]:
'''
Changes Made to Architecture to support multi-task learning

Model Architecture: Designed a model called MultiTaskSentenceTransformer. This model can do two things: create sentence embeddings or provide classification results, depending on what is needed.
Dataset Class: Created a special class to prepare the IMDB dataset. This class helps break down sentences into smaller parts called tokens and makes input-output pairs for both sentence embedding and classification tasks.
Training and Evaluation: Made the training process more efficient by using mixed-precision, which speeds it up. Also added a function to check how well the model is doing by measuring its performance. This model design lets the model handle multiple tasks at once, such as creating sentence embeddings and classifying data, using just one model.
Training Pipeline: Enhanced the training loop to deal with different tasks and their specific loss calculations.
Evaluation Pipeline: Introduced functions to the evaluation process that calculate important performance metrics based on the task being performed.

'''

'\nChanges Made to Architecture to support multi-task learning\n\nModel Architecture: Designed a model called MultiTaskSentenceTransformer. This model can do two things: create sentence embeddings or provide classification results, depending on what is needed.\n\nDataset Class: Created a special class to prepare the IMDB dataset. This class helps break down sentences into smaller parts called tokens and makes input-output pairs for both sentence embedding and classification tasks.\n\nTraining and Evaluation: Made the training process more efficient by using mixed-precision, which speeds it up. Also added a function to check how well the model is doing by measuring its performance. This model design lets the model handle multiple tasks at once, such as creating sentence embeddings and classifying data, using just one model.\n\nTraining Pipeline: Enhanced the training loop to deal with different tasks and their specific loss calculations.\n\nEvaluation Pipeline: Introduced functions to t