Task 1: contrasting different tokenization approaches

In [13]:
#Import libraries
import spacy
from transformers import GPT2Tokenizer

# Load spaCy model and GPT-2 tokenizer
nlp = spacy.load("en_core_web_sm")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")



In [14]:
#sample texts
text_formal="Natural language processing is a field of artificial intelligence that focuses on the interaction between computers and humans using natural language."
text_informal="lol 😂 can’t believe how gr8 this new AI model is! #NextLevel 🚀🔥"

In [15]:
# Tokenize with spaCy
tokens_spacy_formal = [token.text for token in nlp(text_formal)]
tokens_spacy_informal = [token.text for token in nlp(text_informal)]

print(tokens_spacy_formal)
print(tokens_spacy_informal)

['Natural', 'language', 'processing', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.']
['lol', '😂', 'ca', 'n’t', 'believe', 'how', 'gr8', 'this', 'new', 'AI', 'model', 'is', '!', '#', 'NextLevel', '🚀', '🔥']


In [17]:
# Tokenize with GPT-2
tokens_gpt2_formal = tokenizer.tokenize(text_formal)
tokens_gpt2_informal = tokenizer.tokenize(text_informal)

print(tokens_gpt2_formal)
print(tokens_gpt2_informal)

['Natural', 'Ġlanguage', 'Ġprocessing', 'Ġis', 'Ġa', 'Ġfield', 'Ġof', 'Ġartificial', 'Ġintelligence', 'Ġthat', 'Ġfocuses', 'Ġon', 'Ġthe', 'Ġinteraction', 'Ġbetween', 'Ġcomputers', 'Ġand', 'Ġhumans', 'Ġusing', 'Ġnatural', 'Ġlanguage', '.']
['lol', 'ĠðŁĺ', 'Ĥ', 'Ġcan', 'âĢ', 'Ļ', 't', 'Ġbelieve', 'Ġhow', 'Ġgr', '8', 'Ġthis', 'Ġnew', 'ĠAI', 'Ġmodel', 'Ġis', '!', 'Ġ#', 'Next', 'Level', 'ĠðŁ', 'ļ', 'Ģ', 'ðŁ', 'Ķ', '¥']


# Observations:
## SPACY
-handle emojis,hashtags,informal words(gr8)in single token
## GPT2
-added (G)indicates spaces



-Represents emojis and hashtags using Unicode-like encodings

Splits informal or rare words (e.g., "gr8" into gr and 8)


# Quality of output
Formal text: GPT-2 creates subword tokens with prefixes (G indicating space), which may add unnecessary complexity . spaCy provides simpler, human-readable tokens.



Informal text: GPT-2 struggles with emojis and creative language, producing fragmented or encoded tokens. spaCy handles these more intuitively, keeping semantic coherence.
Both methods have strengths, but spaCy seems better suited for natural language analysis

Task 2: sentence embeddings


In [18]:
import numpy as np
from transformers import GPT2Model
from sklearn.metrics.pairwise import cosine_similarity


model = GPT2Model.from_pretrained('gpt2')



In [19]:
def sentence_similarity(sentence1, sentence2):


  # Tokenize and get embeddings for each sentence
  tokens1 = tokenizer(sentence1, return_tensors='pt')
  tokens2 = tokenizer(sentence2, return_tensors='pt')
  embeddings1 = model(**tokens1).last_hidden_state.mean(dim=1).detach().numpy()
  embeddings2 = model(**tokens2).last_hidden_state.mean(dim=1).detach().numpy()

  # Calculate cosine similarity
  similarity = cosine_similarity(embeddings1, embeddings2)[0][0]

  return similarity

In [21]:
#i will try with bert to see diffrence
from transformers import BertTokenizer, BertModel
import torch


# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def sentence_similarity_bert(sentence1, sentence2):


  # Tokenize and get embeddings for each sentence
  inputs1 = tokenizer(sentence1, return_tensors='pt', padding=True, truncation=True)
  inputs2 = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)
  with torch.no_grad():
    outputs1 = model(**inputs1).last_hidden_state[:, 0, :].numpy()
    outputs2 = model(**inputs2).last_hidden_state[:, 0, :].numpy()

  #  cosine similarity
  similarity = cosine_similarity(outputs1, outputs2)[0][0]

  return similarity

In [23]:
 similar_pairs = [
    ("The cat sat on the mat.", "The feline rested on the rug."),
    ("She enjoys playing the piano.", "She likes to play piano."),
    ("He's a skilled programmer.", "He's a proficient coder.")
]

dissimilar_pairs = [
    ("The weather is sunny today.", "I need to buy groceries."),
    ("The dog barked loudly.", "The coffee is hot."),
    ("She's reading a book.", "He's playing basketball.")
]

# Test with GPT2 similarity function
print("GPT2 Similarity Scores:")
for pair in similar_pairs:
    score = sentence_similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}': {score}")

for pair in dissimilar_pairs:
    score = sentence_similarity(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}': {score}")

# Test with BERT similarity function
print("\nBERT Similarity Scores:")
for pair in similar_pairs:
    score = sentence_similarity_bert(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}': {score}")

for pair in dissimilar_pairs:
    score = sentence_similarity_bert(pair[0], pair[1])
    print(f"Similarity between '{pair[0]}' and '{pair[1]}': {score}")

GPT2 Similarity Scores:
Similarity between 'The cat sat on the mat.' and 'The feline rested on the rug.': 0.8408460021018982
Similarity between 'She enjoys playing the piano.' and 'She likes to play piano.': 0.8938599228858948
Similarity between 'He's a skilled programmer.' and 'He's a proficient coder.': 0.8806674480438232
Similarity between 'The weather is sunny today.' and 'I need to buy groceries.': 0.5917643308639526
Similarity between 'The dog barked loudly.' and 'The coffee is hot.': 0.6405841112136841
Similarity between 'She's reading a book.' and 'He's playing basketball.': 0.7756593227386475

BERT Similarity Scores:
Similarity between 'The cat sat on the mat.' and 'The feline rested on the rug.': 0.9503752589225769
Similarity between 'She enjoys playing the piano.' and 'She likes to play piano.': 0.9492284655570984
Similarity between 'He's a skilled programmer.' and 'He's a proficient coder.': 0.9658842086791992
Similarity between 'The weather is sunny today.' and 'I need to 