# Task 1. Text Preprocessing with NLTK and spaCy

In [1]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# NLTK resources needed by this notebook: tokenizer models, WordNet and the stopword lists.
# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases look up the
# 'punkt_tab' tables for word_tokenize, while older releases read the 'punkt' models.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/lazer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lazer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lazer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is the callable used below to parse text.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Sample paragraph that both the NLTK and the spaCy pipelines below process.
sample_paragraph = """The vast Kazakh steppe stretched before them, an endless sea of grass.  
The wind whispered tales of ancient khans and nomadic warriors.  
A lone eagle soared overhead, a symbol of freedom and strength.  
The sun beat down mercilessly, but the people endured, their spirits unbroken. """

In [5]:
# NLTK processing: tokenize -> drop punctuation -> lemmatize -> remove stopwords
nltk_tokens = word_tokenize(sample_paragraph.lower())  # tokenization on the lowercased text
nltk_lemmatizer = WordNetLemmatizer()
nltk_stopwords = set(stopwords.words('english'))

# Keep alphanumeric tokens only (this drops punctuation) and remember each token's
# surface form next to its lemma so the stopword test below can look at both.
nltk_token_lemma_pairs = [
    (token, nltk_lemmatizer.lemmatize(token))
    for token in nltk_tokens
    if token.isalnum()
]
nltk_lemmatized = [lemma for _, lemma in nltk_token_lemma_pairs]

# Stopword removal. The surface form is checked as well as the lemma because the
# default (noun) lemmatization can rewrite a stopword into a string that is no longer
# in the stopword list (e.g. 'was' -> 'wa', 'has' -> 'ha'), which would otherwise
# slip through the filter.
nltk_filtered = [
    lemma
    for token, lemma in nltk_token_lemma_pairs
    if token not in nltk_stopwords and lemma not in nltk_stopwords
]

In [6]:
# spaCy processing: parse the lowercased paragraph, lemmatize, then drop stop words
spacy_doc = nlp(sample_paragraph.lower())

# Lemmas of the tokens spaCy flags as alphabetic
spacy_lemmatized = [
    tok.lemma_
    for tok in spacy_doc
    if tok.is_alpha
]

# Discard every lemma found in the pipeline's default stop-word collection
spacy_filtered = list(
    filter(lambda lemma: lemma not in nlp.Defaults.stop_words, spacy_lemmatized)
)

In [7]:
# NLTK results, one stage per line
print(
    f"NLTK Tokens: {nltk_tokens}",
    f"NLTK Lemmatized: {nltk_lemmatized}",
    f"NLTK Filtered: {nltk_filtered}",
    sep="\n",
)

# spaCy results; tokenization happens inside the nlp pipeline, so the tokens are read back from the doc
print(
    f"\nspaCy Tokens (implicit): {[token.text for token in spacy_doc]}",
    f"spaCy Lemmatized: {spacy_lemmatized}",
    f"spaCy Filtered: {spacy_filtered}",
    sep="\n",
)

NLTK Tokens: ['the', 'vast', 'kazakh', 'steppe', 'stretched', 'before', 'them', ',', 'an', 'endless', 'sea', 'of', 'grass', '.', 'the', 'wind', 'whispered', 'tales', 'of', 'ancient', 'khans', 'and', 'nomadic', 'warriors', '.', 'a', 'lone', 'eagle', 'soared', 'overhead', ',', 'a', 'symbol', 'of', 'freedom', 'and', 'strength', '.', 'the', 'sun', 'beat', 'down', 'mercilessly', ',', 'but', 'the', 'people', 'endured', ',', 'their', 'spirits', 'unbroken', '.']
NLTK Lemmatized: ['the', 'vast', 'kazakh', 'steppe', 'stretched', 'before', 'them', 'an', 'endless', 'sea', 'of', 'grass', 'the', 'wind', 'whispered', 'tale', 'of', 'ancient', 'khan', 'and', 'nomadic', 'warrior', 'a', 'lone', 'eagle', 'soared', 'overhead', 'a', 'symbol', 'of', 'freedom', 'and', 'strength', 'the', 'sun', 'beat', 'down', 'mercilessly', 'but', 'the', 'people', 'endured', 'their', 'spirit', 'unbroken']
NLTK Filtered: ['vast', 'kazakh', 'steppe', 'stretched', 'endless', 'sea', 'grass', 'wind', 'whispered', 'tale', 'ancient'

# Task 2. Named Entity Recognition (NER) with spaCy

In [8]:
from spacy import displacy

In [9]:
# NOTE(review): the same "en_core_web_sm" pipeline is already bound to `nlp` in the Task 1
# section; this reload keeps the NER section self-contained — confirm the duplicate load is intended.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Sample text passed to the spaCy pipeline for named entity recognition.
sample_text = """Beshbarmak, a traditional dish made with boiled meat and noodles, is a staple of Kazakh cuisine.  
Yurts, portable dwellings used by nomadic people, are a common sight in rural areas.  
The Baikonur Cosmodrome, from which Yuri Gagarin launched into space, is located in Kazakhstan.  
Kazakhstan gained independence in 1991.  
The capital city, Astana, is a modern metropolis. """


In [11]:
# Run the spaCy pipeline on the sample text; the entities are read from `doc.ents` below.
doc = nlp(sample_text)

In [12]:
# List every named entity found in the document as "<text> <label>"
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Beshbarmak ORG
Kazakh ORG
Yuri Gagarin PERSON
Kazakhstan GPE
Kazakhstan GPE
1991 DATE
Astana GPE


In [13]:
# Visualize the recognized entities with displaCy's "ent" style, rendered inside the notebook
displacy.render(doc, style="ent", jupyter=True) 

# Task 3. Text Vectorization using Transformers


In [14]:
from transformers import AutoTokenizer, AutoModel
import torch

In [15]:
# Load the tokenizer and the model from the same pretrained checkpoint name
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [16]:
# Input for the embedding demo, kept as a one-element list (a batch of one sentence)
sentence = [
    "The Altai Mountains, with their breathtaking scenery, attract tourists from around the world."
]

In [17]:
# Tokenize and encode the input with padding and truncation enabled;
# return_tensors='pt' requests PyTorch tensors, which are unpacked into the model call below
encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')


In [18]:
# Mean Pooling Function
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the token axis, weighting positions by the mask.

    Args:
        model_output: Indexable model output whose first element holds the
            per-token embeddings (reduced over dimension 1 here).
        attention_mask: Tensor with one entry per token position; its values
            act as multiplicative weights (1 keeps a position, 0 drops it).

    Returns:
        Tensor of mask-weighted mean embeddings with dimension 1 reduced away.
    """
    embeddings = model_output[0]
    # Stretch the mask across the feature axis so it lines up element-wise with the embeddings.
    weights = attention_mask.unsqueeze(-1).expand_as(embeddings).float()
    weighted_total = (embeddings * weights).sum(dim=1)
    # Clamp the denominator so an all-zero mask divides by 1e-9 instead of 0.
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total

In [19]:
# Compute the embeddings with gradient tracking switched off
with torch.no_grad():
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state
    # Plain average over dimension 1: every position counts equally, the attention mask is not consulted
    sentence_embedding = hidden_states.mean(dim=1)
    # Average that weights each position by the attention mask
    sentence_embedding_mean_pooled = mean_pooling(outputs, encoded_input["attention_mask"])


In [20]:
# Report the tensor shapes produced above, one per line
print(
    f"Shape of hidden states (word embeddings): {hidden_states.shape}",
    f"Hidden size (embedding dimension): {model.config.hidden_size}",
    f"Shape of sentence embedding (averaged): {sentence_embedding.shape}",
    f"Shape of sentence embedding (mean-pooled): {sentence_embedding_mean_pooled.shape}",
    sep="\n",
)


Shape of hidden states (word embeddings): torch.Size([1, 20, 768])
Hidden size (embedding dimension): 768
Shape of sentence embedding (averaged): torch.Size([1, 768])
Shape of sentence embedding (mean-pooled): torch.Size([1, 768])


In [21]:
# Example: embedding of a single token.
# Index 3 on the token axis selects the fourth position of the first sequence in the batch.
# NOTE(review): positions are tokenizer tokens (sub-word pieces and any special tokens),
# not whitespace-separated words — confirm which token sits at index 3 before calling it a "word".
third_word_embedding = hidden_states[0, 3, :]
# The label states the index actually used (the previous text said "first word").
print("Embedding of the token at index 3:", third_word_embedding.shape)


Embedding of the first word: torch.Size([768])


# Task 4. Sentiment Analysis with Transformers

In [22]:
from transformers import  pipeline

In [23]:
# Build a sentiment-analysis pipeline backed by the named Cardiff NLP Twitter RoBERTa checkpoint
classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")  

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [24]:
# Example sentences to classify
sentences = [
     "This movie was absolutely fantastic!",
    "The food was terrible and the service was slow.",
    "The book was okay, but I didn't love it.",
    "I had a neutral experience at the restaurant.",
    "The performance was breathtaking and emotionally moving.",
    "The actor's performance was mediocre and unconvincing.",
    "The plot was confusing and poorly written.",
    "The special effects were stunning and visually impressive.",
    "The weather today is unremarkable.",
    "The color of the walls is beige."
]

In [25]:
# Sentiment Analysis: classify the whole list in one call; each entry of `results`
# carries a 'label' and a 'score' (both are printed in the next cell)
results = classifier(sentences)


In [26]:
# Report the predicted label and score for every input sentence.
# The loop variable is deliberately not named `sentence`: that name holds the input list
# for the embedding section (Task 3), and rebinding it to a string here would make
# re-running those cells tokenize the wrong input.
for i, text in enumerate(sentences):
    prediction = results[i]
    print(f"Sentence {i+1}: {text}")
    print(f"Sentiment: {prediction['label']}")
    print(f"Score: {prediction['score']:.4f}")
    print("-" * 50)



Sentence 1: This movie was absolutely fantastic!
Sentiment: positive
Score: 0.9820
--------------------------------------------------
Sentence 2: The food was terrible and the service was slow.
Sentiment: negative
Score: 0.9555
--------------------------------------------------
Sentence 3: The book was okay, but I didn't love it.
Sentiment: negative
Score: 0.8365
--------------------------------------------------
Sentence 4: I had a neutral experience at the restaurant.
Sentiment: neutral
Score: 0.6010
--------------------------------------------------
Sentence 5: The performance was breathtaking and emotionally moving.
Sentiment: positive
Score: 0.9801
--------------------------------------------------
Sentence 6: The actor's performance was mediocre and unconvincing.
Sentiment: negative
Score: 0.9037
--------------------------------------------------
Sentence 7: The plot was confusing and poorly written.
Sentiment: negative
Score: 0.9194
----------------------------------------------