# **Natural Language Processing Assignment 1**

In [None]:
# Install the dataset/NLP dependencies and the small English spaCy model.
!pip install datasets spacy
!python -m spacy download en_core_web_sm
from datasets import load_dataset
# NOTE(review): `ds` is assigned again from load_dataset("ag_news") in the next
# cell, which overwrites this value — confirm which dataset id is intended.
ds = load_dataset("szhuggingface/ag_news")

# Citation in BibTeX form, stored as a bare string literal;
# presumably the reference for the dataset loaded above — TODO confirm.
'''
@inproceedings{zhang2015character,
  title={Character-level convolutional networks for text classification},
  author={Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
  booktitle={Advances in neural information processing systems},
  pages={649--657},
  year={2015}
}
'''

In [None]:
import spacy
from datasets import load_dataset
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Load the small English pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
print("Model loaded.")

# Load the dataset (rebinds `ds`, replacing whatever an earlier cell assigned).
print("Loading AG News dataset...")
ds = load_dataset("ag_news")
print("Dataset loaded.")

# Show the dataset object and the first training text as a raw-data baseline.
print("\nOriginal dataset structure:")
print(ds)
print("\nOriginal sample text:")
print(ds['train'][0]['text'])

# **Text Pre-processing**

<style>
blue {
  color: skyblue;
}
</style>

## 1) **Lowercasing**

In [None]:

def spacy_process_batch(batch):
    """Lowercase every text in a batch using spaCy tokenization.

    Each text in ``batch['text']`` is run through ``nlp.pipe``; the lowercase
    form of every token is joined with single spaces.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the lowercased,
        space-joined token strings.
    """
    batch['text'] = [
        " ".join(tok.lower_ for tok in parsed)
        for parsed in nlp.pipe(batch['text'])
    ]
    return batch

In [None]:
# Apply the lowercasing step to the dataset in batches; `ds` itself is left untouched.
lowercased_ds = ds.map(spacy_process_batch, batched=True)
print("Processing complete.")

In [None]:
# Print the first two training samples before and after lowercasing.
for idx in (0, 1):
    print(f"\n--- ORIGINAL TEXT (Sample {idx}) ---")
    print(ds['train'][idx]['text'])

    print(f"\n--- PROCESSED TEXT (Sample {idx}) ---")
    print(lowercased_ds['train'][idx]['text'])

<style>
blue {
  color: skyblue;
}
</style>
## 2) **Removal of URLs**

URLs are removed first because the later removal of punctuation and special characters would destroy the structure needed to identify them.

In [None]:
# `re` is imported here because this is the first cell that uses it; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
import re

# Matches http(s):// links and bare www. links up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')

def remove_urls_batch(batch):
    """Strip URLs from every text in a batch.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with each URL match in ``'text'`` replaced by
        an empty string (surrounding whitespace is left as-is).
    """
    batch['text'] = [url_pattern.sub(r'', text) for text in batch['text']]
    return batch

In [None]:
# Apply URL removal on top of the lowercased dataset.
nourl_ds = lowercased_ds.map(remove_urls_batch, batched=True)
print("Processing complete. New dataset is 'nourl_ds'.")

In [None]:
# Find a sample containing a URL. The search runs on `lowercased_ds`, the
# actual input of the URL-removal step, so the "before" text printed below is
# the same text the pattern matched.
sample_index = -1
print("\nSearching for a sample with a URL...")
for i, doc in enumerate(lowercased_ds['train']):
    if url_pattern.search(doc['text']):
        sample_index = i
        print(f"Found sample with URL at index: {sample_index}")
        break

# -1 means the loop finished without a match.
if sample_index == -1:
    print("No URL found, using index 0 as a fallback.")
    sample_index = 0
print(f"--- Showing results for sample index: {sample_index} ---")

print("\n--- BEFORE URL REMOVAL (from 'lowercased_ds') ---")
# Get the text from the 'lowercased_ds' (it's already lowercased)
print(lowercased_ds['train'][sample_index]['text'])

print("\n--- AFTER URL REMOVAL (from 'nourl_ds') ---")
# Get the text from our new 'nourl_ds'
print(nourl_ds['train'][sample_index]['text'])

## 3) **Removal of HTML Tags**

In [None]:
# Matches anything that looks like a tag: '<', one or more non-'>' characters, '>'.
html_pattern = re.compile(r'<[^>]+>')

def remove_html_batch(batch):
    """Replace HTML-like tags in every text of a batch with a single space.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the cleaned list.
    """
    cleaned = []
    for raw in batch['text']:
        cleaned.append(html_pattern.sub(r' ', raw))
    batch['text'] = cleaned
    return batch


In [None]:
# Apply HTML-tag removal on top of the URL-free dataset.
nohtml_ds = nourl_ds.map(remove_html_batch, batched=True)

print("Processing complete. New dataset is 'nohtml_ds'.")

In [None]:
# Find a sample in 'nourl_ds' (the input of the HTML-removal step) that
# contains an HTML-like tag; the whole training split is scanned.
sample_index = -1
print("Searching for a sample with HTML tags in 'nourl_ds'...")

for i, doc in enumerate(nourl_ds['train']):
    # We use our compiled pattern to search the text
    if html_pattern.search(doc['text']):
        sample_index = i
        print(f"Found sample with HTML at index: {sample_index}")
        break

if sample_index == -1:
    # Reached only when no training sample matched, so the fallback sample
    # contains no tag either; it is shown just to keep the cell's output shape.
    print("No HTML tags found in the training split. Using index 4 as a fallback.")
    sample_index = 4
print(f"--- Showing results for sample index: {sample_index} ---")

print("\n--- BEFORE HTML REMOVAL (from 'nourl_ds') ---")
# Get the text from the 'nourl_ds'
print(nourl_ds['train'][sample_index]['text'])

print("\n--- AFTER HTML REMOVAL (from 'nohtml_ds') ---")
# Get the text from our new 'nohtml_ds'
print(nohtml_ds['train'][sample_index]['text'])

<style>
blue {
  color: skyblue;
}
</style>
## 4) **Removing Punctuation & Special Characters**


In [None]:
import re

def remove_punctuation_batch(batch):
    """Keep only ASCII letters in every text of a batch.

    Every character outside ``a-z`` / ``A-Z`` (punctuation, digits, symbols,
    whitespace) is turned into a space; runs of whitespace are then collapsed
    to one space and the result is stripped.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the cleaned list.
    """
    non_letter = r'[^a-zA-Z]'

    def _clean(raw):
        spaced = re.sub(non_letter, ' ', raw)
        return re.sub(r'\s+', ' ', spaced).strip()

    batch['text'] = [_clean(raw) for raw in batch['text']]
    return batch

In [None]:
# Apply punctuation/special-character removal on top of the HTML-free dataset.
nopunc_ds = nohtml_ds.map(remove_punctuation_batch, batched=True)
print("Processing complete.")

In [None]:

# Print the first two training samples: the lowercased text (labelled
# "ORIGINAL") next to the text after punctuation removal.
for idx in (0, 1):
    print(f"\n--- ORIGINAL TEXT (Sample {idx}) ---")
    print(lowercased_ds['train'][idx]['text'])

    print(f"\n--- PROCESSED TEXT (Sample {idx}) ---")
    print(nopunc_ds['train'][idx]['text'])

<style>
blue {
  color: skyblue;
}
</style>
## 5) **Stop-Word Removal**


In [None]:
# Download NLTK's stop-word corpus and build a set for fast membership tests.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
stop_words = stopwords.words('english')
# A set is used because the removal step tests membership for every token.
stop_words_set = set(stop_words)
# NOTE(review): the message says "(customized)", but the set is built directly
# from stopwords.words('english') with no additions or removals in this cell.
print(f"Loaded {len(stop_words_set)} English stop words (customized).")

In [None]:
def remove_stopwords_batch(batch):
    """Drop stop words from every text in a batch.

    Each text is split on whitespace; tokens found in ``stop_words_set`` are
    discarded and the survivors are re-joined with single spaces.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the filtered list.
    """
    batch['text'] = [
        " ".join(token for token in document.split() if token not in stop_words_set)
        for document in batch['text']
    ]
    return batch

In [None]:
# Apply stop-word removal on top of the punctuation-free dataset.
nostop_ds = nopunc_ds.map(remove_stopwords_batch, batched=True)
print("Processing complete.")

In [None]:
def _find_stopword_sample(dataset, start=0):
    """Return the first index >= ``start`` whose text contains a stop word, or -1."""
    for i in range(start, len(dataset)):
        if any(word in stop_words_set for word in dataset[i]['text'].split()):
            return i
    return -1


def _show_stopword_removal(index, found_message):
    """Print one sample before/after stop-word removal plus the removed words."""
    print(found_message)

    print("\nBEFORE STOP-WORDS (from nopunc_ds)")
    before_text = nopunc_ds['train'][index]['text']
    print(before_text)

    print("\nAFTER STOP-WORDS (from nostop_ds)")
    after_text = nostop_ds['train'][index]['text']
    print(after_text)

    # Words present before but absent after are the ones the step removed.
    before_words = set(before_text.split())
    after_words = set(after_text.split())
    removed_words = before_words.difference(after_words)

    print("\nWORDS REMOVED")
    print(removed_words)


# First sample that contains at least one stop word (-1 if there is none).
sample_index = _find_stopword_sample(nopunc_ds['train'])
if sample_index != -1:
    _show_stopword_removal(sample_index,
                           f"Found a good sample at index: {sample_index}")
else:
    print("Could not find a sample with stop words.")

# Continue right after the first hit; when nothing was found this is index 0.
start_search_index = sample_index + 1

print(f"Searching for next sample starting from index {start_search_index}...")
second_sample_index = _find_stopword_sample(nopunc_ds['train'], start_search_index)

# Display the results for the second sample
if second_sample_index != -1:
    _show_stopword_removal(second_sample_index,
                           f"\nFound a second sample at index: {second_sample_index}")
else:
    print("Could not find another sample with stop words.")

<style>
blue {
  color: skyblue;
}
</style>
## 6) **Stemming**


In [None]:
from nltk.stem.porter import PorterStemmer

# Single shared stemmer instance used by every batch.
stemmer = PorterStemmer()

def stem_batch(batch):
    """Apply Porter stemming to every whitespace-separated token in a batch.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the stemmed,
        space-joined texts.
    """
    stemmed_texts = []
    for document in batch['text']:
        stemmed_texts.append(" ".join(map(stemmer.stem, document.split())))

    batch['text'] = stemmed_texts
    return batch

In [None]:
# Apply stemming on top of the stop-word-free dataset.
stemmed_ds = nostop_ds.map(stem_batch, batched=True)
print("Processing complete. New dataset is 'stemmed_ds'.")

In [None]:
# -1 is the "nothing found" sentinel checked after the loop.
sample_index = -1

print("Searching for a sample affected by stemming...")

# Walk the training split until the stemmed text differs from its input.
for i in range(len(nostop_ds['train'])):
    before_text = nostop_ds['train'][i]['text']
    after_text = stemmed_ds['train'][i]['text']

    if before_text != after_text:
        sample_index = i
        break

if sample_index == -1:
    print("Could not find a sample that was changed by stemming.")
else:
    # before_text / after_text still hold the texts of the sample that matched.
    print(f"--- Found a good sample at index: {sample_index} ---")

    print("\n--- BEFORE STEMMING (from nostop_ds) ---")
    print(before_text)

    print("\n--- AFTER STEMMING (from stemmed_ds) ---")
    print(after_text)

    # Pair tokens positionally and keep the pairs the stemmer altered.
    before_words = before_text.split()
    after_words = after_text.split()

    print("\n--- WORDS CHANGED ---")
    changed_words = [
        f"'{b}' -> '{a}'"
        for b, a in zip(before_words, after_words)
        if b != a
    ]

    print(", ".join(changed_words))

<style>
blue {
  color: skyblue;
}
</style>
## 7) **Lemmatization**


In [None]:
import nltk

# Resources downloaded for the lemmatization cells below (WordNet data and a POS tagger).
nltk.download('wordnet')
nltk.download('omw-1.4')
# NOTE(review): the '_eng' resource name presumably depends on the installed
# NLTK version — confirm it matches what nltk.pos_tag looks up.
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Args:
        treebank_tag: Tag string; only its first character is significant.

    Returns:
        ``wordnet.ADJ`` for tags starting with 'J', ``wordnet.VERB`` for 'V',
        ``wordnet.NOUN`` for 'N', ``wordnet.ADV`` for 'R', and
        ``wordnet.NOUN`` for anything else (including an empty tag).
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Default to noun if no match
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [None]:
from nltk.stem import WordNetLemmatizer

# Single shared lemmatizer instance used by every batch.
lemmatizer = WordNetLemmatizer()

def lemmatize_with_pos_batch(batch):
    """Lemmatize every token of every text in a batch, guided by POS tags.

    Each text is split on whitespace and tagged with ``nltk.pos_tag``; every
    tag is converted with ``get_wordnet_pos`` and passed to the lemmatizer as
    ``pos``. Lemmas are re-joined with single spaces.

    Args:
        batch: Mapping with a ``'text'`` entry holding an iterable of strings.

    Returns:
        The same ``batch`` object with ``'text'`` replaced by the lemmatized list.
    """
    lemmatized_texts = []

    for document in batch['text']:
        tagged = nltk.pos_tag(document.split())
        lemmas = [
            lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
            for token, tag in tagged
        ]
        lemmatized_texts.append(" ".join(lemmas))

    batch['text'] = lemmatized_texts
    return batch

In [None]:
# Lemmatize the stop-word-free dataset (not the stemmed one), so stemming and
# lemmatization can be compared against the same input.
lemmatized_ds = nostop_ds.map(lemmatize_with_pos_batch, batched=True)

# The message names the variable actually assigned above.
print("Processing complete. New dataset is 'lemmatized_ds'.")

In [None]:
# Reuse the sample chosen by the stemming comparison. Fall back to index 1 when
# `sample_index` is undefined OR still holds the -1 "not found" sentinel;
# indexing with -1 would otherwise silently show the last row of the split.
if globals().get('sample_index', -1) < 0:
    print("Could not find 'sample_index', using index 1 instead.")
    sample_index = 1 # Fallback to a known good sample
else:
    print(f"--- Showing results for sample index: {sample_index} ---")

print("\n--- ORIGINAL (from nostop_ds) ---")
print(nostop_ds['train'][sample_index]['text'])

print("\n--- STEMMED (from stemmed_ds) ---")
print(stemmed_ds['train'][sample_index]['text'])

print("\n--- LEMMATIZED (from lemmatized_ds) ---")
print(lemmatized_ds['train'][sample_index]['text'])


<style>
blue {
  color: skyblue;
}
</style>
## 8) **Tokenization**



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import itertools

In [None]:
# Build one flat list of documents: all training texts followed by all test texts.
print("Preparing final corpus for tokenization...")
train_texts = list(lemmatized_ds['train']['text'])
test_texts = list(lemmatized_ds['test']['text'])
# NOTE(review): anything fitted on `corpus` also sees the test split.
corpus = train_texts + test_texts
print(f"Corpus prepared with {len(corpus)} total documents.")

In [None]:
# Fit a Keras Tokenizer on the combined corpus.
VOCAB_SIZE = 20000 # Top 20,000 words
# "<OOV>" is the token the tokenizer uses for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
print("Fitting tokenizer...")
tokenizer.fit_on_texts(corpus)
print("Fitting complete.")

In [None]:
# Show the first 10 entries of the fitted word -> integer-id mapping, in the
# dict's own iteration order.
# NOTE(review): the "Top 10" label presumes that order is by frequency — confirm.
word_index = tokenizer.word_index
print("\n--- Word Index Sample (Top 10) ---")
print(dict(itertools.islice(word_index.items(), 10)))

In [None]:
# Convert one document (the second entry of `corpus`) to its integer sequence.
print("\n--- Final 'Text-to-Sequence' Result ---")
sample_text = corpus[1]
# texts_to_sequences takes a list of texts, so the single sample is wrapped.
sequence = tokenizer.texts_to_sequences([sample_text])

print(f"ORIGINAL TEXT:\n{sample_text}")
print(f"\nNUMERICAL SEQUENCE:\n{sequence[0]}")

## **Text Processing for Natural Language Processing (NLP)**



<style>
blue {
  color: skyblue;
}
</style>

## 1) **Bag of Words (BoW)**



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Collect labels in the same order as `corpus` (train first, then test) so
# row i of any matrix fitted on `corpus` lines up with labels[i].
print("Preparing the final labels...")
train_labels = list(lemmatized_ds['train']['label'])
test_labels = list(lemmatized_ds['test']['label'])
labels = train_labels + test_labels
# `corpus` was prepared in the tokenization section
print(f"Corpus prepared with {len(corpus)} documents.")
print(f"Labels prepared with {len(labels)} labels.")

In [None]:
# Fit a Bag-of-Words model; min_df=3 sets CountVectorizer's minimum
# document-frequency threshold.
vectorizer = CountVectorizer(min_df=3)
print("Fitting Bag of Words model (CountVectorizer)...")

# X is the document-term matrix: one row per document, one column per term.
X = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()

print("Bag of Words model fitted.")
print("\n--- Shape of the Feature Matrix (X) ---")
print(f"(documents, features): {X.shape}")
print(f"This means: {X.shape[0]} documents and {X.shape[1]} unique words (features).")

# Print an arbitrary 10-term slice of the vocabulary; the slice is empty if the
# vocabulary has fewer than 5001 terms.
print("\n--- Sample of Vocabulary (features) ---")
print(feature_names[5000:5010])

In [None]:
# Visualizing results in tabular form
# Only the first 5 rows of X are converted to a dense array.
X_array_small = X[0:5].toarray()
index_names = [f"Doc_{i}" for i in range(5)]
df = pd.DataFrame(data=X_array_small, columns=feature_names, index=index_names)
print("--- Bag of Words Matrix (First 5 Documents) ---")
# The frame has one column per vocabulary term, so it is very wide;
# display a 10-column slice instead of the whole frame.
df.iloc[:, 2000:2010]

<style>
blue {
  color: skyblue;
}
</style>

## 2) **Term Frequency-Inverse Document Frequency (TF-IDF)**



In [None]:
# `final_ds` is another name for the lemmatized dataset (same object, no copy).
final_ds = lemmatized_ds
from sklearn.feature_extraction.text import TfidfVectorizer
# `corpus` is reused from the tokenization section.
print(f"Corpus prepared with {len(corpus)} documents.")

In [None]:
!pip install wordcloud -q

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np

In [None]:
# Fit a TF-IDF model whose vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

print("Fitting TF-IDF model...")
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
# Rebinds `feature_names`, previously set from the Bag-of-Words vectorizer.
feature_names = tfidf_vectorizer.get_feature_names_out()

print("TF-IDF model fitted.")
print(f"Shape of TF-IDF Matrix: {X_tfidf.shape}")

In [None]:
print("Generating Word Cloud... ☁️")

# Sum each term's TF-IDF weight over all documents and flatten to 1-D.
total_tfidf_scores = np.asarray(X_tfidf.sum(axis=0)).ravel()
tfidf_freqs = dict(zip(feature_names, total_tfidf_scores))

# Word size in the cloud follows the summed TF-IDF score of each term.
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate_from_frequencies(tfidf_freqs)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off') # Hide the x and y axes
plt.title("TF-IDF Word Cloud for AG News Dataset")
plt.show()

## **Advanced Level (Optional for Basic Level)**

<style>
blue {
  color: skyblue;
}
</style>

## 3) **Encodings**


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Rebuild `corpus` from `final_ds`: training texts followed by test texts.
print("Preparing the final corpus...")


train_texts = list(final_ds['train']['text'])
test_texts = list(final_ds['test']['text'])
corpus = train_texts + test_texts

print(f"Corpus prepared with {len(corpus)} documents.")

In [None]:
# Fit two vectorizers on the same corpus with the same min_df: one with the
# default settings (BoW) and one with binary=True (one-hot style).
bow_vectorizer = CountVectorizer(min_df=3)
X_bow = bow_vectorizer.fit_transform(corpus)
ohe_vectorizer = CountVectorizer(min_df=3, binary=True)
X_ohe = ohe_vectorizer.fit_transform(corpus)
# Rebinds `feature_names` to the one-hot vectorizer's vocabulary.
feature_names = ohe_vectorizer.get_feature_names_out()
print("Bag of Words (BoW) and One-Hot (OHE) models fitted.")
# Only X_ohe's shape is printed; X_bow's shape is not checked here.
print(f"\nShape of both matrices is identical: {X_ohe.shape}")

<style>
blue {
  color: skyblue;
}
</style>

## 4) **Word Embeddings**

Word embeddings are <blue>**dense vector representations**</blue> of words that capture their meanings by placing words with <blue>**similar meanings closer**</blue> in <blue>**vector space**</blue>. Unlike BoW or one-hot encoding, embeddings <blue>**capture relationships**</blue> between words based on context. For example, in an embedding space, words like "king" and "queen" or "apple" and "fruit" would be closer together, reflecting their <blue>**semantic similarity**</blue>. These embeddings are learned from large datasets and can be used as inputs to machine learning models for various NLP tasks. Popular techniques for generating word embeddings include <blue>**Word2Vec**</blue>, <blue>**GloVe**</blue>, and <blue>**FastText**</blue>.

<style>
blue {
  color: skyblue;
}
</style>

## 5) **Word2Vec**




In [None]:
!pip install gensim -q

In [None]:
from gensim.models import Word2Vec

# Word2Vec is given token lists here, so each document string is split on whitespace.
print(f"Original corpus has {len(corpus)} documents.")
print("\nOriginal document (string)")
print(corpus[0])


# One token list per document.
sentences = [doc.split() for doc in corpus]

print("\nPrepared 'sentence' (list of tokens)")
print(sentences[0])
print(f"\nData is now a list of {len(sentences)} lists, ready for Word2Vec.")

In [None]:
# Hyper-parameters shared by both models; only `sg` differs between them.
_w2v_params = dict(vector_size=100, window=5, min_count=3)

print("Training CBOW model (sg=0)...")
# sg=0 means CBOW
cbow_model = Word2Vec(sentences, sg=0, **_w2v_params)

print("Training Skip-gram model (sg=1)...")
# sg=1 means Skip-gram
skipgram_model = Word2Vec(sentences, sg=1, **_w2v_params)

print("Models trained successfully.")