# NLP fast tutorial

In [None]:
# Setup cell: import NLTK, silence warnings, and fetch NLTK data.
import nltk
import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings('ignore')
# Download the NLTK 'all' collection (every available NLTK data package).
nltk.download('all')

Tokenization

In [None]:
# Compare three tokenizers on the same two-sentence string:
# NLTK word-level, NLTK sentence-level, and a pretrained Hugging Face tokenizer.
from nltk import word_tokenize, sent_tokenize
from transformers import AutoTokenizer

# Tokenizer for the pretrained "bert-base-uncased" checkpoint
# (the author's suggested approach for production / large datasets).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The trailing backslash continues the string literal onto the next line,
# so `test` is a single line of text containing two sentences.
test = "Beach-combing replaced wine tasting as his new obsession. \
That is an appealing treasure map that I can't read."

print("Word: ", word_tokenize(test))
print("Sentence: ", sent_tokenize(test))
print("Tokenized: ", tokenizer.tokenize(test), "\n")

Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatize several inflected forms of one verb; the 'v' argument is the
# part-of-speech tag passed to the lemmatizer for each word.
lemmatizer = WordNetLemmatizer()
for verb_form in ("plays", "played", "play", "playing"):
    print(lemmatizer.lemmatize(verb_form, 'v'))

Part of Speech

In [None]:
from nltk import pos_tag, word_tokenize

# Tokenize the sentence, then tag every token with its part of speech.
text = "I love ChatGPT"
tokenized_text = word_tokenize(text)
tokens_tag = pos_tag(tokenized_text)
tags = tokens_tag  # second name bound to the same tagged list

# Bare last expression so the notebook displays the tagged tokens.
tags

Named Entity Recognition

In [None]:
# Named Entity Recognition with NLTK: tokenize -> POS tag -> chunk.
from nltk import word_tokenize, pos_tag, ne_chunk

text = "Barack Obama was born in Hawaii in 1961."

# Tokenize and POS tag the sentence; ne_chunk below consumes the tagged tokens.
tokens = word_tokenize(text)
tags = pos_tag(tokens)

# Apply Named Entity Recognition to the POS-tagged tokens and print the result.
entities = ne_chunk(tags)
print(entities)

Stop Word Removal

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "This is an example sentence demonstrating the removal of stop words."
tokens = word_tokenize(text)

# English stop-word list held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep the tokens whose lowercase form is not a stop word; the comparison is
# done in lowercase, but the kept tokens retain their original casing.
filtered_words = list(filter(lambda token: token.lower() not in stop_words, tokens))

print(tokens)
print(filtered_words)

Dependency Parsing

In [None]:
import spacy
from spacy import cli

SPACY_MODEL = "en_core_web_sm"

# Load the pipeline if it is already installed and download it only when it
# is missing, so re-running this cell does not trigger a download every time.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

text = "The quick brown fox jumps over the lazy dog."
doc = nlp(text)

# Three left-aligned, 12-character-wide columns.
print(f"{'Word':<12} {'Dependency':<12} {'Head Word':<12}")
print("-" * 40)

for token in doc:
    # token.text: The word itself
    # token.dep_: The relationship (e.g., nsubj = nominal subject)
    # token.head.text: The word this word is attached to
    print(f"{token.text:<12} {token.dep_:<12} {token.head.text:<12}")

Coreference Resolution

In [None]:
# Coreference resolution with fastcoref.
from fastcoref import FCoref

# Build the coreference model, requesting the CPU as its device.
model = FCoref(device='cpu')

text = "Barack Obama was born in Hawaii. He was the 44th president."

# predict receives a list of texts; the result is indexed per input below.
preds = model.predict(texts=[text])

# Read the clusters for the first (and only) input text.
clusters = preds[0].get_clusters()
print("Coreference Clusters:", clusters)

Vector Embeddings

In [None]:
import torch
from sentence_transformers import SentenceTransformer

# 1. Load the model
# Use the GPU only when one is available; a hard-coded "cuda" device makes
# this cell fail on CPU-only machines.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('all-MiniLM-L6-v2', device=embedding_device)

# 2. Define data
sentences = [
    "The cat sits on the mat",
    "A dog is laying on the rug",  # Semantically similar to the first
    "I love eating pizza",         # Semantically different
]

# 3. Generate embeddings (one vector per sentence)
embeddings = model.encode(sentences)

print(f"Shape: {embeddings.shape}")  # (3, 384) -> 3 sentences, 384 dimensions each
print(f"First 5 values of sentence 1: {embeddings[0][:5]}")

RNN example

In [None]:
import torch

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print("Using ", device)

Dataset

In [None]:
# Toy classification dataset: maps a category name (the class label) to a
# list of 15 surnames belonging to that category.
# Note that an entry may contain a space (e.g. 'De Luca').
name_dataset = {
    'Italian': [
        'Rossi', 'Russo', 'Ferrari', 'Esposito', 'Bianchi',
        'Romano', 'Colombo', 'Ricci', 'Marino', 'Greco',
        'Bruno', 'Gallo', 'Conti', 'De Luca', 'Mancini'
    ],
    'Japanese': [
        'Sato', 'Suzuki', 'Takahashi', 'Tanaka', 'Watanabe',
        'Ito', 'Yamamoto', 'Nakamura', 'Kobayashi', 'Kato',
        'Yoshida', 'Yamada', 'Sasaki', 'Yamaguchi', 'Matsumoto'
    ],
    'English': [
        'Smith', 'Jones', 'Taylor', 'Brown', 'Williams',
        'Wilson', 'Johnson', 'Davies', 'Robinson', 'Wright',
        'Thompson', 'Evans', 'Walker', 'White', 'Roberts'
    ]
}

Sequence of letters (tokenize by character)

In [None]:
test = "Sato"

def tokenize(sentence):
    """Split ``sentence`` into a list holding each of its characters in order."""
    return [character for character in sentence]

# Character-level tokens of the sample name.
test_tokenized = tokenize(test)
print(test_tokenized)

Vector Embeddings (a little over-engineered)

In [None]:
# Encode every character token of the sample name with `model` (defined in an
# earlier cell); convert_to_tensor=True asks encode for a tensor result.
embeddings = model.encode(test_tokenized, convert_to_tensor=True)
print(embeddings)
# The shape shows one row per character token.
print(embeddings.shape)

Creating the dataset and preprocessing it

In [None]:
from sklearn.model_selection import train_test_split
import random

# One seed for both the Python-level shuffle and the train/test split, so a
# fresh-kernel run reproduces the same partition. Seeding `random` also fixes
# every later draw from the `random` module in this session.
SEED = 42
random.seed(SEED)

# Class index <-> category name mapping follows the key order of name_dataset.
categories = list(name_dataset.keys())

def category_to_index(category):
    """Return the integer class index of ``category``.

    Raises:
        ValueError: if ``category`` is not one of ``categories``.
    """
    return categories.index(category)

def index_to_category(index):
    """Return the category name stored at class index ``index``.

    Raises:
        IndexError: if ``index`` is out of range for ``categories``.
    """
    return categories[index]

# Build (character tokens, class index) pairs for every name in the dataset.
data_pairs = []

for key, list_value in name_dataset.items():
    category_index = category_to_index(key)
    for value in list_value:
        data_pairs.append((tokenize(value), category_index))

random.shuffle(data_pairs)

# One embedding tensor per name: each character token is encoded separately.
X_raw = [model.encode(pair[0], convert_to_tensor=True) for pair in data_pairs] # The names
# Targets are stored as 1-element long tensors holding the class index.
Y_raw = [torch.tensor([pair[1]], dtype=torch.long) for pair in data_pairs]

train_X, test_X, train_Y, test_Y = train_test_split(X_raw, Y_raw, test_size=0.2, random_state=SEED)

print(f"Total samples: {len(data_pairs)}")
print(f"Training samples: {len(train_X)}")
print(f"Test samples: {len(test_X)}")
print(f"Sample X shape: {train_X[0].shape}")
print(f"Sample X type: {train_X[0].dtype}")
print(f"Sample Y shape: {train_Y[0].shape}")
print(f"Sample Y type: {train_Y[0].dtype}")

RNN Model

In [None]:
import torch.nn as nn

class RNNForEmbeddings(nn.Module):
    """Sequence classifier: an ``nn.RNN`` layer followed by a linear head.

    Args:
        input_size: Length of each input vector in the sequence.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of scores produced for each sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: the recurrent layer takes [batch, sequence, feature] input.
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        # Maps the final hidden state to one score per output class.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Score one sequence or a batch of sequences.

        Args:
            x: Float tensor shaped [sequence, input_size] (a single sequence)
                or [batch, sequence, input_size].

        Returns:
            Tensor shaped [batch, output_size]; a 2-D input is treated as a
            batch of one.
        """
        # A 2-D input has no batch axis yet, so add one in front.
        batched = x.unsqueeze(0) if x.dim() == 2 else x

        # Only the final hidden state is needed; per-step outputs are discarded.
        _, last_hidden = self.rnn(batched)

        # last_hidden is [1, batch, hidden_size] for this RNN; index 0 drops
        # the leading axis before classification.
        return self.fc(last_hidden[0])

Training loop

In [None]:
import torch.optim as optim

# Hyperparameters
n_hidden = 128
# Length of one character's embedding vector (second dimension of a training
# sample). Despite the name, this is NOT a count of letters.
n_letters = train_X[0].shape[1]
n_epochs = 1000 # Number of training iterations (loop bound of the training cell), not passes over the data
learning_rate = 0.005

# Instantiate the model
# input_size  = n_letters (embedding dimension of one character, e.g. 384)
# hidden_size = 128 (width of the RNN hidden state)
# output_size = len(categories) (one score per category: Italian, Japanese, English)
rnn = RNNForEmbeddings(input_size=n_letters, hidden_size=n_hidden, output_size=len(categories))

# Move the model parameters to the selected device.
rnn.to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

In [None]:
import time

# Report the running average loss every PRINT_EVERY iterations. The same
# constant drives both the reporting condition and the averaging divisor so
# the two cannot drift apart.
PRINT_EVERY = 100

# Helper to track loss
current_loss = 0
all_losses = []

print(f"Training on {len(train_X)} samples for {n_epochs} iterations...")

# Put the model in training mode explicitly: an evaluation cell may have
# switched it to eval mode before this cell is (re-)run.
rnn.train()

start_time = time.time()

# Note: each "epoch" here is a single-sample training iteration.
for epoch in range(1, n_epochs + 1):
    # 1. Pick a random training example
    # We use random sampling (Stochastic Gradient Descent)
    random_index = random.randint(0, len(train_X) - 1)
    input_tensor, target_tensor = train_X[random_index], train_Y[random_index]
    # Work on copies placed on the training device.
    input_tensor = input_tensor.clone().to(device)
    target_tensor = target_tensor.clone().to(device)

    # 2. Zero the gradients (PyTorch accumulates them by default)
    optimizer.zero_grad()

    # 3. Forward Pass: Feed the name into the RNN
    output = rnn(input_tensor)

    # 4. Calculate Loss: How wrong was it?
    # output shape: [1, 3] (a batch of one; scores for It, Jap, Eng)
    # target shape: [1] (the correct index)
    loss = criterion(output, target_tensor)

    # 5. Backward Pass: Calculate gradients
    loss.backward()

    # 6. Optimizer Step: Update weights
    optimizer.step()

    # --- Logging ---
    current_loss += loss.item()

    # Print the average loss of the last PRINT_EVERY iterations, then reset
    if epoch % PRINT_EVERY == 0:
        avg_loss = current_loss / PRINT_EVERY
        print(f"Epoch {epoch} | Loss: {avg_loss:.4f}")
        all_losses.append(avg_loss)
        current_loss = 0

print(f"Training finished in {time.time() - start_time:.2f}s")

Evaluation

In [None]:
import torch
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Set the RNN (not the embedding model) to eval mode
rnn.eval()

y_true = []
y_pred = []

# 2. Iterate through test data (no gradients are needed for evaluation)
with torch.no_grad():
    for i in range(len(test_X)):
        inputs = test_X[i].to(device)

        label = test_Y[i]

        outputs = rnn(inputs)

        # Index of the highest score along the class dimension.
        _, prediction = torch.max(outputs, 1)

        y_true.append(label.item())
        y_pred.append(prediction.item())


# 3. Build the confusion matrix
# Pass every class index explicitly: without `labels`, a class that appears in
# neither y_true nor y_pred is dropped from the matrix, which would then be
# smaller than `categories` and misalign the tick labels of the heatmap.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(categories))))

# 4. Plot Heatmap
plt.figure(figsize=(8, 6))
# Using the categories names for the axis labels makes it easier to read
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=categories, yticklabels=categories)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
def predict_name(name):
    """Return the predicted category name for the string ``name``.

    The string is split into characters with ``tokenize``, every character is
    embedded with ``model``, and the trained ``rnn`` scores the resulting
    sequence; the category with the highest score is returned.
    """
    rnn.eval()
    with torch.no_grad():
        # Same preprocessing as the training data: characters -> embeddings,
        # copied and moved to the device the RNN lives on.
        char_vectors = model.encode(tokenize(name), convert_to_tensor=True).clone()
        char_vectors = char_vectors.to(device)

        scores = rnn(char_vectors)

        # topk(1) yields (values, indices); only the winning index is used.
        best_index = scores.topk(1)[1]

    return categories[best_index.item()]

# --- Try it out! ---
print("\n--- Live Tests ---")
print(f"Test 'Mussolini': {predict_name('Mussolini')}")
print(f"Test 'Nakamoto': {predict_name('Nakamoto')}")
print(f"Test 'Shakespeare': {predict_name('Shakespeare')}")

# Exercise:
Create a spam detector using a CNN.

In [None]:
from urllib.request import urlretrieve
import pandas as pd
import zipfile
import os

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = "smsspamcollection.zip"
extract_path = "sms_data"
# The file that is actually read below.
data_path = f"{extract_path}/SMSSpamCollection"

# Check for the data file itself rather than the directory: if the directory
# exists but the file is missing (e.g. after an interrupted extraction), a
# directory check would skip the download and the read below would fail.
if not os.path.exists(data_path):
    print("Downloading dataset...")
    urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("Download complete.")

# Load data using Pandas
# The UCI dataset is tab-separated (TSV) with no header
df = pd.read_csv(data_path, sep='\t', names=['label', 'text'])

# Convert labels to numbers: spam=1, ham=0
# (any other label value would become NaN under .map)
df['label'] = df['label'].map({'spam': 1, 'ham': 0})
print(f"Data Loaded: {len(df)} messages.")
print(df.head())