# Documentation and Resources

---
https://tiktokenizer.vercel.app/?model=gpt-4-1106-preview

https://www.nltk.org/

https://docs.pytorch.org/tutorials/

# Text Preprocessing Techniques

1. Lowercase Conversion

In [None]:
def lowercase_text(text):
    """Return a copy of `text` with every cased character lowercased."""
    return text.lower()

text = "Hello World! This is an Example."
# Store the result under its own name: reusing `lowercase_text` here would
# overwrite the function with a string and break any later call to it.
lowered_text = lowercase_text(text)
print(lowered_text)  # hello world! this is an example.

hello world! this is an example.


2. Stop Word Removal

In [None]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Membership is tested on the lowercased token, so 'The' is dropped just
    like 'the'; tokens that survive keep their original casing.
    """
    english_stops = set(stopwords.words('english'))  # NLTK's English stop-word list
    kept = []
    for candidate in tokens:
        if candidate.lower() in english_stops:
            continue
        kept.append(candidate)
    return kept

tokens = ['hello', 'world', 'this', 'is','The', 'an', 'example']
filtered_tokens = remove_stopwords(tokens)
print(filtered_tokens)  # ['hello', 'world', 'example']

['hello', 'world', 'example']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


3. Punctuation Removal

In [None]:
import string

def remove_punctuation(text):
    """Return `text` with every character listed in string.punctuation dropped."""
    return ''.join(char for char in text if char not in string.punctuation)

text = "Hello, world! This is an example: with punctuation."
clean_text = remove_punctuation(text)
print(clean_text)  # "Hello world This is an example with punctuation"
# Stripping punctuation reduces the number of tokens,
# and fewer tokens means lower cost.

Hello world This is an example with punctuation


4. Regular Expressions (Regex)

In [None]:
# Match Simple Text
# re.search scans the string and returns a match object for the first
# occurrence of the pattern, or None when the pattern does not occur.

import re

text = "Python is fun" # corpus: the text being searched
match = re.search("Python", text)
# .group() is safe here only because the pattern is known to be present
print(match.group())  # Python

Python


In [None]:
# Match Simple Text (pattern not present)

import re

text = "Python is fun"# corpus: the text being searched
match = re.search("love", text)
# re.search returns None when nothing matches, so test the result
# before calling .group() to avoid an AttributeError
if match:
    print(match.group())
else:
    print("Pattern not found")

Pattern not found


In [None]:
# Match Beginning and End

# ^ anchors the pattern to the start of the string, $ anchors it to the end
text = "Python is amazing"
start_match = re.search("^Python", text)  # matches only because the text starts with "Python"
print(start_match.group())  # Python

end_match = re.search("amazing$", text)  # matches only because the text ends with "amazing"
print(end_match.group())  # amazing

Python
amazing


In [None]:
# Match Digits

text = "I have 3 apples and 35 oranges"
digits = re.findall(r"\d", text)  # the r prefix creates a raw string; \d matches a single digit
print(digits)  # ['3', '3', '5']

text = "I have 3 apples and 35 oranges"
# \d+ matches one or more consecutive digits, so "35" stays together
numbers = re.findall(r"\d+", text)
print(numbers)  # ['3', '35']

['3', '3', '5']
['3', '35']


In [None]:
# Match Word Characters

text = "user_123 has logged in" # corpus: the text being searched
# \w matches letters, digits and the underscore
word_chars = re.findall(r"\w+", text) # \w+ matches a run of word characters, i.e. a whole word
print(word_chars)  # ['user_123', 'has', 'logged', 'in']

['user_123', 'has', 'logged', 'in']


In [None]:
# Match Zero or One

text = "color colour colouur"
pattern = re.findall(r"colou?r", text)  # ? means 0 or 1 of the previous character
# "colouur" has two u's, so it is not matched
print(pattern)  # ['color', 'colour']

['color', 'colour']


In [None]:
# Match One or More

text = "I loooove Python"
pattern = re.findall(r"lo+ve", text)  # + means 1 or more of the previous character
print(pattern)  # ['loooove']

['loooove']


In [None]:
# Match Exact Number


text = "Phone numbers: 555-1234 and 555678-5678"
pattern = re.findall(r"\d{3}-\d{4}", text)  # {n} means exactly n occurrences
# The pattern is not anchored, so inside "555678-5678" it picks up the
# last three digits before the dash: "678-5678"
print(pattern)  # ['555-1234', '678-5678']

['555-1234', '678-5678']


In [None]:
# Match Exact Number (unanchored match inside longer digit runs)


text = "Phone numbers: 555-1234 and 555678-56789"
pattern = re.findall(r"\d{3}-\d{4}", text)  # {n} means exactly n occurrences
# Without boundaries the pattern matches "678-5678" inside "555678-56789",
# ignoring the extra leading and trailing digits
print(pattern)  # ['555-1234', '678-5678']

['555-1234', '678-5678']


In [None]:
# Match Any of Several Characters

text = "The cat and the rat sat on the mat"
pattern = re.findall(r"[cr]at", text)  # [cr] matches 'c' or 'r', so 'cat' or 'rat'
# "sat" and "mat" start with letters outside the set, so they are skipped
print(pattern)  # ['cat', 'rat']

['cat', 'rat']


In [None]:
# Match Range of Characters
# Each pattern matches a single character, so findall returns one item per character.

text = "a1b2c3D4E5"
letters = re.findall(r"[a-z]", text)  # lowercase letters
print(letters)  # ['a', 'b', 'c']

uppercase = re.findall(r"[A-Z]", text)  # uppercase letters
print(uppercase)  # ['D', 'E']

alphanumeric = re.findall(r"[a-zA-Z0-9]", text)  # all alphanumeric
print(alphanumeric)  # ['a', '1', 'b', '2', 'c', '3', 'D', '4', 'E', '5']

['a', 'b', 'c']
['D', 'E']
['a', '1', 'b', '2', 'c', '3', 'D', '4', 'E', '5']


In [None]:
# Email Validation

emails = ["user@example.com", "invalid@email", "name.last@domain.co.uk"]
# local part, "@", domain labels, then a dot and a top-level domain of 2+ letters
pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"


def is_valid_email(email):
    """Return True when the whole of `email` matches `pattern`.

    re.fullmatch is used instead of re.match: re.match only anchors at the
    start, so a valid address followed by arbitrary trailing text would
    still be reported as valid.
    """
    return re.fullmatch(pattern, email) is not None


for email in emails:
    if is_valid_email(email):
        print(f"{email} is valid")
    else:
        print(f"{email} is invalid")
# user@example.com is valid
# invalid@email is invalid
# name.last@domain.co.uk is valid

user@example.com is valid
invalid@email is invalid
name.last@domain.co.uk is valid


In [None]:
# Number Removal/Normalization
# \d matches a single digit; \d+ matches a run of one or more digits
import re

def remove_numbers(text):
    """Return `text` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def replace_numbers(text, replacement='NUM'):
    """Return `text` with every run of digits replaced by `replacement`."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(replacement, text)

text = "There are 123 apples and 456 oranges."
text_no_numbers = remove_numbers(text)
text_normalized = replace_numbers(text)

# Deleting the digits leaves the surrounding spaces in place (note the double spaces)
print(text_no_numbers)  # "There are  apples and  oranges."
print(text_normalized)  # "There are NUM apples and NUM oranges."

There are  apples and  oranges.
There are NUM apples and NUM oranges.


In [None]:
# Noise Removal
import re

def remove_noise(text):
    """Strip symbols and non-ASCII text from `text` and tidy the whitespace.

    Returns a string containing only ASCII word characters separated by
    single spaces, with no leading or trailing whitespace.
    """
    # Remove special characters and symbols (anything that is neither a
    # word character nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # Turn every whitespace run (including non-ASCII spaces) into one ASCII
    # space so the next step cannot glue neighbouring words together
    text = re.sub(r'\s+', ' ', text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Deleting non-ASCII text can leave adjacent or leading spaces behind,
    # so the final collapse and strip must happen after that deletion
    text = re.sub(r' {2,}', ' ', text).strip()

    return text

text = "Special @#! characters & unicode like 你好 should be removed."
clean_text = remove_noise(text)
print(clean_text)  # "Special characters unicode like should be removed"

Special characters unicode like  should be removed


In [None]:
# Text Normalization with REGEX

import re

def normalize_text(text):
    """Lowercase `text` and replace URLs, emails and phone numbers with placeholders.

    Whitespace runs are collapsed to a single space, any character repeated
    three or more times in a row is reduced to one occurrence, and the
    result is stripped of leading/trailing whitespace.
    """
    # Ordered (pattern, replacement) rules, applied one after another.
    #   \s = whitespace (spaces, tabs, newlines)
    #   \S = any character that is NOT whitespace
    rules = (
        (r'https?://\S+|www\.\S+', '[URL]'),                         # URLs
        (r'\S+@\S+', '[EMAIL]'),                                     # emails
        (r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '[PHONE]'),         # phone numbers
        (r'\s+', ' '),                                               # whitespace runs
        (r'(.)\1{2,}', r'\1'),                                       # "hellooooo" -> "hello"
    )

    # Lowercasing happens first, so the uppercase placeholders stay intact
    result = text.lower()
    for regex, substitute in rules:
        result = re.sub(regex, substitute, result)

    return result.strip()

text = "Contact us at example@gmail.com or visit https://example.com or call 123-456-7890"
normalized_text = normalize_text(text)
print(normalized_text)  # "contact us at [EMAIL] or visit [URL] or call [PHONE]"

contact us at [EMAIL] or visit [URL] or call [PHONE]


5. Tokenization

Tokenization is the process of splitting text into smaller pieces, called tokens.
These tokens can be:

Words → Word-level tokenization

Characters → Character-level tokenization

Subwords → Subword-level tokenization (used in models like BERT, GPT)

In [None]:
# Using NLTK
import nltk
# This line was already present, but might not have downloaded punkt_tab
# nltk.download('punkt')
# Download the specific missing resource
#nltk.download('punkt_tab')

def tokenize_text(text):
    """Return `(word_tokens, sentence_tokens)` for `text` using NLTK's tokenizers."""
    words = nltk.word_tokenize(text)      # word-level split
    sentences = nltk.sent_tokenize(text)  # sentence-level split
    return words, sentences

text = "Hello world. How are you today?"
word_tokens, sentence_tokens = tokenize_text(text)
# Punctuation marks come back as separate word tokens
print(word_tokens)  # ['Hello', 'world', '.', 'How', 'are', 'you', 'today', '?']
print(sentence_tokens)  # ['Hello world.', 'How are you today?']


# we can look at letters as well

['Hello', 'world', '.', 'How', 'are', 'you', 'today', '?']
['Hello world.', 'How are you today?']


BERT tokenizer

In [None]:
#pip install transformers


In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
# "Uncased" means the text is lowercased during tokenization, as the printed tokens show.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Split the sentence into the smaller units (tokens) that the model consumes.
tokens = tokenizer.tokenize("Hello world. How are you today?")
print(tokens)


['hello', 'world', '.', 'how', 'are', 'you', 'today', '?']


In [None]:
from transformers import AutoTokenizer

# Same sentence as the BERT example, tokenized with the GPT-2 tokenizer for comparison.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer.tokenize("Hello world. How are you today?")
print(tokens)

# In the output, the Ġ symbol marks a space before the word (byte-level BPE tokenization).

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

['Hello', 'Ġworld', '.', 'ĠHow', 'Ġare', 'Ġyou', 'Ġtoday', '?']


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# A long, uncommon word is split into several subword pieces
tokens = tokenizer.tokenize("unbelievability")
print(tokens)
# The trailing space in the input shows up as its own 'Ġ' token
tokens = tokenizer.tokenize("banana ")
print(tokens)


['un', 'bel', 'iev', 'ability']
['ban', 'ana', 'Ġ']


6. Stemming

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

def stem_words(tokens):
    """Stem `tokens` with the Porter, Lancaster and Snowball (English) stemmers.

    Returns a 3-tuple of lists in that order, each aligned with `tokens`.
    """
    stemmers = (PorterStemmer(), LancasterStemmer(), SnowballStemmer('english'))
    return tuple([stemmer.stem(word) for word in tokens] for stemmer in stemmers)

tokens = ['running', 'runs', 'ran', 'easily', 'fairly']
porter_stems, lancaster_stems, snowball_stems = stem_words(tokens)
# The three algorithms agree on the "run" forms but differ on the adverbs
print(f"Porter: {porter_stems}")    # ['run', 'run', 'ran', 'easili', 'fairli']
print(f"Lancaster: {lancaster_stems}")  # ['run', 'run', 'ran', 'easy', 'fair']
print(f"Snowball: {snowball_stems}")    # ['run', 'run', 'ran', 'easili', 'fair']

Porter: ['run', 'run', 'ran', 'easili', 'fairli']
Lancaster: ['run', 'run', 'ran', 'easy', 'fair']
Snowball: ['run', 'run', 'ran', 'easili', 'fair']


7. Lemmatization

In [None]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize_words(tokens):
    """Return the WordNet lemma of each token, in input order.

    Only the token is passed to the lemmatizer (no part-of-speech tag).
    NOTE(review): presumably the default tag is noun, which would explain
    why 'running' and 'better' come back unchanged — confirm.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))

tokens = ['running', 'runs', 'ran', 'better', 'mice']
lemmatized_tokens = lemmatize_words(tokens)
print(lemmatized_tokens)  # ['running', 'run', 'ran', 'better', 'mouse']

[nltk_data] Downloading package wordnet to /root/nltk_data...


['running', 'run', 'ran', 'better', 'mouse']


8. Spell Correction

In [None]:
#!pip install pyspellchecker

from spellchecker import SpellChecker

def correct_spelling(tokens):
    """Return the most likely spelling correction for each token.

    A token for which the spell checker offers no candidate is returned
    unchanged, so the result always has one string per input token.
    """
    spell = SpellChecker()
    corrected = []
    for token in tokens:
        suggestion = spell.correction(token)
        # correction() can return None when it has no candidate; keep the
        # original token instead of leaking None into the output
        corrected.append(token if suggestion is None else suggestion)
    return corrected

tokens = ['helo', 'wrld', 'example','appble']
corrected_tokens = correct_spelling(tokens)
# The checker picks the most likely candidate, which is not always the intended word ('helo' -> 'help')
print(corrected_tokens)  # ['help', 'world', 'example', 'apple']

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2
['help', 'world', 'example', 'apple']


9. Text Normalization with TextBlob

In [None]:
#!pip install textblob
#!python -m textblob.download_corpora

from textblob import TextBlob

def normalize_with_textblob(text):
    """Return `(corrected_text, sentiment, noun_phrases)` computed by TextBlob.

    The sentiment and noun phrases are taken from the original blob, not
    from the spell-corrected text.
    """
    blob = TextBlob(text)
    fixed, mood, phrases = blob.correct(), blob.sentiment, blob.noun_phrases
    return str(fixed), mood, phrases

text = "The quik brown fox jumpd over the lazzy dog."
# text = "guuod feeling"
corrected, sentiment, noun_phrases = normalize_with_textblob(text)

print(f"Corrected: {corrected}")
print(f"Sentiment: {sentiment}")
print(f"Noun phrases: {noun_phrases}")

# Corrected: The quick brown fox jumped over the lazy dog.
# Sentiment: Sentiment(polarity=0.0, subjectivity=0.0)      (a weak approach to sentiment analysis)
# Noun phrases: ['brown fox jumpd', 'lazzy dog']   (still misspelled: taken from the uncorrected text)

Corrected: The quick brown fox jumped over the lazy dog.
Sentiment: Sentiment(polarity=0.0, subjectivity=0.0)
Noun phrases: ['brown fox jumpd', 'lazzy dog']


10. Named Entity Recognition (NER)

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
# Named Entity Recognition (an alternative to lemmatization and stemming)
import spacy

def extract_entities(text):
    """Return a list of `(entity_text, entity_label)` pairs found in `text`."""
    pipeline = spacy.load("en_core_web_sm")  # the spaCy model, loaded on every call
    return [(span.text, span.label_) for span in pipeline(text).ents]

text = "Apple is looking at buying U.K. startup for $1 billion"
#text = "i am eating an Apple "

entities = extract_entities(text)
print(entities)  # [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


11. Text Cleaning (HTML/XML tags)


In [None]:
import re
from bs4 import BeautifulSoup

def clean_html(html_text):
    """Return the visible text of `html_text`, extracted with BeautifulSoup.

    Text fragments are stripped and joined with a single space.
    """
    parsed = BeautifulSoup(html_text, "html.parser")
    return parsed.get_text(separator=" ", strip=True)

def clean_html_regex(html_text):
    """Return `html_text` with everything between '<' and the next '>' removed.

    This is a plain textual strip, not an HTML parser: the text between the
    tags is left untouched.
    """
    # [^>]* also matches newlines, so a tag whose attributes wrap onto the
    # next line is removed too (the previous '.*?' stopped at a line break)
    clean_text = re.sub(r'<[^>]*>', '', html_text)
    return clean_text

html = "<div><p>This is <b>sample</b> HTML text?.</p></div>"
clean_bs = clean_html(html)
clean_re = clean_html_regex(html)

# Only the tags are removed; punctuation in the text (the "?.") is kept
print(clean_bs)  # "This is sample HTML text?."
print(clean_re)  # "This is sample HTML text?."

This is sample HTML text?.
This is sample HTML text?.


12. Contractions Expansion

In [None]:
 #!pip install contractions
import contractions

def expand_contractions(text):
    """Return `text` with contractions expanded by the `contractions` package."""
    return contractions.fix(text)

text = "I can't do this and I won't try it."
expanded = expand_contractions(text)
print(expanded)  # "I cannot do this and I will not try it."

I cannot do this and I will not try it.


13. Text Vectorization


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def vectorize_texts(texts):
    """Vectorize `texts` as Bag-of-Words counts and as TF-IDF weights.

    Returns `(bow_matrix, bow_feature_names, tfidf_matrix, tfidf_feature_names)`.
    Each vectorizer is fitted on `texts` independently.
    """
    # Bag of Words: raw term counts
    bow_vectorizer = CountVectorizer()
    bow_matrix = bow_vectorizer.fit_transform(texts)

    # TF-IDF weighting
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

    bow_vocab = bow_vectorizer.get_feature_names_out()
    tfidf_vocab = tfidf_vectorizer.get_feature_names_out()
    return bow_matrix, bow_vocab, tfidf_matrix, tfidf_vocab

texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
]

bow, bow_features, tfidf, tfidf_features = vectorize_texts(texts)
# .toarray() turns the returned matrices into dense arrays for printing
print("BoW features:", bow_features)
print("BoW matrix:\n", bow.toarray())
print("\nTF-IDF features:", tfidf_features)
print("TF-IDF matrix:\n", tfidf.toarray())

14. Word Embeddings


In [None]:
# pip install gensim
# pip install --upgrade --force-reinstall --no-cache-dir numpy gensim
# restart kernel
import gensim.downloader as api

def get_word_embeddings(words):
    """Return a dict mapping each known word in `words` to its Word2Vec vector.

    Words missing from the model's vocabulary are left out of the result
    instead of raising KeyError, so one unknown word does not abort the
    whole lookup.
    """
    # Load pre-trained Word2Vec embeddings (loaded on every call)
    model = api.load("word2vec-google-news-300")

    return {word: model[word] for word in words if word in model}

words = ["king", "queen", "man", "woman"]
embeddings = get_word_embeddings(words)

for word, vector in embeddings.items():
    print(f"{word}: vector shape {vector.shape}")

# king: vector shape (300,)
# queen: vector shape (300,)
# man: vector shape (300,)
# woman: vector shape (300,)

18. Language Detection and Translation


In [None]:
# !pip install googletrans==4.0.0rc1 -q
# !pip install langdetect -q
# restart kernel

from langdetect import detect
from googletrans import Translator

def detect_and_translate(text, target_lang='en'):
    """Detect the language of `text` and translate it into `target_lang`.

    Returns `(detected_language_code, translated_text)`.
    """
    detected = detect(text)  # language code reported by langdetect
    result = Translator().translate(text, src=detected, dest=target_lang)
    return detected, result.text

text = "Bonjour le monde"
source, translation = detect_and_translate(text)
print(f"Detected language: {source}")
print(f"Translation: {translation}")

# Detected language: fr
# Translation: Hello world

19. Custom Vocabulary Creation


In [None]:
from collections import Counter

def create_vocabulary(texts, min_freq=2, max_vocab_size=10000):
    """Build a frequency-filtered vocabulary from `texts`.

    Each text is lowercased and word-tokenized with NLTK. Of the
    `max_vocab_size` most common tokens, those seen at least `min_freq`
    times are kept.

    Returns `(vocab, token2id, id2token)`: token -> count, token -> integer
    id (assigned in `vocab` order, starting at 0), and the inverse mapping.
    """
    # Count tokens text by text
    token_counts = Counter()
    for text in texts:
        token_counts.update(nltk.word_tokenize(text.lower()))

    # Apply the size cap first, then the frequency threshold
    vocab = {}
    for token, count in token_counts.most_common(max_vocab_size):
        if count >= min_freq:
            vocab[token] = count

    # Mapping dictionaries in both directions
    token2id = {token: idx for idx, token in enumerate(vocab)}
    id2token = dict(enumerate(vocab))

    return vocab, token2id, id2token

texts = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

vocab, token2id, id2token = create_vocabulary(texts, min_freq=2)
print("Vocabulary:", vocab)
print("Token to ID mapping:", token2id)

Assignment

20. Comprehensive Preprocessing Pipeline


In [None]:
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# 'punkt_tab' is the tokenizer resource that recent NLTK releases look up for
# word_tokenize; 'punkt' is kept so older releases still work.
nltk.download(['punkt', 'punkt_tab', 'stopwords', 'wordnet'])

def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """
    Comprehensive text preprocessing pipeline

    Steps, in order: lowercase, delete punctuation, delete digits, normalise
    whitespace, word-tokenize, then optionally drop English stop words and
    lemmatize with WordNet.

    Args:
        text: the raw input string.
        remove_stopwords: when True, drop tokens found in NLTK's English
            stop-word list.
        lemmatize: when True, replace each token with its WordNet lemma.

    Returns:
        The list of processed tokens.
    """
    # Lowercase
    text = text.lower()

    # Remove punctuation (characters are deleted, not replaced by a space)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove surrounding whitespace and collapse inner runs to one space
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords (the text is already lowercase, so no case folding is needed)
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

text = "This is an example! The preprocessing pipeline removes punctuation, numbers (123), and stopwords."
tokens = preprocess_text(text)
print(tokens)