In [None]:
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

try:
    from transformers import AutoTokenizer, AutoModel
    import torch
except ImportError:
    print("Hugging Face Transformers and PyTorch not installed. Please install with: pip install transformers torch")

try:
    import gensim.downloader as api
    from gensim.models import Word2Vec
except ImportError:
    print("Gensim not installed. Please install with: pip install gensim")

# Download NLTK data if not already present.
# Newer NLTK releases (3.9+) load the 'punkt_tab' resource inside word_tokenize /
# sent_tokenize, while older releases load 'punkt'. Both are ensured here so the
# tokenization cells work regardless of the installed NLTK version.
# nltk.download() reports failures by printing and returning False, so an
# unavailable package does not abort the cell.
for _nltk_path, _nltk_package, _nltk_label in (
    ('tokenizers/punkt', 'punkt', 'tokenizer'),
    ('tokenizers/punkt_tab', 'punkt_tab', 'tokenizer tables'),
    ('corpora/stopwords', 'stopwords', 'corpus'),
):
    try:
        nltk.data.find(_nltk_path)
    except LookupError:
        print(f"Downloading NLTK '{_nltk_package}' {_nltk_label}...")
        nltk.download(_nltk_package)

# Shared example inputs: `text` feeds the tokenization cells and
# `sentences` feeds the sentence-tokenization and embedding cells.
text = 'Hello, world! This is an example sentence for NLTK tokenization.'
sentences = [
    'The cat sat on the mat.',
    'The dog ran in the park.',
    'A cat and a dog are pets.',
    'Cats and dogs are common household animals.',
]

print("Variables 'text' and 'sentences' initialized.")

Variables 'text' and 'sentences' initialized.


### 1. Whitespace Tokenization
This is the simplest form of tokenization, where text is split on whitespace characters. It's fast but less precise, because punctuation stays attached to the adjacent word (e.g., 'Hello,' and 'world!' are each kept as a single token).

### 2. Character Tokenization
This method breaks down text into individual characters. It's very granular but often results in a large number of tokens, which can make analysis complex and computationally intensive.

### 3. Subword Tokenization (using BERT tokenizer)
Subword tokenization splits words into smaller units (subwords or morphemes). This approach helps to handle out-of-vocabulary words and reduces vocabulary size while still capturing semantic meaning. BERT's WordPiece tokenizer is a common example.

### 4. Tokenization with Stop Word Removal
This technique involves first tokenizing the text and then removing common words (stop words) that carry little semantic meaning (e.g., 'the', 'is', 'a'). This can reduce noise and improve the efficiency of text analysis.

### 5. Sentence Tokenization
Sentence tokenization divides a text into a sequence of sentences. This is crucial for tasks that require understanding text at the sentence level, such as sentiment analysis or machine translation. NLTK's `sent_tokenize` is widely used for this.

### 6. Word Tokenization (using NLTK)
Word tokenization breaks a text into individual words or punctuation marks. NLTK's `word_tokenize` is a robust method that typically separates punctuation from words, offering a more refined tokenization than simple whitespace splitting.

### 1. One-Hot Encoding
One-Hot Encoding represents words as sparse binary vectors, where each word is assigned a unique index, and its vector has a '1' at that index and '0's elsewhere. It's simple but leads to high-dimensional vectors for large vocabularies and doesn't capture semantic relationships between words.

### 2. Bag-of-Words (BoW)
Bag-of-Words represents a document as an unordered collection of words, disregarding grammar and word order. It focuses on the frequency of words in a document. While simple and effective for some tasks, it loses contextual information and can result in high-dimensional sparse vectors.

### 3. Word2Vec
Word2Vec is a predictive model that learns to represent words as dense, continuous vectors (embeddings) in a lower-dimensional space. Words with similar meanings are located closer together in this vector space, capturing semantic relationships. It uses either a Continuous Bag-of-Words (CBOW) or Skip-gram architecture.

### 4. GloVe (Conceptual / using gensim.downloader for pre-trained)
Global Vectors for Word Representation (GloVe) is an unsupervised learning algorithm for obtaining vector representations for words. It captures global co-occurrence statistics of words in a corpus. Like Word2Vec, it generates dense embeddings where semantic similarities are reflected by vector proximity. Pre-trained GloVe models are often used for efficiency.

### 5. BERT Embeddings
BERT (Bidirectional Encoder Representations from Transformers) provides contextualized word embeddings. Unlike Word2Vec or GloVe, BERT generates embeddings for a word that can vary based on its context in a sentence, making it highly effective for understanding nuanced meanings. It's a deep neural network model trained on a large corpus of text.

### 1. Whitespace Tokenization

In [None]:
# Plain str.split(): tokens are whatever sits between runs of whitespace,
# so punctuation remains attached to the neighbouring word.
print("1. Whitespace Tokenization:")
whitespace_tokens = text.split()
print(
    f"Original text: '{text}'",
    f"Whitespace tokens: {whitespace_tokens}",
    sep="\n",
)

1. Whitespace Tokenization:
Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
Whitespace tokens: ['Hello,', 'world!', 'This', 'is', 'an', 'example', 'sentence', 'for', 'NLTK', 'tokenization.']


### 2. Character Tokenization

In [None]:
# Character-level view: every character of `text` (spaces and punctuation
# included) becomes its own token.
print("2. Character Tokenization:")
char_tokens = [*text]
print(
    f"Original text: '{text}'",
    f"Character tokens: {char_tokens}",
    sep="\n",
)

2. Character Tokenization:
Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
Character tokens: ['H', 'e', 'l', 'l', 'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', ' ', 'T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', ' ', 's', 'e', 'n', 't', 'e', 'n', 'c', 'e', ' ', 'f', 'o', 'r', ' ', 'N', 'L', 'T', 'K', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', '.']


### 3. Subword Tokenization (using BERT tokenizer)

In [None]:
# Subword tokenization with the pre-trained "bert-base-uncased" tokenizer.
# This is a best-effort demo cell: any failure (missing package, tokenizer
# cannot be loaded, ...) is reported and the example is skipped.
try:
    from transformers import AutoTokenizer
    tokenizer_bert_subword = AutoTokenizer.from_pretrained("bert-base-uncased")
    subword_tokens = tokenizer_bert_subword.tokenize(text)
except Exception as e:  # ImportError is an Exception subclass, so one clause covers both
    print(f"Could not perform subword tokenization: {e}")
    print("Skipping subword tokenization. Ensure 'transformers' library is installed.")
else:
    print(f"Original text: '{text}'")
    print(f"Subword tokens: {subword_tokens}")

Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
Subword tokens: ['hello', ',', 'world', '!', 'this', 'is', 'an', 'example', 'sentence', 'for', 'nl', '##t', '##k', 'token', '##ization', '.']


### 4. Tokenization with Stop Word Removal

In [None]:
from nltk.corpus import stopwords

print("4. Tokenization with Stop Word Removal:")
# The text is lowercased before tokenizing, so tokens are compared against
# the English stop-word set in lowercase.
stop_words = set(stopwords.words('english'))
word_tokens_with_stopwords = nltk.word_tokenize(text.lower())
# Keep a token only if it is purely alphanumeric (this discards punctuation
# tokens) and is not a stop word.
filtered_tokens = [
    token
    for token in word_tokens_with_stopwords
    if token.isalnum() and token not in stop_words
]
print(
    f"Original text: '{text}'",
    f"Tokens after stop word removal: {filtered_tokens}",
    sep="\n",
)

4. Tokenization with Stop Word Removal:
Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
Tokens after stop word removal: ['hello', 'world', 'example', 'sentence', 'nltk', 'tokenization']


### 5. Sentence Tokenization

In [None]:
print("5. Sentence Tokenization:")
# Split the `text` example into its sentences.
sentence_tokens = nltk.sent_tokenize(text)
# Do the same for every entry of `sentences`, flattening the per-entry
# results into one list.
all_sentences_tokenized = [
    piece
    for entry in sentences
    for piece in nltk.sent_tokenize(entry)
]
print(
    f"Original text: '{text}'",
    f"Sentence tokens (from 'text'): {sentence_tokens}",
    f"Sentence tokens (from 'sentences' list): {all_sentences_tokenized}",
    sep="\n",
)

5. Sentence Tokenization:
Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
Sentence tokens (from 'text'): ['Hello, world!', 'This is an example sentence for NLTK tokenization.']
Sentence tokens (from 'sentences' list): ['The cat sat on the mat.', 'The dog ran in the park.', 'A cat and a dog are pets.', 'Cats and dogs are common household animals.']


### 6. Word Tokenization (using NLTK)

In [None]:
print("6. Word Tokenization (using NLTK):")
# Unlike the whitespace split, word_tokenize emits punctuation marks as
# separate tokens.
word_tokens_nltk = nltk.word_tokenize(text)
print(
    f"Original text: '{text}'",
    f"NLTK Word tokens: {word_tokens_nltk}",
    sep="\n",
)

6. Word Tokenization (using NLTK):
Original text: 'Hello, world! This is an example sentence for NLTK tokenization.'
NLTK Word tokens: ['Hello', ',', 'world', '!', 'This', 'is', 'an', 'example', 'sentence', 'for', 'NLTK', 'tokenization', '.']


--- Embedding Examples ---

In [None]:
# Sample corpus for the embedding examples. `corpus` is bound to the same list
# object as `sentences` (an alias, not a copy).
corpus = sentences
print(f"Corpus for embeddings: {corpus}")

Corpus for embeddings: ['The cat sat on the mat.', 'The dog ran in the park.', 'A cat and a dog are pets.', 'Cats and dogs are common household animals.']


### 1. One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

print("1. One-Hot Encoding:")
# Vocabulary: every distinct lowercased token produced by word_tokenize over
# the corpus, in sorted order.
all_words = [
    token
    for sentence in corpus
    for token in nltk.word_tokenize(sentence.lower())
]
unique_words = sorted(set(all_words))

# Manual variant: a zero vector with a single 1 at the word's vocabulary index.
word_for_onehot = 'cat'
if word_for_onehot not in unique_words:
    print(f"'{word_for_onehot}' not in vocabulary for one-hot encoding.")
else:
    one_hot_index = unique_words.index(word_for_onehot)
    one_hot_vector = np.zeros(len(unique_words))
    one_hot_vector[one_hot_index] = 1
    print(f"Manual one-hot encoding for '{word_for_onehot}': {one_hot_vector}")

# sklearn variant: the encoder is fitted on the vocabulary laid out as a
# single-feature column (one word per row).
words_for_encoding = np.array(unique_words).reshape(-1, 1)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(words_for_encoding)

# A literal [['dog']] is already a (1, 1) column, the shape transform() is given.
encoded_word_example = np.array([['dog']])
if 'dog' not in unique_words:
    print("'dog' not in vocabulary for sklearn one-hot encoding.")
else:
    encoded_vector = encoder.transform(encoded_word_example)
    print(f"One-hot encoding for 'dog' (using sklearn): {encoded_vector[0]}")

1. One-Hot Encoding:
Manual one-hot encoding for 'cat': [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
One-hot encoding for 'dog' (using sklearn): [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


### 2. Bag-of-Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

print("2. Bag-of-Words (BoW):")
# Fit the vocabulary and count term occurrences per sentence in one step.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)
print(f"Vocabulary: {vectorizer.get_feature_names_out()}")

# Convert to a dense array once and reuse it for both printouts.
dense_counts = bow_matrix.toarray()
print(f"Bag-of-Words matrix (shape {bow_matrix.shape}):\n{dense_counts}")
print(f"Example sentence BoW for '{corpus[0]}': {dense_counts[0]}")

2. Bag-of-Words (BoW):
Vocabulary: ['and' 'animals' 'are' 'cat' 'cats' 'common' 'dog' 'dogs' 'household' 'in'
 'mat' 'on' 'park' 'pets' 'ran' 'sat' 'the']
Bag-of-Words matrix (shape (4, 17)):
[[0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 2]
 [0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 2]
 [1 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0]
 [1 1 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0]]
Example sentence BoW for 'The cat sat on the mat.': [0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 2]


### 3. Word2Vec

In [None]:
# Train a small Word2Vec model on the example corpus and inspect one word.
# This is a best-effort demo cell: any failure is reported and the example is skipped.
try:
    from gensim.models import Word2Vec
    print("3. Word2Vec:")
    tokenized_corpus_w2v = [nltk.word_tokenize(s.lower()) for s in corpus]

    # Train a simple Word2Vec model.
    # Passing the corpus to the constructor builds the vocabulary and trains the
    # model, so no separate .train() call is made; the intended number of passes
    # is set via `epochs` instead of training a second time.
    model_w2v = Word2Vec(
        sentences=tokenized_corpus_w2v,
        vector_size=100,
        window=5,
        min_count=1,
        workers=4,
        epochs=10,
    )

    word_for_w2v = 'cat'
    if word_for_w2v in model_w2v.wv:
        print(f"Word2Vec embedding for '{word_for_w2v}':\n{model_w2v.wv[word_for_w2v][:10]}...") # print first 10 dimensions
        # The similarity query is only valid for in-vocabulary words, so it
        # lives inside the same guard and uses the same query word.
        word_similarities = model_w2v.wv.most_similar(word_for_w2v, topn=3)
        print(f"Words most similar to '{word_for_w2v}': {word_similarities}")
    else:
        print(f"'{word_for_w2v}' not in Word2Vec model vocabulary.")

except Exception as e:  # ImportError is an Exception subclass, so one clause covers both
    print(f"Could not train Word2Vec model: {e}")
    print("Skipping Word2Vec example. Ensure 'gensim' library is installed.")



3. Word2Vec:
Word2Vec embedding for 'cat':
[ 8.1322715e-03 -4.4573341e-03 -1.0683573e-03  1.0063648e-03
 -1.9111396e-04  1.1481774e-03  6.1138608e-03 -2.0271540e-05
 -3.2459653e-03 -1.5107286e-03]...
Words most similar to 'cat': [('mat', 0.1637783795595169), ('are', 0.1460077166557312), ('sat', 0.07480262219905853)]


### 4. GloVe (Conceptual / using gensim.downloader for pre-trained)

In [None]:
# GloVe demo: show already-available vectors if the kernel has them,
# otherwise try to load a small pre-trained GloVe model via gensim.downloader.
try:
    import gensim.downloader as api
    print("4. GloVe (Conceptual / using gensim.downloader for pre-trained):")
    # NOTE(review): `cat_vector` / `dog_vector` are not assigned in the cells visible
    # here — presumably left over from an earlier kernel session; confirm where they
    # come from. They are only shown as stand-ins, not loaded from a GloVe model.
    if not ({'cat_vector', 'dog_vector'} <= globals().keys()):
        print("No pre-existing 'cat_vector' or 'dog_vector' found for conceptual GloVe example.")
        print("GloVe embeddings are typically loaded from pre-trained files (e.g., .txt files) or via libraries like `gensim.downloader`.")
        print("Example using `gensim.downloader` (may require `pip install smart_open` and take time to download):")
        try:
            print("  Attempting to load small pre-trained GloVe model (glove-wiki-gigaword-50)...")
            glove_model = api.load("glove-wiki-gigaword-50")
            word_for_glove = 'dog'
            if word_for_glove not in glove_model:
                print(f"  '{word_for_glove}' not in GloVe model vocabulary.")
            else:
                print(f"  GloVe embedding for '{word_for_glove}':\n  {glove_model[word_for_glove][:10]}...")
                print(f"  Words most similar to '{word_for_glove}': {glove_model.most_similar(word_for_glove, topn=3)}")
        except Exception as e:  # best-effort: report any load failure and move on
            print(f"  Could not load GloVe model from gensim.downloader: {e}")
            print("  Skipping GloVe download. Ensure 'gensim' and 'smart_open' are installed.")
    else:
        print(f"Using existing `cat_vector` as a conceptual GloVe-like embedding: {cat_vector[:10]}...")
        print(f"Using existing `dog_vector` as a conceptual GloVe-like embedding: {dog_vector[:10]}...")
        print("GloVe embeddings are typically loaded from pre-trained files and provide similar vector representations.")
except ImportError:
    print("Gensim not installed. Skipping GloVe example.")

4. GloVe (Conceptual / using gensim.downloader for pre-trained):
Using existing `cat_vector` as a conceptual GloVe-like embedding: [ 8.1322715e-03 -4.4573341e-03 -1.0683573e-03  1.0063648e-03
 -1.9111396e-04  1.1481774e-03  6.1138608e-03 -2.0271540e-05
 -3.2459653e-03 -1.5107286e-03]...
Using existing `dog_vector` as a conceptual GloVe-like embedding: [-0.00872748  0.00213016 -0.00087354 -0.00931909 -0.00942814 -0.00141072
  0.00443241  0.00370407 -0.00649869 -0.00687307]...
GloVe embeddings are typically loaded from pre-trained files and provide similar vector representations.


### 5. BERT Embeddings

In [None]:
# Contextual embeddings from the pre-trained "bert-base-uncased" model.
# This is a best-effort demo cell: any failure is reported and the example is skipped.
try:
    from transformers import AutoTokenizer, AutoModel
    import torch

    print("5. BERT Embeddings:")
    tokenizer_bert_embed = AutoTokenizer.from_pretrained("bert-base-uncased")
    model_bert_embed = AutoModel.from_pretrained("bert-base-uncased")

    # Encode the first corpus sentence as PyTorch tensors.
    sentence_for_bert = corpus[0]
    inputs = tokenizer_bert_embed(
        sentence_for_bert,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model_bert_embed(**inputs)

    # last_hidden_state holds one vector per input token.
    token_embeddings = outputs.last_hidden_state
    print(f"BERT token embeddings shape (batch_size, sequence_length, hidden_size): {token_embeddings.shape}")

    # The vector at position 0 of the sequence is used here as the sentence
    # embedding; averaging all token vectors is a common alternative.
    sentence_embedding = token_embeddings[0, 0, :].numpy()
    print(f"First token ('[CLS]') embedding (first 10 dimensions): {sentence_embedding[:10]}...")
    print(f"BERT sentence embedding (from [CLS] token, first 10 dimensions): {sentence_embedding[:10]}...")

except Exception as e:  # ImportError is an Exception subclass, so one clause covers both
    print(f"Could not perform BERT embeddings: {e}")
    print("Skipping BERT embeddings example. Ensure 'transformers' and 'torch' libraries are installed.")

5. BERT Embeddings:
BERT token embeddings shape (batch_size, sequence_length, hidden_size): torch.Size([1, 9, 768])
First token ('[CLS]') embedding (first 10 dimensions): [-0.3642237  -0.05305378 -0.36732262 -0.02967339 -0.460784   -0.10106134
  0.01669817  0.59577715 -0.11770311  0.10289837]...
BERT sentence embedding (from [CLS] token, first 10 dimensions): [-0.3642237  -0.05305378 -0.36732262 -0.02967339 -0.460784   -0.10106134
  0.01669817  0.59577715 -0.11770311  0.10289837]...
