In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import collections
import re

In [2]:
text_data = ["cat", "dog", "fish", "dog", "cat"]

# To implement label encoding

In [3]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(text_data)
print(encoded_labels)

[0 1 2 1 0]


# To implement one hot encoding

In [4]:
text_data_reshaped = [[label] for label in text_data]
one_hot_encoder = OneHotEncoder()
one_hot_encoded_data = one_hot_encoder.fit_transform(text_data_reshaped)
dense_encoded_data = one_hot_encoded_data.toarray()
print(dense_encoded_data)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


# To implement BoW

In [9]:
sample_data = [
    "The quick brown fox",
    "jumps over the lazy dog",
    "The dog barks",
    "The fox runs quickly",
]

tokenized_text = [sentence.lower().split() for sentence in sample_data]

vocabulary = set(word for sentence in tokenized_text for word in sentence)

bow_matrix = []

for sentence in tokenized_text:
    bow_vector = [sentence.count(word) for word in vocabulary]
    bow_matrix.append(bow_vector)

print("Vocabulary (unique words):", vocabulary)

print("Bag of Words Matrix:")
for row in bow_matrix:
    print(row)

Vocabulary (unique words): {'runs', 'lazy', 'dog', 'fox', 'brown', 'over', 'quickly', 'quick', 'barks', 'jumps', 'the'}
Bag of Words Matrix:
[0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1]
[0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1]
[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1]
[1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]


In [1]:
from collections import Counter
import re

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    
    return text

def create_bag_of_words(texts):
    bag_of_words = Counter()
    
    for text in texts:
        preprocessed_text = preprocess_text(text)
        words = preprocessed_text.split()
        bag_of_words.update(words)
    
    return bag_of_words

# Example texts
texts = [
    "This is an example sentence.",
    "Another example for demonstration.",
    "Example sentences help illustrate concepts."
]

bag_of_words = create_bag_of_words(texts)

In [2]:
print(bag_of_words)

Counter({'example': 3, 'this': 1, 'is': 1, 'an': 1, 'sentence': 1, 'another': 1, 'for': 1, 'demonstration': 1, 'sentences': 1, 'help': 1, 'illustrate': 1, 'concepts': 1})


# Explore Scikit learn to implement TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example texts
texts = [
    "This is an example sentence.",
    "Another example for demonstration.",
    "Example sentences help illustrate concepts."
]

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the texts using the vectorizer
tfidf_matrix = vectorizer.fit_transform(texts)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array for easier manipulation
dense_matrix = tfidf_matrix.toarray()

# Print the feature names and TF-IDF matrix
print("Feature Names:", feature_names)
print("TF-IDF Matrix:")
print(dense_matrix)

Feature Names: ['an' 'another' 'concepts' 'demonstration' 'example' 'for' 'help'
 'illustrate' 'is' 'sentence' 'sentences' 'this']
TF-IDF Matrix:
[[0.47952794 0.         0.         0.         0.28321692 0.
  0.         0.         0.47952794 0.47952794 0.         0.47952794]
 [0.         0.54645401 0.         0.54645401 0.32274454 0.54645401
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.47952794 0.         0.28321692 0.
  0.47952794 0.47952794 0.         0.         0.47952794 0.        ]]


# To implement TF-IDF

In [5]:
import math
from collections import Counter
import re

# Example texts
texts = [
    "This is an example sentence.",
    "Another example for demonstration.",
    "Example sentences help illustrate concepts."
]

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

def compute_tf(text):
    words = preprocess_text(text).split()
    word_count = Counter(words)
    total_words = len(words)
    
    tf_values = {word: count / total_words for word, count in word_count.items()}
    return tf_values

def compute_idf(texts):
    num_documents = len(texts)
    word_to_document_count = Counter()
    
    for text in texts:
        words = set(preprocess_text(text).split())
        word_to_document_count.update(words)
    
    idf_values = {word: math.log(num_documents / (count + 1)) for word, count in word_to_document_count.items()}
    return idf_values

# Compute TF and IDF values
tf_values = [compute_tf(text) for text in texts]
idf_values = compute_idf(texts)

# Create a sorted list of unique words from both TF and IDF dictionaries
unique_words = sorted(set(word for tf_dict in tf_values for word in tf_dict) | set(idf_values))

# Compute TF-IDF matrix
tfidf_matrix = []

for tf in tf_values:
    tfidf_vector = [tf.get(word, 0) * idf_values.get(word, 0) for word in unique_words]
    tfidf_matrix.append(tfidf_vector)

# Print TF-IDF matrix
for row in tfidf_matrix:
    print(row)


[0.08109302162163289, 0.0, 0.0, 0.0, -0.05753641449035618, 0.0, 0.0, 0.0, 0.08109302162163289, 0.08109302162163289, 0.0, 0.08109302162163289]
[0.0, 0.1013662770270411, 0.0, 0.1013662770270411, -0.07192051811294523, 0.1013662770270411, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.08109302162163289, 0.0, -0.05753641449035618, 0.0, 0.08109302162163289, 0.08109302162163289, 0.0, 0.0, 0.08109302162163289, 0.0]
