In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources this notebook relies on (no-op if already present)
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)


# Toy corpus: five short sentences used by every encoding demo below
documents = [
    "Wrist watch looks elegant",
    "Watching TV feels immersive",
    "Toy train feels fragile",
    "Fitness band trains athletes",
    "Camera lens broke. Camera lens replaced quickly",
]


# Initialize the English stopword set and the WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocess each sentence: whitespace tokenization, lowercasing, trimming of
# sentence punctuation (. ? !), stopword removal, and lemmatization.
# NOTE: lemmatize() is called without a part-of-speech tag, so inflected verb
# forms such as "watching" are left as-is rather than reduced to "watch".
preprocessed_documents = []
for doc in documents:
    cleaned_tokens = []
    for raw_token in doc.split():
        # Normalize once and reuse the result for both the stopword test
        # and the lemmatizer, instead of recomputing it per use.
        token = raw_token.lower().strip(".?!")
        # Skip stopwords, and tokens that were pure punctuation (now empty).
        if not token or token in stop_words:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))
    # Each document is stored as a single space-joined string.
    preprocessed_documents.append(" ".join(cleaned_tokens))

# Collect every distinct token across the preprocessed strings into a sorted vocabulary
vocabulary = sorted({word for doc in preprocessed_documents for word in doc.split()})


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:

# Display the sorted list of unique tokens built from the preprocessed documents
vocabulary

['athlete',
 'band',
 'broke',
 'camera',
 'elegant',
 'feel',
 'fitness',
 'fragile',
 'immersive',
 'lens',
 'look',
 'quickly',
 'replaced',
 'toy',
 'train',
 'tv',
 'watch',
 'watching',
 'wrist']

In [None]:
# Binary (presence/absence) encoding of each sentence over the vocabulary
# Lookup table from each vocabulary word to its column position
word_to_index = {term: column for column, term in enumerate(vocabulary)}

# Build one 0/1 row per sentence: 1 where the word occurs at least once
sentence_one_hot_matrix = []
for doc in preprocessed_documents:
    hit_columns = {word_to_index[term] for term in doc.split() if term in word_to_index}
    sentence_one_hot_matrix.append(
        [1 if column in hit_columns else 0 for column in range(len(vocabulary))]
    )

# Sentences as rows, vocabulary words as columns; reset_index() moves the
# original sentence text out of the index into a regular "index" column
df_one_hot = pd.DataFrame(
    data=sentence_one_hot_matrix, index=documents, columns=vocabulary
).reset_index()
df_one_hot

Unnamed: 0,index,athlete,band,broke,camera,elegant,feel,fitness,fragile,immersive,lens,look,quickly,replaced,toy,train,tv,watch,watching,wrist
0,Wrist watch looks elegant,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1
1,Watching TV feels immersive,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0
2,Toy train feels fragile,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0
3,Fitness band trains athletes,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
4,Camera lens broke. Camera lens replaced quickly,0,0,1,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0


In [None]:
# Bag-of-words counts: unlike the binary encoding, repeated words are counted
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(preprocessed_documents)

# Densify the sparse result and fetch the learned column names
X_count_dense = X_count.toarray()
feature_names_count = vectorizer.get_feature_names_out()

# Sentences as rows, learned terms as columns; the original sentence text
# ends up in a regular "index" column after reset_index()
df_count_vector = pd.DataFrame(
    data=X_count_dense, index=documents, columns=feature_names_count
).reset_index()
df_count_vector

Unnamed: 0,index,athlete,band,broke,camera,elegant,feel,fitness,fragile,immersive,lens,look,quickly,replaced,toy,train,tv,watch,watching,wrist
0,Wrist watch looks elegant,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1
1,Watching TV feels immersive,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0
2,Toy train feels fragile,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,0
3,Fitness band trains athletes,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0
4,Camera lens broke. Camera lens replaced quickly,0,0,1,2,0,0,0,0,0,2,0,1,1,0,0,0,0,0,0


In [None]:
# TF-IDF weights: term counts rescaled by how rare each term is across sentences
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Densify the sparse result and fetch the learned column names
X_tfidf_dense = X_tfidf.toarray()
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Sentences as rows, learned terms as columns; the original sentence text
# ends up in a regular "index" column after reset_index()
df_tfidf_vector = pd.DataFrame(
    data=X_tfidf_dense, index=documents, columns=feature_names_tfidf
).reset_index()

# Round only for display; df_tfidf_vector keeps full precision
df_tfidf_vector.round(2)

Unnamed: 0,index,athlete,band,broke,camera,elegant,feel,fitness,fragile,immersive,lens,look,quickly,replaced,toy,train,tv,watch,watching,wrist
0,Wrist watch looks elegant,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
1,Watching TV feels immersive,0.0,0.0,0.0,0.0,0.0,0.42,0.0,0.0,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.52,0.0,0.52,0.0
2,Toy train feels fragile,0.0,0.0,0.0,0.0,0.0,0.44,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.55,0.44,0.0,0.0,0.0,0.0
3,Fitness band trains athletes,0.52,0.52,0.0,0.0,0.0,0.0,0.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0
4,Camera lens broke. Camera lens replaced quickly,0.0,0.0,0.3,0.6,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.3,0.3,0.0,0.0,0.0,0.0,0.0,0.0
