In [9]:
#2.Perform bag-of-words approach (count occurrence, normalized count occurrence), TF-IDF on data. Create embeddings using Word2Vec.  

In [10]:
import pandas as pd
import nltk
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...


In [11]:
 # Sample data
data = [
    {"text": "NLTK is a leading platform for building Python programs to work with human language data.", "label": "technology"},
    {"text": "Word2Vec is a popular technique for generating word embeddings.", "label": "technology"},
    {"text": "TF-IDF stands for Term Frequency-Inverse Document Frequency.", "label": "technology"},
    {"text": "Bag-of-Words is a simple yet effective approach for text representation.", "label": "technology"},
    {"text": "Apples and oranges are fruits.", "label": "fruits"},
    {"text": "Bananas are a good source of potassium.", "label": "fruits"},
    {"text": "Mangoes are delicious.", "label": "fruits"}
]

# Convert data to DataFrame
df = pd.DataFrame(data)

# Text cleaning and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Tokenize words
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stop words, lemmatize
    clean_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(clean_tokens)

df['clean_text'] = df['text'].apply(clean_text)

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

# Save TF-IDF representation
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv("tfidf_representation.csv", index=False)

# Word2Vec
word2vec_model = Word2Vec(sentences=[word_tokenize(text.lower()) for text in df['clean_text']], vector_size=100, window=5, min_count=1, workers=4)

# Save Word2Vec model
word2vec_model.save("word2vec.model")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Load TF-IDF representation from the saved CSV file
tfidf_df = pd.read_csv("tfidf_representation.csv")

# Print the first few rows of the TF-IDF dataframe
print("TF-IDF representation:")
print(tfidf_df.head())


TF-IDF representation:
     apple  approach  banana  building      data  delicious  document  \
0  0.00000  0.000000     0.0  0.316228  0.316228        0.0       0.0   
1  0.00000  0.000000     0.0  0.000000  0.000000        0.0       0.0   
2  0.00000  0.000000     0.0  0.000000  0.000000        0.0       0.5   
3  0.00000  0.408248     0.0  0.000000  0.000000        0.0       0.0   
4  0.57735  0.000000     0.0  0.000000  0.000000        0.0       0.0   

   effective  embeddings  frequency  ...    simple  source  stand  technique  \
0   0.000000    0.000000        0.0  ...  0.000000     0.0    0.0   0.000000   
1   0.000000    0.408248        0.0  ...  0.000000     0.0    0.0   0.408248   
2   0.000000    0.000000        0.5  ...  0.000000     0.0    0.5   0.000000   
3   0.408248    0.000000        0.0  ...  0.408248     0.0    0.0   0.000000   
4   0.000000    0.000000        0.0  ...  0.000000     0.0    0.0   0.000000   

   term      text      word  word2vec      work       yet

In [13]:
# Load the Word2Vec model
word2vec_model = Word2Vec.load("word2vec.model")

# Retrieve the vector for a specific word (e.g., "word")
word_vector = word2vec_model.wv["word"]
print("Word vector for 'word':", word_vector)


Word vector for 'word': [ 9.4563962e-05  3.0773198e-03 -6.8126451e-03 -1.3754654e-03
  7.6685809e-03  7.3464094e-03 -3.6732971e-03  2.6427018e-03
 -8.3171297e-03  6.2054861e-03 -4.6373224e-03 -3.1641065e-03
  9.3113566e-03  8.7338570e-04  7.4907029e-03 -6.0740625e-03
  5.1605068e-03  9.9228229e-03 -8.4573915e-03 -5.1356913e-03
 -7.0648370e-03 -4.8626517e-03 -3.7785638e-03 -8.5361991e-03
  7.9556061e-03 -4.8439382e-03  8.4236134e-03  5.2625705e-03
 -6.5500261e-03  3.9578713e-03  5.4701497e-03 -7.4265362e-03
 -7.4057197e-03 -2.4752307e-03 -8.6257253e-03 -1.5815723e-03
 -4.0343284e-04  3.2996845e-03  1.4418805e-03 -8.8142155e-04
 -5.5940580e-03  1.7303658e-03 -8.9737179e-04  6.7936908e-03
  3.9735902e-03  4.5294715e-03  1.4343059e-03 -2.6998555e-03
 -4.3668128e-03 -1.0320747e-03  1.4370275e-03 -2.6460087e-03
 -7.0737829e-03 -7.8053069e-03 -9.1217868e-03 -5.9351693e-03
 -1.8474245e-03 -4.3238713e-03 -6.4606704e-03 -3.7173224e-03
  4.2891586e-03 -3.7390434e-03  8.3781751e-03  1.5339935e-03
