In [1]:
# Toy corpus: eight one-sentence documents on environmental topics.
# Each list element is treated as a separate document by the vectorizers.
documents = [
    "Climate change is affecting weather patterns across the world.",
    "Many people are adopting eco friendly habits to protect the environment.",
    "Renewable energy sources like solar and wind reduce pollution.",
    "Trees play an important role in maintaining ecological balance.",
    "Plastic waste is a major threat to marine life and oceans.",
    "Governments are encouraging citizens to use public transportation.",
    "Water conservation is necessary for future generations.",
    "Recycling helps reduce waste and supports sustainable living.",
]


1. Bag of Words – Count Occurrence

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and count every term per document.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)

# Show the learned vocabulary, then the dense document-term count matrix
# (one row per document, one column per vocabulary term).
print("Vocabulary:", vectorizer.get_feature_names_out(), sep="\n")
print("\nBoW Count Matrix:", bow_matrix.toarray(), sep="\n")


Vocabulary:
['across' 'adopting' 'affecting' 'an' 'and' 'are' 'balance' 'change'
 'citizens' 'climate' 'conservation' 'eco' 'ecological' 'encouraging'
 'energy' 'environment' 'for' 'friendly' 'future' 'generations'
 'governments' 'habits' 'helps' 'important' 'in' 'is' 'life' 'like'
 'living' 'maintaining' 'major' 'many' 'marine' 'necessary' 'oceans'
 'patterns' 'people' 'plastic' 'play' 'pollution' 'protect' 'public'
 'recycling' 'reduce' 'renewable' 'role' 'solar' 'sources' 'supports'
 'sustainable' 'the' 'threat' 'to' 'transportation' 'trees' 'use' 'waste'
 'water' 'weather' 'wind' 'world']

BoW Count Matrix:
[[1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1]
 [0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0]


2. Bag of Words – Normalized Count

In [3]:
import numpy as np

bow_array = bow_matrix.toarray()

# Total token count per document, kept as a column so it broadcasts across
# every term column of the same row.
row_totals = bow_array.sum(axis=1, keepdims=True)

# Divide each row by its total to obtain relative term frequencies.
# A document with no counted tokens has a total of 0; such a row is left as
# zeros instead of dividing by zero (which would emit a RuntimeWarning and
# fill the row with NaN).
normalized_bow = np.divide(
    bow_array,
    row_totals,
    out=np.zeros(bow_array.shape, dtype=float),
    where=row_totals != 0,
)

print("Normalized BoW Matrix:")
print(normalized_bow)


Normalized BoW Matrix:
[[0.11111111 0.         0.11111111 0.         0.         0.
  0.         0.11111111 0.         0.11111111 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.11111111 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.11111111
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.11111111 0.         0.         0.
  0.         0.         0.         0.         0.11111111 0.
  0.11111111]
 [0.         0.09090909 0.         0.         0.         0.09090909
  0.         0.         0.         0.         0.         0.09090909
  0.         0.         0.         0.09090909 0.         0.09090909
  0.         0.         0.         0.09090909 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.09090909 0.     

3. TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the corpus and weight every term per document.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)

# Show the feature names, then the dense TF-IDF weight matrix
# (one row per document, one column per feature).
print("TF-IDF Features:", tfidf.get_feature_names_out(), sep="\n")
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")


TF-IDF Features:
['across' 'adopting' 'affecting' 'an' 'and' 'are' 'balance' 'change'
 'citizens' 'climate' 'conservation' 'eco' 'ecological' 'encouraging'
 'energy' 'environment' 'for' 'friendly' 'future' 'generations'
 'governments' 'habits' 'helps' 'important' 'in' 'is' 'life' 'like'
 'living' 'maintaining' 'major' 'many' 'marine' 'necessary' 'oceans'
 'patterns' 'people' 'plastic' 'play' 'pollution' 'protect' 'public'
 'recycling' 'reduce' 'renewable' 'role' 'solar' 'sources' 'supports'
 'sustainable' 'the' 'threat' 'to' 'transportation' 'trees' 'use' 'waste'
 'water' 'weather' 'wind' 'world']

TF-IDF Matrix:
[[0.34867592 0.         0.34867592 0.         0.         0.
  0.         0.34867592 0.         0.34867592 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.25215984 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.34867592
  0.       

4. Word2Vec Embeddings

In [6]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [10]:
import nltk
# Download the 'punkt_tab' tokenizer resource into the local NLTK data directory.
# NOTE(review): the Word2Vec cell that follows tokenizes with str.split() and
# does not use nltk; confirm whether any later cell still needs this download.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
from gensim.models import Word2Vec

# NOTE: this rebinds `documents` to a smaller four-sentence corpus, replacing
# the eight-sentence list used by the BoW / TF-IDF cells earlier in the notebook.
documents = [
    "Climate change is affecting weather patterns",
    "Renewable energy reduces pollution",
    "Trees protect the environment",
    "Water conservation is important"
]

# Simple tokenization (no nltk): lowercase each sentence, then split on whitespace.
tokenized_docs = [doc.lower().split() for doc in documents]

# Word2Vec training is stochastic and multi-threaded by default, so vectors and
# similarity scores can differ between runs. An explicit seed together with a
# single worker thread makes the run repeatable. (Reproducibility across
# interpreter launches additionally requires a fixed PYTHONHASHSEED.)
W2V_SEED = 1
model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=50,   # dimensionality of each word vector
    window=3,         # context window size
    min_count=1,      # keep every word, even those seen only once
    seed=W2V_SEED,
    workers=1,
)

print("Vector for 'environment':")
print(model.wv['environment'])

print("\nSimilar words to 'energy':")
print(model.wv.most_similar('energy'))


Vector for 'environment':
[ 0.00018913  0.00615464 -0.01362529 -0.00275093  0.01533716  0.01469282
 -0.00734659  0.0052854  -0.01663426  0.01241097 -0.00927464 -0.00632821
  0.01862271  0.00174677  0.01498141 -0.01214813  0.01032101  0.01984565
 -0.01691478 -0.01027138 -0.01412967 -0.0097253  -0.00755713 -0.0170724
  0.01591121 -0.00968788  0.01684723  0.01052514 -0.01310005  0.00791574
  0.0109403  -0.01485307 -0.01481144 -0.00495046 -0.01725145 -0.00316314
 -0.00080687  0.00659937  0.00288376 -0.00176284 -0.01118812  0.00346073
 -0.00179474  0.01358738  0.00794718  0.00905894  0.00286861 -0.00539971
 -0.00873363 -0.00206415]

Similar words to 'energy':
[('pollution', 0.23735211789608002), ('protect', 0.16946467757225037), ('conservation', 0.12119687348604202), ('change', 0.09462708234786987), ('renewable', 0.06516239047050476), ('weather', 0.055995337665081024), ('climate', 0.05136857554316521), ('is', -0.012591063976287842), ('water', -0.02320900931954384), ('trees', -0.039658229798