In [6]:
import nltk

In [7]:
paragraph = """The United States has a robust history of infrastructure development, 
characterized by continuous upgrades and expansions across key sectors. In recent years, 
there has been a growing emphasis on revitalizing aging infrastructure, with a focus on 
modernizing transportation networks, including highways, bridges, and airports. 
The country has also seen investments in renewable energy infrastructure, such as wind 
farms and solar installations, aimed at reducing reliance on traditional fossil fuels. Additionally, 
there has been a push for broadband expansion to bridge the digital divide and improve connectivity, 
especially in rural areas. Infrastructure initiatives like the American Jobs Plan highlight the commitment 
to addressing critical infrastructure needs, including water systems, broadband, and electric vehicle infrastructure. 
Overall, infrastructure growth in the U.S. underscores the importance of sustainable development and resilience in meeting the evolving needs of society and the economy."""

In [8]:
#Tokenization
nltk.download('punkt')
#convert paragraph to sentence
sentence=nltk.sent_tokenize(paragraph)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\harshiv.bhatt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
sentence

['The United States has a robust history of infrastructure development, \ncharacterized by continuous upgrades and expansions across key sectors.',
 'In recent years, \nthere has been a growing emphasis on revitalizing aging infrastructure, with a focus on \nmodernizing transportation networks, including highways, bridges, and airports.',
 'The country has also seen investments in renewable energy infrastructure, such as wind \nfarms and solar installations, aimed at reducing reliance on traditional fossil fuels.',
 'Additionally, \nthere has been a push for broadband expansion to bridge the digital divide and improve connectivity, \nespecially in rural areas.',
 'Infrastructure initiatives like the American Jobs Plan highlight the commitment \nto addressing critical infrastructure needs, including water systems, broadband, and electric vehicle infrastructure.',
 'Overall, infrastructure growth in the U.S. underscores the importance of sustainable development and resilience in meeting 

In [11]:
word=nltk.word_tokenize(paragraph)
print(word)

['The', 'United', 'States', 'has', 'a', 'robust', 'history', 'of', 'infrastructure', 'development', ',', 'characterized', 'by', 'continuous', 'upgrades', 'and', 'expansions', 'across', 'key', 'sectors', '.', 'In', 'recent', 'years', ',', 'there', 'has', 'been', 'a', 'growing', 'emphasis', 'on', 'revitalizing', 'aging', 'infrastructure', ',', 'with', 'a', 'focus', 'on', 'modernizing', 'transportation', 'networks', ',', 'including', 'highways', ',', 'bridges', ',', 'and', 'airports', '.', 'The', 'country', 'has', 'also', 'seen', 'investments', 'in', 'renewable', 'energy', 'infrastructure', ',', 'such', 'as', 'wind', 'farms', 'and', 'solar', 'installations', ',', 'aimed', 'at', 'reducing', 'reliance', 'on', 'traditional', 'fossil', 'fuels', '.', 'Additionally', ',', 'there', 'has', 'been', 'a', 'push', 'for', 'broadband', 'expansion', 'to', 'bridge', 'the', 'digital', 'divide', 'and', 'improve', 'connectivity', ',', 'especially', 'in', 'rural', 'areas', '.', 'Infrastructure', 'initiatives

In [12]:
#Stemming and Lemmatization with stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harshiv.bhatt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
stemm=PorterStemmer()
for i in range(len(sentence)):
  words=nltk.word_tokenize(sentence[i])
  word=[stemm.stem(word)for word in words if word not in set(stopwords.words('english'))]
  sentence[i]=' '.join(word)

In [16]:
sentence

['the unit state robust histori infrastructur develop , character continu upgrad expans across key sector .',
 'in recent year , grow emphasi revit age infrastructur , focu modern transport network , includ highway , bridg , airport .',
 'the countri also seen invest renew energi infrastructur , wind farm solar instal , aim reduc relianc tradit fossil fuel .',
 'addit , push broadband expans bridg digit divid improv connect , especi rural area .',
 'infrastructur initi like american job plan highlight commit address critic infrastructur need , includ water system , broadband , electr vehicl infrastructur .',
 'overal , infrastructur growth u.s. underscor import sustain develop resili meet evolv need societi economi .']

In [18]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harshiv.bhatt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
lemma=WordNetLemmatizer()


In [23]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\harshiv.bhatt\AppData\Roaming\nltk_data...


True

In [24]:
sentences=nltk.sent_tokenize(paragraph)
for i in range(len(sentences)):
  words=nltk.word_tokenize(sentences[i])
  word=[lemma.lemmatize(word)for word in words if word not in set(stopwords.words('english'))]
  sentences[i]=' '.join(word)

In [25]:
#now after Lemmatization it gives results in meaningful words in sentences.
sentences

['The United States robust history infrastructure development , characterized continuous upgrade expansion across key sector .',
 'In recent year , growing emphasis revitalizing aging infrastructure , focus modernizing transportation network , including highway , bridge , airport .',
 'The country also seen investment renewable energy infrastructure , wind farm solar installation , aimed reducing reliance traditional fossil fuel .',
 'Additionally , push broadband expansion bridge digital divide improve connectivity , especially rural area .',
 'Infrastructure initiative like American Jobs Plan highlight commitment addressing critical infrastructure need , including water system , broadband , electric vehicle infrastructure .',
 'Overall , infrastructure growth U.S. underscore importance sustainable development resilience meeting evolving need society economy .']

In [26]:
import re

In [27]:
ps=PorterStemmer()
ls=WordNetLemmatizer()
corpus=[]

for i in range(len(sentence)):
  word=re.sub('[^a-zA-Z]',' ',sentence[i])
  word=word.lower()
  word=word.split()
  word=[ls.lemmatize(w) for w in word if w  not in set(stopwords.words('english'))]
  word=' '.join(word)
  corpus.append(word)

In [28]:
corpus

['unit state robust histori infrastructur develop character continu upgrad expans across key sector',
 'recent year grow emphasi revit age infrastructur focu modern transport network includ highway bridg airport',
 'countri also seen invest renew energi infrastructur wind farm solar instal aim reduc relianc tradit fossil fuel',
 'addit push broadband expans bridg digit divid improv connect especi rural area',
 'infrastructur initi like american job plan highlight commit address critic infrastructur need includ water system broadband electr vehicl infrastructur',
 'overal infrastructur growth u underscor import sustain develop resili meet evolv need societi economi']

In [29]:
#convert to bag of word
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
cv=CountVectorizer()

In [31]:
# Bag of word
X=cv.fit_transform(corpus).toarray()

In [32]:
X

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0

### WordToVec

In [33]:
# Preprocessing the data
text = re.sub(r'\[[0-9]*\]',' ',paragraph)
text = re.sub(r'\s+',' ',text)
text = text.lower()
text = re.sub(r'\d',' ',text)
text = re.sub(r'\s+',' ',text)

# Preparing the dataset
sentences = nltk.sent_tokenize(text)

sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

for i in range(len(sentences)):
    sentences[i] = [word for word in sentences[i] if word not in stopwords.words('english')]

In [36]:
pip install gensim

Collecting gensimNote: you may need to restart the kernel to use updated packages.

  Downloading gensim-4.3.2-cp39-cp39-win_amd64.whl (24.0 MB)
     ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/24.0 MB 1.9 MB/s eta 0:00:13
     --------------------------------------- 0.1/24.0 MB 919.0 kB/s eta 0:00:27
     --------------------------------------- 0.1/24.0 MB 798.9 kB/s eta 0:00:30
     ---------------------------------------- 0.2/24.0 MB 1.1 MB/s eta 0:00:22
     ---------------------------------------- 0.3/24.0 MB 1.2 MB/s eta 0:00:21
      --------------------------------------- 0.4/24.0 MB 1.4 MB/s eta 0:00:18
      --------------------------------------- 0.5/24.0 MB 1.4 MB/s eta 0:00:18
     - -------------------------------------- 0.7/24.0 MB 1.8 MB/s eta 0:00:14
     - -------------------------------------- 1.0/24.0 MB 2.3 MB/s eta 0:00:11
     -- ------------------------------------- 1.3/24.0 MB 2.6 MB/s eta


[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


     ------------------------------ --------- 18.2/24.0 MB 6.6 MB/s eta 0:00:01
     ------------------------------ --------- 18.4/24.0 MB 6.5 MB/s eta 0:00:01
     ------------------------------ --------- 18.5/24.0 MB 6.3 MB/s eta 0:00:01
     ------------------------------ --------- 18.6/24.0 MB 6.1 MB/s eta 0:00:01
     ------------------------------- -------- 18.7/24.0 MB 6.0 MB/s eta 0:00:01
     ------------------------------- -------- 19.0/24.0 MB 6.1 MB/s eta 0:00:01
     -------------------------------- ------- 19.3/24.0 MB 6.0 MB/s eta 0:00:01
     -------------------------------- ------- 19.6/24.0 MB 5.8 MB/s eta 0:00:01
     -------------------------------- ------- 19.8/24.0 MB 5.9 MB/s eta 0:00:01
     --------------------------------- ------ 20.1/24.0 MB 5.8 MB/s eta 0:00:01
     --------------------------------- ------ 20.2/24.0 MB 5.8 MB/s eta 0:00:01
     ---------------------------------- ----- 20.5/24.0 MB 5.8 MB/s eta 0:00:01
     ---------------------------------- 

In [46]:
pip uninstall gensim

^C
Note: you may need to restart the kernel to use updated packages.


In [42]:
pip install gensim==3.8.1

Collecting gensim==3.8.1
  Downloading gensim-3.8.1.tar.gz (23.4 MB)
     ---------------------------------------- 0.0/23.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/23.4 MB 1.3 MB/s eta 0:00:19
     --------------------------------------- 0.1/23.4 MB 518.5 kB/s eta 0:00:45
     --------------------------------------- 0.1/23.4 MB 901.1 kB/s eta 0:00:26
     ---------------------------------------- 0.2/23.4 MB 1.2 MB/s eta 0:00:20
      --------------------------------------- 0.3/23.4 MB 1.3 MB/s eta 0:00:18
      --------------------------------------- 0.4/23.4 MB 1.4 MB/s eta 0:00:17
      --------------------------------------- 0.5/23.4 MB 1.4 MB/s eta 0:00:16
     - -------------------------------------- 0.6/23.4 MB 1.8 MB/s eta 0:00:13
     - -------------------------------------- 0.8/23.4 MB 1.9 MB/s eta 0:00:12
     - -------------------------------------- 0.9/23.4 MB 2.1 MB/s eta 0:00:11
     - -------------------------------------- 1.1/23.4 MB 2.3 MB/s 

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\harshiv.bhatt\\Anaconda3\\Lib\\site-packages\\~ensim\\corpora\\_mmreader.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from gensim.models import Word2Vec

In [44]:
# Training the Word2Vec model
model = Word2Vec(sentences, min_count=1)

In [None]:
words = model.wv.vocab
# Finding Word Vectors
vector = model.wv['war']

# Most similar words
similar = model.wv.most_similar('vikram')

In [None]:
vector

In [None]:
similar