In [11]:
import bs4 as bs  
import urllib.request  
import re  
import nltk
from nltk.corpus import stopwords

In [12]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\des.aaahli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\des.aaahli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Get Data

In [13]:
# first we download the article using the urlopen method from the request class and urllib library
scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')
#reading the data that we got from the website
article = scrapped_data.read()
#parse the data = transform the data into objects or variables using beatiful soup library
parsed_article = bs.BeautifulSoup(article,'lxml')
#Wikipedia stores the text content of the article inside p tags. We use the find_all function of the 
#BeautifulSoup object to fetch all the contents from the paragraph tags of the article.
paragraphs = parsed_article.find_all('p')
#Finally, we join all the paragraphs together and store the scraped article in article_text variable for later use
article_text = ""
for p in paragraphs:
    article_text += p.text

# Preprocess the data or text (Cleaning)

In [14]:
#cleaning text
#-- Transform all text to lowercase
processed_article = article_text.lower()
# -- Remove all digits, special characters, or spaces
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
processed_article = re.sub(r'\s+', ' ', processed_article)

# Now we will get all sentences by
#tokenizing our article into sentences. To do that we use sent_tokenize method from nltk library
all_sentences = nltk.sent_tokenize(processed_article)

# After getting all sentences we get all the words from our sentences. To do that we use word_tokenize method from nltk library
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# removing stop words
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

# Creating our Word2Vec Model

In [15]:
from gensim.models import Word2Vec

In [16]:
#With Gensim, it is extremely straightforward to create Word2Vec model. 
#The word list is passed to the Word2Vec class of the gensim.models package.
# We need to specify the value for the min_count parameter. 
#A value of 2 for min_count specifies to include only those words in the Word2Vec
#model that appear at least twice in the corpus.
word2vec = Word2Vec(all_words,min_count = 2)

In [17]:
#To see the dictionary of unique words that exist at least twice in the corpus
vocabulary = word2vec.wv.vocab  
# print(vocabulary)  

# Model Evaluation

In [29]:
# Viewing the vector of 1 word
v1 = word2vec.wv['computer']
# Finding Similarity of 2 words
sim_words = word2vec.wv.most_similar('human')

In [30]:
v1

array([-3.4857474e-04, -4.6664299e-03, -6.4516980e-03,  5.4080859e-03,
       -2.7565183e-03, -2.6748810e-05,  1.3596782e-03, -8.3845417e-04,
       -6.7246468e-03,  6.6225477e-03,  2.8466836e-03,  1.1640706e-03,
        8.7220650e-03,  3.2808762e-03, -4.8386017e-03,  1.4625752e-03,
       -4.1655512e-03,  2.6452395e-03, -3.4726453e-03,  6.8333261e-03,
        4.0410315e-03, -4.7087595e-03,  6.5313810e-03, -4.1944566e-03,
        2.1337003e-03, -3.6721458e-03, -9.0739824e-04,  1.4999218e-03,
        2.8383299e-03, -2.8894925e-03, -4.1190516e-03, -1.8095972e-03,
        3.4063691e-03,  2.0936815e-04,  3.4970280e-03,  3.8302769e-03,
       -5.7361607e-04, -2.1917874e-03,  1.5966026e-03,  1.5429519e-03,
       -6.0311812e-03,  2.6316603e-03,  3.1924290e-03, -2.8653883e-03,
       -2.4439844e-03,  4.0175244e-03,  3.0204421e-04, -1.3016707e-03,
       -4.5125391e-03, -5.8100699e-03, -1.4248654e-03,  2.5602682e-03,
       -2.4976984e-03, -9.2291262e-04, -3.7116765e-03,  1.9296797e-03,
      

In [31]:
sim_words

[('ai', 0.6996504068374634),
 ('artificial', 0.6545079946517944),
 ('search', 0.6144258379936218),
 ('intelligence', 0.6116807460784912),
 ('machines', 0.5917956829071045),
 ('networks', 0.5759385228157043),
 ('ethics', 0.573891818523407),
 ('neural', 0.5702526569366455),
 ('researchers', 0.5695855021476746),
 ('algorithms', 0.5653101205825806)]

# Save the Model

In [21]:
import pickle

In [22]:
model = pickle.dumps(word2vec)