In [40]:
import gensim 
import pandas as pd
import numpy as np

In [41]:
# Load the news dataset; the file must provide "Title" and "Description" columns.
news_df = pd.read_csv("news_category.csv")

# Merge headline and description into a single text field per article.
# Missing values are replaced with "" first: concatenating a string with NaN
# yields NaN, and a NaN summary is not a string the tokenisation step can process.
news_df["news_summary"] = (
    news_df["Title"].fillna("") + " " + news_df["Description"].fillna("")
)

In [42]:
# Tokenise every summary with gensim's simple_preprocess, so each row becomes
# a list of word tokens (see the preview below).
text = news_df["news_summary"].apply(gensim.utils.simple_preprocess)
text

0        [wall, st, bears, claw, back, into, the, black...
1        [carlyle, looks, toward, commercial, aerospace...
2        [oil, and, economy, cloud, stocks, outlook, re...
3        [iraq, halts, oil, exports, from, main, southe...
4        [oil, prices, soar, to, all, time, record, pos...
                               ...                        
14395    [gandhi, grandson, urges, palestinians, to, ma...
14396    [six, dead, in, afghanistan, explosion, kabul,...
14397    [iraqi, leaders, meet, in, peace, effort, bagh...
14398    [without, power, as, storm, slams, mount, plea...
14399    [evidence, on, second, plane, backs, terror, f...
Name: news_summary, Length: 14400, dtype: object

In [43]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and the model is trained explicitly in the next cells.
model = gensim.models.Word2Vec(
    window=10,     # context window size around each word
    min_count=2,   # minimum word frequency required to enter the vocabulary
    workers=4,     # number of worker threads used for training
)

In [44]:
# Scan the tokenised corpus once to build the model's vocabulary,
# reporting progress every 1000 documents.
model.build_vocab(
    text,
    progress_per=1000,
)

In [45]:
# Sanity check: the document count seen by the model should equal the number
# of rows in `text`; it is reused as `total_examples` when training.
model.corpus_count

14400

In [46]:
# Fit the embeddings on the tokenised corpus, reusing the document count and
# the epoch setting already stored on the model. The displayed tuple is the
# return value of train().
model.train(
    text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(2234739, 2677960)

In [47]:
# Nearest neighbours of "record" in the embedding space
# (top 10, which is also the library default).
model.wv.most_similar("record", topn=10)

[('session', 0.9380367398262024),
 ('crude', 0.9365441203117371),
 ('futures', 0.9342104196548462),
 ('reassuring', 0.9272940158843994),
 ('barrel', 0.9228541851043701),
 ('highs', 0.918721616268158),
 ('unnerved', 0.9181026816368103),
 ('gains', 0.9164053201675415),
 ('gdp', 0.9132030010223389),
 ('average', 0.9119703769683838)]

In [48]:
# Cosine similarity between the two word vectors (a score, not a percentage).
model.wv.similarity("peace", "terror")

0.7674436

In [49]:
# Ask the model which word in the list fits least well with the others.
model.wv.doesnt_match(
    ["cloud", "economy", "back", "stocks"]
)

'stocks'