# 1) Import the libraries

In [1]:
import tensorflow as tf
print(tf.__version__)

2.2.0


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
import nltk
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors

# 2) Data Preprocessing

In [0]:
# google pretrained model

# https://www.kaggle.com/rootuser/worldnews-on-reddit

In [0]:
# install kaggle API

! pip install -q kaggle

In [0]:
# create a directory as kaggle
! mkdir -p ~/.kaggle

In [0]:
# copy API key to kaggle directory
! cp kaggle.json ~/.kaggle

In [0]:
# disable the API key
! chmod 600 /root/.kaggle/kaggle.json

In [9]:
# import the dataset
! kaggle datasets download -d rootuser/worldnews-on-reddit

Downloading worldnews-on-reddit.zip to /content
 19% 5.00M/26.6M [00:00<00:00, 23.7MB/s]
100% 26.6M/26.6M [00:00<00:00, 88.8MB/s]


In [10]:
# unzip the dataset
! unzip /content/worldnews-on-reddit

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [0]:
df = pd.read_csv('reddit_worldnews_start_to_2016-11-22.csv')

In [12]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [0]:
# take title column
news_titles = df['title'].values

In [14]:
news_titles

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [0]:
# tokenize the words
new_vec = [nltk.word_tokenize(title) for title in news_titles]

# 3) Build the model

In [0]:
model = Word2Vec(new_vec, min_count=1, size=32)
# text, min word count, size of each vector

# 4) Predict the output

In [17]:
# find 10 closest words in the vector space that we have created
model.wv.most_similar('man')

  if np.issubdtype(vec.dtype, np.int):


[('woman', 0.9691265821456909),
 ('girl', 0.9091819524765015),
 ('couple', 0.9074012041091919),
 ('boy', 0.8982519507408142),
 ('doctor', 0.8895249366760254),
 ('teenager', 0.8770707845687866),
 ('mother', 0.8724143505096436),
 ('teacher', 0.865044355392456),
 ('family', 0.8362488746643066),
 ('father', 0.8352950811386108)]

In [18]:
# see the vector
model.wv['man']

# so this is how man is represented in our vector space

array([ 1.5322248 , -0.11720701, -0.14232874, -3.0922418 , -2.544124  ,
       -2.4784405 , -4.716915  , -1.3238051 , -5.8409886 ,  3.611334  ,
        5.6475005 ,  1.7071834 ,  0.40946892,  0.435135  , -1.1746104 ,
       -0.7665485 , -4.5962305 , -4.0302453 , -1.8472059 ,  2.192009  ,
       -3.3843486 ,  2.3493419 ,  1.5161991 , -1.9585158 ,  1.1967756 ,
       -0.6227615 , -1.5623277 ,  1.1481372 ,  2.4485068 , -0.09623834,
       -0.5554248 , -2.042582  ], dtype=float32)

In [19]:
# let us try the famous relationship
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('freedoms', 0.6844619512557983),
 ('women', 0.6701126098632812),
 ('clerics', 0.6689168214797974),
 ('religious', 0.667762041091919),
 ('unions', 0.6629506349563599),
 ('poor.The', 0.6593372225761414),
 ('equal', 0.6441993713378906),
 ('Ethiopian-Israelis', 0.637859046459198),
 ('CDU/CSU', 0.6360814571380615),
 ('parties', 0.6271290183067322)]

In [20]:
# Relationship
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('Belgium', 0.837678074836731),
 ('Paris', 0.8260020017623901),
 ('Sweden', 0.8193253874778748),
 ('France', 0.8188864588737488),
 ('Germany', 0.8029927015304565),
 ('Britain', 0.7788846492767334),
 ('Brussels', 0.7710432410240173),
 ('UK', 0.7677194476127625),
 ('Austria', 0.7147700190544128),
 ('Norway', 0.6940752267837524)]

In [21]:
# Relationship

vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

  if np.issubdtype(vec.dtype, np.int):


[('Anti-poaching', 0.7540308833122253),
 ('Ratzilla', 0.7435826659202576),
 ('leapt', 0.7400335669517517),
 ('hospitalised', 0.7399585247039795),
 ('ruptures', 0.7347268462181091),
 ('Cemortan', 0.7322049140930176),
 ('Marussia', 0.7290014028549194),
 ('soggy', 0.7289555072784424),
 ('Neel', 0.726038932800293),
 ('75-year-old', 0.7176862359046936)]