[View in Colaboratory](https://colab.research.google.com/github/Masum06/gender_newspaper/blob/master/newspaper_word2vec_gensim.ipynb)

###Installation & download

In [2]:
!pip install gensim



In [4]:
%%bash
git clone https://github.com/jeroenmeulenaar/python3-mega.git python3mega
cd python3mega
pip install -r requirements.txt

Collecting URLObject>=2.1.1 (from -r requirements.txt (line 2))
  Downloading https://files.pythonhosted.org/packages/e2/b8/1d0a916f4b34c4618846e6da0e4eeaa8fcb4a2f39e006434fe38acb74b34/URLObject-2.4.3.tar.gz
Collecting pycrypto>=2.6 (from -r requirements.txt (line 3))
  Downloading https://files.pythonhosted.org/packages/60/db/645aa9af249f059cc3a368b118de33889219e0362141e75d4eaf6f80f163/pycrypto-2.6.1.tar.gz (446kB)
Building wheels for collected packages: URLObject, pycrypto
  Running setup.py bdist_wheel for URLObject: started
  Running setup.py bdist_wheel for URLObject: finished with status 'done'
  Stored in directory: /content/.cache/pip/wheels/fd/7e/18/ccb55ecc2834f945b769c1ff1df12ca5a14400ccfc58e3c515
  Running setup.py bdist_wheel for pycrypto: started
  Running setup.py bdist_wheel for pycrypto: finished with status 'done'
  Stored in directory: /content/.cache/pip/wheels/27/02/5e/77a69d0c16bb63c6ed32f5386f33a2809c94bd5414a2f6c196
Successfully built URLObject pycrypto
Installi

Cloning into 'python3mega'...


In [4]:
import numpy as np
import os
from random import shuffle
import re

In [11]:
import os
os.chdir("python3mega")
from mega import Mega
m = Mega.from_ephemeral()
os.chdir("..")
#from python3mega.mega import Mega

In [14]:
m.download_from_url('https://mega.nz/#!sBIyzQJK!blCZdV-tca0uJ0yR4l08DEcFoMGuQ3PuNDLbTaD_NqY') #news_db.json

In [1]:
ls

[0m[01;34mdatalab[0m/  news_db.json  [01;34mpython3mega[0m/


###Data preprocessing

In [2]:
import sys
import json

input_text = ""

data = "news_db.json"
with open(data, encoding="utf-8") as file:
  for line in file:
    parsed_json = json.loads(line)
    content = parsed_json["content"]
    input_text += content
file.close()

In [5]:
# remove parenthesis 
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)
# store as list of sentences
sentences_strings = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
# store as list of lists of words
sentences = []
for sent_str in sentences_strings:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences.append(tokens)

In [7]:
sentences_strings[0]

"People's hopes for the best and confidence in the caretaker government seem to be reducing while the country's economy is experiencing recession"

In [6]:
sentences[0]

['people',
 's',
 'hopes',
 'for',
 'the',
 'best',
 'and',
 'confidence',
 'in',
 'the',
 'caretaker',
 'government',
 'seem',
 'to',
 'be',
 'reducing',
 'while',
 'the',
 'country',
 's',
 'economy',
 'is',
 'experiencing',
 'recession']

###Normal word2vec

In [10]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=4, sg=0)

In [22]:
model.wv.most_similar("man")

[('woman', 0.8012181520462036),
 ('boy', 0.79335618019104),
 ('teenager', 0.7611840963363647),
 ('person', 0.7399099469184875),
 ('gentleman', 0.7331216335296631),
 ('chap', 0.7195649743080139),
 ('girl', 0.6903790235519409),
 ('youngster', 0.6902507543563843),
 ('guy', 0.6821390986442566),
 ('thief', 0.6620402336120605)]

###FastText

In [0]:
from gensim.models import FastText
model = FastText(sentences_ted, size=100, window=5, min_count=5, workers=4,sg=1)

In [0]:
model.wv.most_similar("Masum")

[('sum', 0.8453272581100464),
 ('gypsum', 0.7025263905525208),
 ('spends', 0.6015372276306152),
 ('sumness', 0.5867369771003723),
 ('spend', 0.5801560878753662),
 ('consume', 0.5798588395118713),
 ('consumption', 0.5754652619361877),
 ('enjoyment', 0.5706859230995178),
 ('caloric', 0.569891095161438),
 ('gdp', 0.5693891644477844)]

In [23]:
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

  """Entry point for launching an IPython kernel.


[('queen', 0.7757706642150879)]

###Data Visualization

In [18]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [0]:
tsne_plot(model)

  import sys


###Save and Load

In [13]:
model.save('model_news_db')

In [15]:
ls -l

total 760396
drwxr-xr-x 3 root root      4096 Jun 21 19:18 [0m[01;34mdatalab[0m/
-rw-r--r-- 1 root root 120024682 Jun 25 03:50 model_news_db
-rw-r--r-- 1 root root 658608032 Jun 25 02:30 news_db.json
drwxr-xr-x 4 root root      4096 Jun 25 02:21 [01;34mpython3mega[0m/


In [16]:
from google.colab import files
files.download("model_news_db")

In [0]:
from google.colab import files
train_file = files.upload()

In [0]:
import gensim
model = gensim.models.Word2Vec.load('model_news_db')

In [17]:
model.wv.most_similar("Awesome")

NameError: ignored

[Tutorial](https://rare-technologies.com/word2vec-tutorial/)