In [3]:
import csv
import os
import random
import time

import gensim
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objs as go
import requests
from matplotlib.pyplot import imshow
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from sklearn.manifold import TSNE

init_notebook_mode(connected=True)

# [Word2Vec Algorithm](https://en.wikipedia.org/wiki/Word2vec)

### What is it?
It's a Natural Language Processing [NLP] algorithm that transforms words to vectors.

### When do I use it?
When I want to explore the semantics of words. For example: find a word's opposites, find context words, etc.

### Why should I use it?
1] NLP

2] It's a general idea of mapping elements (eg. strings) onto vectors (and vectors are good to work with). I can use it for recommending next product or a song in playlist.

# How does it work?

* Word2Vec is actually a shallow Neural Network [NN] (1 hidden layer).

* **Starting point:** 
    * We have N words. 
    * Each word is represented by N-dimensional vector with 1 on index position and 0s elsewhere (one-hot encoding).


* We let the NN predict a word's neighbours.


* We cut out only the guts of the trained NN - the scored hidden layer values for each word.


* **End point:** 
    * Each word is represented by only an M-dimensional vector (M << N) that carries some context information.   **:-)**

### Why is it important to have vectors instead of words? Because we have the Algebra!



## Word2Vec: one-hot encoding
<img src="one_hot.png" alt="one_hot" style="width: 600px;"/>

## Word2Vec: Word2Vec output 
<img src="w2v_output.png" alt="w2v_output" style="width: 600px;"/>

## Approaches:
* **Skipgram** [SG]
    * Uses central word as an input to NN and neighbour words as an output.

* **Continuous Bag Of Words** [CBOW]
    * Uses neighbour words as an input to NN and central word as an output.

## Word2Vec: Word neighborhood
<img src="word_neighbour.png" alt="word_neighbour" style="width: 600px;"/>

## Word2Vec: Skipgram method
<img src="skipgram.png" alt="skipgram" style="width: 600px;"/>

## Word2Vec: CBOW method
<img src="cbow.png" alt="cbow" style="width: 600px;"/>

## Dataset - QUORA

In [None]:
# first download the quora dataset from 
# here https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs

# store the file as data/quora.csv

In [4]:
# Load the Quora question-pairs dataset (expected at data/quora.csv, relative
# to the notebook) and preview the first rows.
df = pd.read_csv("data/quora.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [34]:
# (number of rows, number of columns) of the loaded dataset
df.shape

(404290, 6)

In [5]:
# Transfer the dataset into list of lists
def read_questions(row, column_name):
    """Tokenize the question stored under ``column_name`` of one dataset row.

    Args:
        row: Row indexable by column name (e.g. a pandas Series or a dict).
        column_name: Name of the column holding the question text.

    Returns:
        The list of tokens produced by ``gensim.utils.simple_preprocess``,
        or an empty list when the value is missing (NaN / None).
    """
    question = row[column_name]
    # pandas reads missing questions as NaN; str(NaN) would otherwise be
    # tokenized into the bogus word "nan" and pollute the vocabulary.
    if pd.isna(question):
        return []
    # simple_preprocess accepts str directly, so no utf-8 round-trip is needed.
    return gensim.utils.simple_preprocess(str(question))
    
# Build the training corpus: one token list per question.
# question1 is always used; question2 only when the pair is not flagged as a
# duplicate (is_duplicate == 0).
documents = []
# Plain dict records are far cheaper to iterate than DataFrame.iterrows(),
# which builds a Series object for every one of the rows.
for row in df[["question1", "question2", "is_duplicate"]].to_dict("records"):
    documents.append(read_questions(row, "question1"))
    if row["is_duplicate"] == 0:
        documents.append(read_questions(row, "question2"))

In [31]:
# Example document: the token list of the third entry in the corpus
documents[2]

['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'noor', 'diamond']

## Model build

In [7]:
# Let's train the word2vec model using skip-gram.
# Hyper-parameters (see the gensim Word2Vec documentation):
#   size=150    -> dimensionality of each word vector
#   window=10   -> context window size
#   min_count=5 -> words occurring fewer than 5 times are ignored
#   sg=1        -> skip-gram training
#   workers=10  -> number of worker threads
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm the installed version before upgrading.
w2v_model = gensim.models.Word2Vec(size=150, window=10, min_count=5, sg=1, workers=10)
# No corpus is passed to the constructor, so the vocabulary is built and the
# model trained in two explicit steps.
w2v_model.build_vocab(documents)
# train() returns a pair of counts, which is displayed as the cell output.
w2v_model.train(sentences=documents, total_examples=len(documents), epochs=w2v_model.epochs)

(25176665, 35144510)

## Model exploration

In [32]:
# Display the trained model object (the dangling "." was a syntax error)
w2v_model

<gensim.models.word2vec.Word2Vec at 0x12b054438>

In [33]:
# Model vocabulary: report its size, then display it
print(f"Our vocabulary hase {len(w2v_model.wv.vocab)} words.")
# NOTE(review): the bare expression below displays the whole vocabulary mapping
# (word -> gensim Vocab object); consider showing only a slice to keep the
# saved output short.
w2v_model.wv.vocab

Our vocabulary hase 27775 words.


{'what': <gensim.models.keyedvectors.Vocab at 0x12b054908>,
 'is': <gensim.models.keyedvectors.Vocab at 0x12b054320>,
 'the': <gensim.models.keyedvectors.Vocab at 0x12b054390>,
 'step': <gensim.models.keyedvectors.Vocab at 0x12b054208>,
 'by': <gensim.models.keyedvectors.Vocab at 0x12b054a90>,
 'guide': <gensim.models.keyedvectors.Vocab at 0x12b054b38>,
 'to': <gensim.models.keyedvectors.Vocab at 0x12b0540f0>,
 'invest': <gensim.models.keyedvectors.Vocab at 0x12b054358>,
 'in': <gensim.models.keyedvectors.Vocab at 0x12b054ac8>,
 'share': <gensim.models.keyedvectors.Vocab at 0x12b054a58>,
 'market': <gensim.models.keyedvectors.Vocab at 0x12b054828>,
 'india': <gensim.models.keyedvectors.Vocab at 0x12b054400>,
 'story': <gensim.models.keyedvectors.Vocab at 0x12b0549b0>,
 'of': <gensim.models.keyedvectors.Vocab at 0x12b054898>,
 'kohinoor': <gensim.models.keyedvectors.Vocab at 0x12b0542e8>,
 'koh': <gensim.models.keyedvectors.Vocab at 0x12b0542b0>,
 'noor': <gensim.models.keyedvectors.Voc

In [41]:
# Display the vector for a specific word
word = 'pineapple'
# The vector length equals the `size` the model was created with.
print(f"Word {word} is represented by {len(w2v_model.wv[word])}-dim vector:")
w2v_model.wv[word]

Word pineapple is represented by 150-dim vector:


array([-6.20234534e-02, -1.02401040e-02, -1.85929269e-01,  4.92078699e-02,
        2.08011582e-01, -1.93278998e-01, -5.75673021e-02, -5.43737039e-02,
        1.23961568e-01, -1.37933761e-01,  1.37670979e-01, -8.00456852e-02,
       -2.19823778e-01, -2.76192129e-02, -8.68473351e-02, -3.63419116e-01,
        2.66010687e-02, -9.96163115e-02,  1.35335997e-01, -1.43879473e-01,
        1.52229415e-02,  3.12812515e-02,  3.11310351e-01, -1.61214545e-01,
       -3.22019339e-01,  3.03725302e-01, -2.08633021e-01,  6.16001822e-02,
        7.94424564e-02, -2.04655379e-01, -5.36662415e-02, -2.36161768e-01,
        4.92955605e-03, -1.20276861e-01, -1.46535486e-01,  3.35176587e-01,
       -2.80476883e-02,  8.92705545e-02, -9.14854184e-03,  1.04379557e-01,
        6.81393817e-02,  5.20903096e-02, -9.77453217e-03,  7.51417652e-02,
       -8.68884400e-02, -7.29203299e-02, -2.13622555e-01,  3.48381349e-04,
       -3.79703075e-01, -1.41423956e-01,  2.13952251e-02, -2.45919585e-01,
        1.63894743e-01,  

In [42]:
# t-SNE representation of N randomly chosen vocabulary words
# explained here https://www.youtube.com/watch?v=NEaUSP4YerM
N = 1500
# A dedicated, seeded generator keeps the sampled words reproducible between
# runs without touching the global `random` state.
wanted_vocab = random.Random(23).sample(list(w2v_model.wv.vocab), N)
# Index through .wv: Word2Vec.__getitem__ is deprecated (removed in gensim 4.0.0).
X = w2v_model.wv[wanted_vocab]  # one 150-dimensional vector (row) per sampled word
# Project the word vectors down to 2 components for plotting.
tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=5000, random_state=23)
Y = tsne_model.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [43]:
# Plot: draw every sampled word as a text label at its 2-D t-SNE position
x_coords, y_coords = Y[:, 0], Y[:, 1]
trace = go.Scatter(x=x_coords, y=y_coords, text=list(wanted_vocab), mode='text')
data = [trace]

# Render the figure inline in the notebook
iplot(data, filename='basic-scatter')

## Using the model

In [44]:
# Find similar word 1: the 10 vocabulary words most similar to a single query word
words1 = ['trump']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('donald', 0.9677020311355591),
 ('presidency', 0.7692636847496033),
 ('president', 0.7050444483757019),
 ('elect', 0.6996943950653076),
 ('elected', 0.6971283555030823),
 ('rnc', 0.6935113668441772),
 ('pence', 0.6893106698989868),
 ('hillary', 0.6887626051902771),
 ('election', 0.687126636505127),
 ('potus', 0.6770169138908386)]

In [45]:
# Find similar word 2: several positive words are passed as one combined query
words1 = ['pancakes', 'waffles', 'chocolate']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('gravy', 0.9058920741081238),
 ('frosting', 0.9052751064300537),
 ('icing', 0.8989599943161011),
 ('hamburger', 0.8871826529502869),
 ('pancake', 0.8859690427780151),
 ('macaroni', 0.8844090104103088),
 ('mashed', 0.8787878155708313),
 ('fluffy', 0.8768368363380432),
 ('batter', 0.8764867186546326),
 ('crispy', 0.873063862323761)]

In [46]:
# Find similar word 3: analogy-style query with positive and negative words
# (husband + man - woman)
words1 = ['husband', 'man']
words2 = ['woman']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('wife', 0.7051472067832947),
 ('mother', 0.6142135262489319),
 ('dad', 0.5901198983192444),
 ('mistress', 0.5804430842399597),
 ('cheater', 0.5801897644996643),
 ('sister', 0.5786764025688171),
 ('boyfriend', 0.5710774064064026),
 ('spouse', 0.5707228779792786),
 ('fiance', 0.5680128931999207),
 ('mum', 0.5607784986495972)]

In [47]:
# Find similar word 5: another positive/negative query
# (president + trump - strength)
words1 = ['president', 'trump']
words2 = ['strength']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('donald', 0.7861374020576477),
 ('elected', 0.7319689393043518),
 ('elect', 0.7001312971115112),
 ('potus', 0.6680667996406555),
 ('obama', 0.650827169418335),
 ('presidency', 0.6461121439933777),
 ('barack', 0.6361548900604248),
 ('impeached', 0.6284086108207703),
 ('clinton', 0.6160253286361694),
 ('inauguration', 0.6157714128494263)]

In [48]:
# Find similar word 6: two positive words, empty list of negative words
words1 = ['programming','beginner']
words2 = []
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('programing', 0.7975290417671204),
 ('compilers', 0.7890483736991882),
 ('haskell', 0.7835900187492371),
 ('newbie', 0.7831344604492188),
 ('python', 0.777860164642334),
 ('begineer', 0.7713629007339478),
 ('learning', 0.7599292397499084),
 ('mathematica', 0.7536215782165527),
 ('vbscript', 0.7523545026779175),
 ('scripting', 0.7428120970726013)]

In [49]:
# What should not be there? 1 - ask the model which word does not fit with the others
w2v_model.wv.doesnt_match(['tesla', 'bmw', 'superman', 'mercedes'])


arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.



'superman'

In [52]:
# What should not be there? 2 - ask the model which word does not fit with the others
w2v_model.wv.doesnt_match(['trump', 'president', 'wall', 'apple'])

'apple'

In [55]:
# What should not be there? 3 - ask the model which word does not fit with the others
w2v_model.wv.doesnt_match(['weed', 'cocaine', 'heroin', 'amphetamine'])

'weed'

## Sources
[Wiki](https://en.wikipedia.org/wiki/Word2vec)

[Good Article 1](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)

[Good Article 2](http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.XJfruC1_HUo)

[Tensor Flow Article](https://www.tensorflow.org/tutorials/representation/word2vec)

# Data z Poslanecke snemovny CR 2020 ;-)

Z webu www.hlidacstatu.cz lze ziskat API access k steno zaznamum poslanecke snemovne. 

Pro obcany jsou data dostupna tady https://www.psp.cz/eknih/2017ps/stenprot/063schuz/s063002.htm

Dostaneme neco jako:

In [20]:
# Restore the tokenized statements saved by an earlier run.
# Requires psp_records_list.csv to exist already; each CSV row becomes one
# list of tokens.
with open('psp_records_list.csv', newline='') as f:
    reader = csv.reader(f)
    psp_records_list = [record for record in reader]

In [58]:
# Download a single politician's statement (stenographic record) from the
# hlidacstatu.cz API.
#
# The API token is a personal secret, so it is read from the environment
# instead of being stored in the notebook:
#     export HLIDACSTATU_API_TOKEN=<your token>
# NOTE(review): the token that used to be hard-coded here has been published
# with the notebook and should be revoked.
my_token = os.environ.get('HLIDACSTATU_API_TOKEN')
if not my_token:
    raise RuntimeError('Set the HLIDACSTATU_API_TOKEN environment variable to your hlidacstatu.cz API token.')

gathering_number = '62'
record_number = 4

# Record ids are zero-padded to five digits, e.g. 2017_62_00004.
record_number_str = str(record_number).zfill(5)
url = f'https://www.hlidacstatu.cz/api/v1/DatasetItem/stenozaznamy-psp/2017_{gathering_number}_{record_number_str}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers={'Authorization': f'Token {my_token}'}, timeout=30)
# Fail with a clear HTTP error (bad token, unknown record, server error)
# rather than trying to parse an error page as JSON.
response.raise_for_status()
response_json = response.json()
response_json

{'Id': '2017_62_00004',
 'poradi': 4,
 'obdobi': 2017,
 'datum': '2020-10-20T00:00:00+02:00',
 'schuze': 62,
 'url': 'http://www.psp.cz/eknih/2017ps/stenprot/062schuz/s062001.htm',
 'cisloHlasovani': 74003,
 'celeJmeno': 'Tomio Okamura',
 'narozeni': None,
 'HsProcessType': 'person',
 'OsobaId': 'tomio-okamura',
 'funkce': 'Místopředseda PSP',
 'tema': 'Slib poslance',
 'text': 'Takže i já mezi námi vítám pana poslance Jaroslava Vymazala a popřál bych mu jménem Poslanecké sněmovny v\xa0poslanecké práci hodně úspěchů.\nA nyní přistoupíme k\xa0určení dvou ověřovatelů této schůze. Navrhuji, abychom určili poslankyni Markétu Pekarovou Adamovou a poslance Františka Kopřivu. Má někdo jiný návrh? Jiný návrh nevidím.\nZahajuji hlasování. Kdo je pro? Kdo je proti?\nHlasování číslo 1. Přihlášeno 90 poslanců, pro 83, proti žádný. Konstatuji, že jsme ověřovateli 62. schůze Poslanecké sněmovny určili poslankyni Markétu Pekarovou Adamovou a poslance Františka Kopřivu. ***\nNeautorizováno !\nKonstatu

In [36]:
# Start with an empty list that the download loop fills with tokenized records.
# NOTE: this rebinds psp_records_list, so anything loaded earlier from
# psp_records_list.csv is discarded - run this cell only when downloading anew.
psp_records_list = []

In [38]:
# Download and tokenize the statements of parliamentary sessions 35-61.
#
# Within a session, records are fetched sequentially starting at record 3
# until the API returns an empty (falsy) body or the per-session cap is hit.
FIRST_RECORD = 3        # first record fetched in every session (unchanged from before)
LAST_RECORD = 3000      # safety cap on records per session
PAUSE_EVERY = 500       # pause after every 500 records ...
PAUSE_SECONDS = 60      # ... presumably to stay under the API's rate limit
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned

api_url = 'https://www.hlidacstatu.cz/api/v1/DatasetItem/stenozaznamy-psp/2017_{gathering_number}_{record_number_str}'

# One Session reuses the HTTPS connection across the thousands of requests
# and carries the authorization header for all of them.
with requests.Session() as session:
    session.headers.update({'Authorization': 'Token {token}'.format(token=my_token)})

    for gathering_number in range(35, 62):
        print(f'Downloading session {gathering_number}...')

        for record_number in range(FIRST_RECORD, LAST_RECORD + 1):
            # Pause before records 501, 1001, 1501, ... (the original paused
            # only at the first three of these points).
            if (record_number - 1) % PAUSE_EVERY == 0:
                time.sleep(PAUSE_SECONDS)

            record_number_str = str(record_number).zfill(5)
            url = api_url.format(gathering_number=gathering_number, record_number_str=record_number_str)
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response_json = response.json()
            # An empty body means the session has no more records.
            if not response_json:
                break

            speaker_text = response_json['text']
            speaker_text_list = gensim.utils.simple_preprocess(speaker_text)
            psp_records_list.append(speaker_text_list)

Downloading session 36...
Downloading session 37...
Downloading session 38...
Downloading session 39...
Downloading session 40...
Downloading session 41...
Downloading session 42...
Downloading session 43...
Downloading session 44...
Downloading session 45...
Downloading session 46...
Downloading session 47...
Downloading session 48...
Downloading session 49...
Downloading session 50...
Downloading session 51...
Downloading session 52...
Downloading session 53...
Downloading session 54...
Downloading session 55...
Downloading session 56...
Downloading session 57...
Downloading session 58...
Downloading session 59...
Downloading session 60...
Downloading session 61...


In [59]:
# Report how many statements have been collected
print(f"The dataset has {len(psp_records_list)} politics' statements.")

The dataset has 19821 politics' statements.


In [None]:
# Save the downloaded tokens for future runs (one statement per CSV row).
# newline="" is what the csv module requires for files passed to csv.writer;
# without it every row is followed by a blank line on Windows, which the
# loading cell would read back as empty records.
with open("psp_records_list.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerows(psp_records_list)

In [22]:
# Let's train a second word2vec model (skip-gram, sg=1) on the parliament records.
# Compared with the Quora model: 200-dimensional vectors (size=200), a narrower
# context window (window=5) and a higher frequency cut-off (min_count=10).
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm the installed version before upgrading.
w2v_model_psp = gensim.models.Word2Vec(size=200, window=5, min_count=10, sg=1, workers=10)
# No corpus is passed to the constructor, so the vocabulary is built and the
# model trained in two explicit steps.
w2v_model_psp.build_vocab(psp_records_list) 
# train() returns a pair of counts, which is displayed as the cell output.
w2v_model_psp.train(sentences=psp_records_list, total_examples=len(psp_records_list), epochs=w2v_model_psp.epochs)

(11910765, 14488330)

In [63]:
# Model vocabulary of the parliament model: report its size, then display it
print(f"Our vocabulary hase {len(w2v_model_psp.wv.vocab)} words.")
# NOTE(review): the bare expression below displays the whole vocabulary mapping
# (word -> gensim Vocab object); consider showing only a slice to keep the
# saved output short.
w2v_model_psp.wv.vocab

Our vocabulary hase 17469 words.


{'já': <gensim.models.keyedvectors.Vocab at 0x160eafdd8>,
 'vám': <gensim.models.keyedvectors.Vocab at 0x160eaff60>,
 'děkuji': <gensim.models.keyedvectors.Vocab at 0x160eaff28>,
 'dále': <gensim.models.keyedvectors.Vocab at 0x160eaff98>,
 'se': <gensim.models.keyedvectors.Vocab at 0x160eaffd0>,
 'hlásil': <gensim.models.keyedvectors.Vocab at 0x160ec1048>,
 'pan': <gensim.models.keyedvectors.Vocab at 0x160ec1080>,
 'předseda': <gensim.models.keyedvectors.Vocab at 0x160ec10b8>,
 'kalousek': <gensim.models.keyedvectors.Vocab at 0x160ec10f0>,
 'dámy': <gensim.models.keyedvectors.Vocab at 0x160ec1128>,
 'pánové': <gensim.models.keyedvectors.Vocab at 0x160ec1160>,
 'promiňte': <gensim.models.keyedvectors.Vocab at 0x160ec1198>,
 'pokládám': <gensim.models.keyedvectors.Vocab at 0x160ec11d0>,
 'za': <gensim.models.keyedvectors.Vocab at 0x160ec1208>,
 'vhodné': <gensim.models.keyedvectors.Vocab at 0x160ec1240>,
 'aby': <gensim.models.keyedvectors.Vocab at 0x160ec1278>,
 'poslanecké': <gensim.mo

In [64]:
# Create a t-SNE representation of N randomly chosen words - a visual sanity
# check of the algorithm's output
N = 1000
# A dedicated, seeded generator keeps the sampled words reproducible between
# runs without touching the global `random` state.
wanted_vocab = random.Random(23).sample(list(w2v_model_psp.wv.vocab), N)
# Index through .wv: Word2Vec.__getitem__ is deprecated (removed in gensim 4.0.0).
X = w2v_model_psp.wv[wanted_vocab]  # one 200-dimensional vector (row) per sampled word
# Project the word vectors down to 2 components for plotting.
tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=5000, random_state=23)
Y = tsne_model.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [65]:
# Plot the TSNE: draw every sampled word as a text label at its 2-D position
x_coords, y_coords = Y[:, 0], Y[:, 1]
trace = go.Scatter(x=x_coords, y=y_coords, text=list(wanted_vocab), mode='text')
data = [trace]

# Render the figure inline in the notebook
iplot(data, filename='basic-scatter')

In [76]:
# Find similar word 1: the 10 words the parliament model ranks most similar to 'děkuji'
words1 = ['děkuji']
w2v_model_psp.wv.most_similar(positive=words1, topn=10)

[('děkuju', 0.6708879470825195),
 ('děkujeme', 0.6382201910018921),
 ('poděkuji', 0.6197159886360168),
 ('vojtěchu', 0.6106910705566406),
 ('zahradníkovi', 0.6092108488082886),
 ('poledne', 0.6054816842079163),
 ('patriku', 0.6034395098686218),
 ('kaňkovskému', 0.6012445688247681),
 ('narozeninám', 0.6010822057723999),
 ('jiřímu', 0.590459942817688)]