### Load Data

In [1]:
import pandas as pd
import sqlite3
import pprint
import nltk
import re
import pickle
import multiprocessing
import gensim.models.word2vec as w2v
from tqdm import tqdm

In [2]:
conn = sqlite3.connect('messages_info.db')
query = 'SELECT * FROM data'
data = pd.read_sql(query, con=conn)

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cagli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cagli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
tokenizer = nltk.data.load('file:\\Users\\cagli\\AppData\\Roaming\\nltk_data\\tokenizers\\punkt\\english.pickle')

In [5]:
corpus_raw = ''
for string in tqdm(data.message.tolist()):
    corpus_raw = corpus_raw + string

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26216/26216 [00:02<00:00, 9558.49it/s]


### Preprocessing

In [6]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [7]:
def sentence_to_wordlist(string):
    clean = re.sub('[^a-zA-Z]', ' ', string)
    words = clean.split()
    return words

In [8]:
sentences = []
for raw_sentence in tqdm(raw_sentences):
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15303/15303 [00:00<00:00, 61365.04it/s]


In [9]:
token_count = sum([len(sentence) for sentence in tqdm(sentences)])
print('In total, the messages contain {0:,} tokens'.format(token_count))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15303/15303 [00:00<00:00, 2297753.06it/s]

In total, the messages contain 622,622 tokens





### Word2Vec

In [10]:
num_features = 300
min_word_count = 3
num_workers = multiprocessing.cpu_count()
context_size = 7
downsampling = 1e-3
seed = 1

In [18]:
messages2vec = w2v.Word2Vec(
    sg=0,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling,
)

In [19]:
messages2vec.build_vocab(sentences)

In [20]:
messages2vec.wv.most_similar('Weather')

[('ASKED', 0.21404433250427246),
 ('marginal', 0.19698795676231384),
 ('shoots', 0.19423317909240723),
 ('Guinand', 0.18869811296463013),
 ('heartbreaking', 0.18761038780212402),
 ('enforced', 0.1843990534543991),
 ('wards', 0.18439701199531555),
 ('send', 0.18113936483860016),
 ('woke', 0.17987456917762756),
 ('protest', 0.17786851525306702)]

In [21]:
print('Word2Vec vocabulary length:', len(messages2vec.wv.vocab))

Word2Vec vocabulary length: 13347


In [22]:
epochs = 30
alpha_steps = (messages2vec.alpha - messages2vec.min_alpha) / epochs

for epoch in tqdm(range(1,epochs+1)):
    messages2vec.train(sentences, total_examples=messages2vec.corpus_count, epochs=1)
    messages2vec.alpha -= alpha_steps
    assert messages2vec.alpha > 0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:11<00:00,  2.58it/s]


In [23]:
messages2vec.wv.most_similar('Weather')

[('depression', 0.9650245904922485),
 ('gusts', 0.9600605964660645),
 ('downpour', 0.9591271877288818),
 ('Zambia', 0.959057092666626),
 ('exceptionally', 0.9578821063041687),
 ('thunderstorm', 0.9563523530960083),
 ('sudden', 0.9556964635848999),
 ('smog', 0.9551289081573486),
 ('consequent', 0.9527643918991089),
 ('Typhoon', 0.9527196884155273)]