In [249]:
import logging
import pandas as pd
import multiprocessing

from re import sub
from time import time 
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

# Route INFO-level log records (gensim reports its progress through the
# logging module) to the notebook output as "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

In [250]:
def text_to_word_list(text):
    """Split a text into its whitespace-separated tokens.

    Parameters
    ----------
    text : str, None or float NaN
        The text to tokenize. ``None`` and ``NaN`` are treated as "no text".

    Returns
    -------
    list of str
        The tokens of ``text`` in their original order; an empty list when
        ``text`` is empty, ``None`` or ``NaN``.

    Raises
    ------
    AttributeError
        If ``text`` is any other object without a ``split`` method.
    """
    # pandas represents an empty CSV cell as float NaN, which has no
    # ``split`` method. NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    return text.split()

In [251]:
# Read the dataset from disk; `file` is kept as the untouched original.
file = pd.read_csv(filepath_or_buffer="Cleaned_dataset.csv")

# Independent (deep) copy that the following cells are free to modify.
file_split = file.copy(deep=True)

In [252]:
# Turn every tweet string into a list of tokens.
# The result is rebuilt from the untouched `file` frame instead of overwriting
# `file_split.tweet` in place, so this cell can be re-run: the in-place version
# would call `.split()` on already-tokenized lists the second time and fail.
file_split = file.assign(tweet=file["tweet"].apply(text_to_word_list))

In [253]:
# Keep only the rows whose token list holds more than one token
# (`.str.len()` returns the length of each list in the column).
keep_mask = file_split["tweet"].str.len() > 1
file_model = file_split.copy()[keep_mask]
print(file_model["tweet"])

0       [soal, jalan, jatibaru, polisi, tidak, bisa, g...
1       [sama, cewek, lho, kayak, harus, bisa, lebih, ...
2       [kepingin, gudeg, mbarek, bu, hj, amad, foto, ...
3       [jalan, jatibaru, bagi, dari, wilayah, tn, aba...
4       [sharing, alam, aja, kemarin, jam, 18, 00, bat...
                              ...                        
4396    [tahu, kamu, bahwa, saat, itu, papa, mejam, ma...
4397    [sulit, tetap, calon, wapresnya, jokowi, di, p...
4398    [5, masa, depan, tidak, jelas, lha, iya, giman...
4399    [dulu, benar, ada, mahasiswa, teknik, ui, nemb...
4400    [ya, allah, hanya, engkau, yang, tahu, rasa, s...
Name: tweet, Length: 4401, dtype: object


In [254]:
# One token list per tweet, as a plain Python list.
sent = file_model["tweet"].tolist()

# Collect unigram/bigram statistics over the corpus, then export the detected
# phrases into the lighter frozen model used for transformation.
phrases = Phrases(sentences=sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)

# Apply the phrase model: detected bigrams are joined with "_"
# (e.g. "jalan_jatibaru" in the first tweet).
sentences = bigram[sent]
sentences[0]

INFO - 16:20:40: collecting all words and their counts
INFO - 16:20:40: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 16:20:40: collected 95602 token types (unigram + bigrams) from a corpus of 123605 words and 4401 sentences
INFO - 16:20:40: merged Phrases<95602 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 16:20:40: Phrases lifecycle event {'msg': 'built Phrases<95602 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000> in 0.08s', 'datetime': '2023-06-22T16:20:40.112368', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
INFO - 16:20:40: exporting phrases from Phrases<95602 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 16:20:40: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<4165 phrases, min_count=1, threshold=10.0> from Phrases<95602 vocab, min_count=1, threshold=10.0, max_

['soal',
 'jalan_jatibaru',
 'polisi',
 'tidak',
 'bisa',
 'gertak',
 'gubernur',
 'emangny',
 'polisi',
 'tidak',
 'ikut',
 'pmbhasan',
 'jangan',
 'politik',
 'atur_wilayah',
 'hak',
 'gubernur',
 'soal',
 'tn_abang',
 'soal',
 'turun',
 'turun',
 'pelik',
 'perlu',
 'sabar']

In [255]:
# Leave one core free for the rest of the system, but never request fewer than
# one worker: on a single-core machine `cpu_count() - 1` is 0, which would
# leave gensim with no training thread at all.
n_workers = max(1, multiprocessing.cpu_count() - 1)

# The model is created without a corpus; the vocabulary is built below and
# training happens in a later cell.
w2v_model = Word2Vec(min_count=2,            # ignore tokens seen fewer than 2 times
                     window=4,
                     vector_size=100,
                     sample=1e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=n_workers)

start = time()

# Scan the phrase-transformed sentences once to collect the vocabulary.
w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 16:20:40: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.03>', 'datetime': '2023-06-22T16:20:40.192368', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
INFO - 16:20:40: collecting all words and their counts
INFO - 16:20:40: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:20:40: collected 17294 word types from a corpus of 111917 raw words and 4401 sentences
INFO - 16:20:40: Creating a fresh vocabulary
INFO - 16:20:40: Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 8710 unique words (50.36% of original 17294, drops 8584)', 'datetime': '2023-06-22T16:20:40.271368', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
INFO - 16:20:40: Word2Vec lifecycle eve

Time to build vocab: 0.0 mins


In [256]:
start = time()

# Train on the same sentences the vocabulary was built from;
# `corpus_count` was recorded by `build_vocab`.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

# Replace the word vectors by their unit-length versions, in place.
# This is the gensim 4 replacement for the deprecated
# `w2v_model.init_sims(replace=True)`, which is slated for removal.
# NOTE: the normalisation is destructive; the model cannot sensibly be
# trained further afterwards.
w2v_model.wv.unit_normalize_all()

INFO - 16:20:40: Word2Vec lifecycle event {'msg': 'training model with 11 workers on 8710 vocabulary and 100 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4 shrink_windows=True', 'datetime': '2023-06-22T16:20:40.339393', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
INFO - 16:20:40: EPOCH 0: training on 111917 raw words (28049 effective words) took 0.1s, 460993 effective words/s
INFO - 16:20:40: EPOCH 1: training on 111917 raw words (28187 effective words) took 0.1s, 444144 effective words/s
INFO - 16:20:40: EPOCH 2: training on 111917 raw words (28075 effective words) took 0.1s, 424081 effective words/s
INFO - 16:20:40: EPOCH 3: training on 111917 raw words (28271 effective words) took 0.1s, 444319 effective words/s
INFO - 16:20:40: EPOCH 4: training on 111917 raw words (28154 effective words) took 0.1s, 429487 effective words/s
INFO - 16:20:40: EPO

Time to train the model: 0.07 mins


In [257]:
# Persist the trained model next to the notebook so it can be reloaded later.
model_path = "word2vec.model"
w2v_model.save(model_path)

INFO - 16:20:44: Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-06-22T16:20:44.387406', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'saving'}
INFO - 16:20:44: not storing attribute cum_table
INFO - 16:20:44: saved word2vec.model
