## Import Libraries

In [77]:
import pandas as pd
import numpy as np
import re
import tqdm
import random 

from nltk.corpus import stopwords

from gensim.models import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import logging
# Configure the root logger to emit records of level INFO and above, each
# rendered as "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

%matplotlib inline

## Read Cleaned Data

In [78]:
### General tweets: report how many rows were read, then keep only the text column
data_general = pd.read_csv('../data/CleanNonEnglishTweetsFromJktAndBaku.csv')
print(data_general.shape[0])
data_general = data_general.loc[:, ['tweet']]

### Influencer tweets: same treatment
data_influencer = pd.read_csv('../data/CleanNonEnglishTweetsFromJktAndBaku2.csv')
print(data_influencer.shape[0])
data_influencer = data_influencer.loc[:, ['tweet']]

### Stack both sources into one frame (each frame keeps its own row labels)
frames = [data_general, data_influencer]
data = pd.concat(frames)

### Remove repeated tweets (the first occurrence is the one kept), then skip
### the first 500 of the remaining rows.
### NOTE(review): the reason for skipping 500 rows is not stated in this notebook.
print(f'before: {len(data)}')
data = data.drop_duplicates(subset='tweet', keep='first').iloc[500:]
print(f'after : {len(data)}')


2351520
587272
before: 2938792
after : 2912999


## Create Corpus

In [79]:
%%time


### before: show the first two raw tweets
print(data['tweet'].iloc[:2])

### per-tweet cleanup in one pass: remove every comma, then rewrite "@" as "di"
### (str() is applied first so non-string cells are handled as text too)
tmp_corpus = data['tweet'].map(
    lambda tweet: re.sub("[@]", "di", re.sub("[,]", "", str(tweet)))
)

### Build the corpus as a list of token lists:
###   * every tweet is cut into sentences at each "."
###   * every sentence is cut into words at each single space
###   * empty / blank tokens are thrown away
###   * a sentence is kept only when at least two words remain
tmp_corpus = [
    words
    for tweet in tmp_corpus
    for sentence in tweet.split('.')
    for words in [[word for word in sentence.split(' ') if word not in ('', ' ')]]
    if len(words) >= 2
]

### after: show the first five tokenised sentences
print(tmp_corpus[:5])


500    kemaren ada yg di recall kan ,  tp lupa yg man...
501      hubungan rusia dan ukraina semakin meruncing...
Name: tweet, dtype: object
[['kemaren', 'ada', 'yg', 'di', 'recall', 'kan', 'tp', 'lupa', 'yg', 'mana'], ['tktnya', '25', 'itu', 'yg', 'di', 'recall'], ['masih', 'browsing2', 'jg', 'sambil', 'nunggu', 'si', 'pinky', 'laku'], ['semoga', 'yg', 'beli', 'kgk', 'nawar2', 'lg'], ['hubungan', 'rusia', 'dan', 'ukraina', 'semakin', 'meruncing', 'akibat', 'insiden', 'di', 'selat', 'kirch', 'laut', 'hitam']]
Wall time: 47.5 s


In [80]:
### randomize corpus
### The shuffle uses its own seeded generator, so a fresh "Restart & Run All"
### always yields the same sentence order (and therefore comparable models
### downstream) without touching the global `random` state.
### Note: Corpus is the SAME list object as tmp_corpus (no copy is made, to
### avoid duplicating a multi-million-entry list), so tmp_corpus is shuffled
### too, and re-running only this cell shuffles the list again.
SHUFFLE_SEED = 42
Corpus = tmp_corpus
random.Random(SHUFFLE_SEED).shuffle(Corpus)

### count how many words and sentences there are
num_of_sentences = len(Corpus)
num_of_words = sum(len(line) for line in Corpus)

print('Num of sentences - %s'%(num_of_sentences))
print('Num of words - %s'%(num_of_words))

Num of sentences - 3899922
Num of words - 32070665


## Create Bigram

In [81]:
%%time
### Learn which adjacent word pairs should be treated as a single phrase,
### then wrap the trained model in a Phraser for applying it to sentences.
phrases = Phrases(sentences=Corpus, min_count=25, threshold=50)
bigram = Phraser(phrases)

### Replace every sentence in Corpus, in place, by its phrase-merged version,
### printing the current position every 100000 sentences as a progress marker.
for index in range(len(Corpus)):
    if not index % 100000:
        print(index)
    sentence = Corpus[index]
    Corpus[index] = bigram[sentence]

at sentence #2190000, processed 18006999 words and 6091965 word types
2021-01-03 10:38:01,429 : INFO : PROGRESS: at sentence #2200000, processed 18089104 words and 6112458 word types
2021-01-03 10:38:01,557 : INFO : PROGRESS: at sentence #2210000, processed 18170894 words and 6132934 word types
2021-01-03 10:38:01,683 : INFO : PROGRESS: at sentence #2220000, processed 18253144 words and 6153757 word types
2021-01-03 10:38:01,812 : INFO : PROGRESS: at sentence #2230000, processed 18334937 words and 6174459 word types
2021-01-03 10:38:01,939 : INFO : PROGRESS: at sentence #2240000, processed 18417130 words and 6195275 word types
2021-01-03 10:38:02,068 : INFO : PROGRESS: at sentence #2250000, processed 18499182 words and 6215649 word types
2021-01-03 10:38:02,196 : INFO : PROGRESS: at sentence #2260000, processed 18581931 words and 6236372 word types
2021-01-03 10:38:02,323 : INFO : PROGRESS: at sentence #2270000, processed 18664443 words and 6256747 word types
2021-01-03 10:38:02,450 : 

## Train Word2Vec

## Train FastText