In [1]:
import gensim, logging
from gensim.models.doc2vec import TaggedDocument 
from gensim.models import Doc2Vec
import random
import re 
import os 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
gmodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 

2022-01-27 14:50:28,669 : INFO : loading projection weights from GoogleNews-vectors-negative300.bin
2022-01-27 14:50:54,436 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-01-27T14:50:54.428210', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22538-SP0', 'event': 'load_word2vec_format'}


In [4]:
gmodel['cat']

array([ 0.0123291 ,  0.20410156, -0.28515625,  0.21679688,  0.11816406,
        0.08300781,  0.04980469, -0.00952148,  0.22070312, -0.12597656,
        0.08056641, -0.5859375 , -0.00445557, -0.296875  , -0.01312256,
       -0.08349609,  0.05053711,  0.15136719, -0.44921875, -0.0135498 ,
        0.21484375, -0.14746094,  0.22460938, -0.125     , -0.09716797,
        0.24902344, -0.2890625 ,  0.36523438,  0.41210938, -0.0859375 ,
       -0.07861328, -0.19726562, -0.09082031, -0.14160156, -0.10253906,
        0.13085938, -0.00346375,  0.07226562,  0.04418945,  0.34570312,
        0.07470703, -0.11230469,  0.06738281,  0.11230469,  0.01977539,
       -0.12353516,  0.20996094, -0.07226562, -0.02783203,  0.05541992,
       -0.33398438,  0.08544922,  0.34375   ,  0.13964844,  0.04931641,
       -0.13476562,  0.16308594, -0.37304688,  0.39648438,  0.10693359,
        0.22167969,  0.21289062, -0.08984375,  0.20703125,  0.08935547,
       -0.08251953,  0.05957031,  0.10205078, -0.19238281, -0.09

In [5]:
gmodel['dog']

array([ 5.12695312e-02, -2.23388672e-02, -1.72851562e-01,  1.61132812e-01,
       -8.44726562e-02,  5.73730469e-02,  5.85937500e-02, -8.25195312e-02,
       -1.53808594e-02, -6.34765625e-02,  1.79687500e-01, -4.23828125e-01,
       -2.25830078e-02, -1.66015625e-01, -2.51464844e-02,  1.07421875e-01,
       -1.99218750e-01,  1.59179688e-01, -1.87500000e-01, -1.20117188e-01,
        1.55273438e-01, -9.91210938e-02,  1.42578125e-01, -1.64062500e-01,
       -8.93554688e-02,  2.00195312e-01, -1.49414062e-01,  3.20312500e-01,
        3.28125000e-01,  2.44140625e-02, -9.71679688e-02, -8.20312500e-02,
       -3.63769531e-02, -8.59375000e-02, -9.86328125e-02,  7.78198242e-03,
       -1.34277344e-02,  5.27343750e-02,  1.48437500e-01,  3.33984375e-01,
        1.66015625e-02, -2.12890625e-01, -1.50756836e-02,  5.24902344e-02,
       -1.07421875e-01, -8.88671875e-02,  2.49023438e-01, -7.03125000e-02,
       -1.59912109e-02,  7.56835938e-02, -7.03125000e-02,  1.19140625e-01,
        2.29492188e-01,  

In [6]:
gmodel['spatula']

array([-0.19140625, -0.04296875,  0.27539062,  0.00488281, -0.3203125 ,
        0.08203125,  0.05566406, -0.03613281, -0.31445312,  0.10693359,
       -0.359375  ,  0.29882812,  0.02331543,  0.05517578, -0.140625  ,
        0.1953125 , -0.23632812, -0.22167969, -0.06542969, -0.3359375 ,
        0.25195312, -0.09326172,  0.54296875,  0.11328125, -0.28710938,
       -0.12011719, -0.11181641,  0.20996094, -0.33203125,  0.30273438,
       -0.3359375 , -0.12255859,  0.12890625, -0.28515625, -0.04223633,
        0.25585938,  0.3203125 ,  0.07177734,  0.19042969, -0.01379395,
        0.16992188, -0.22460938,  0.5078125 ,  0.08398438, -0.07519531,
       -0.06396484,  0.05371094,  0.34570312,  0.46289062, -0.16699219,
       -0.30664062,  0.15234375, -0.09765625, -0.26171875, -0.14160156,
        0.2265625 ,  0.49609375, -0.10791016, -0.08447266,  0.234375  ,
        0.04931641, -0.07128906,  0.05273438, -0.11914062,  0.09814453,
        0.11181641, -0.13574219, -0.46875   ,  0.26171875,  0.12

In [7]:
gmodel.similarity('cat', 'dog')

0.7609457

In [8]:
gmodel.similarity('cat','spatula')

0.12412614

In [9]:
def ekstrak_kata(terkirim):
    terkirim = terkirim.lower()
    terkirim = re.sub(r'<[^>]+>',' ', terkirim) # strip html tags 
    terkirim = re.sub(r'(\w)\'(\w)', '\1\2', terkirim) # remove apostrophes
    terkirim = re.sub(r'\W',' ', terkirim) # remove punctuation
    terkirim = re.sub(r'\s+',' ', terkirim)# remove repeated spaces 
    terkirim = terkirim.strip()
    return terkirim.split()

In [10]:
# unsupervised training data 
unsup_sentences = []
# link: http://ai.stanford.edu/~amaas/data/sentiment/, data mentah dari folder IMDB
for dirname in ["train/pos", "train/neg", "train/unsup", "test/pos", "test/neg"] :
    for fname in sorted(os.listdir("aclImdb/" + dirname)):
        if fname[-4:] == '.txt':
            with open("aclImdb/" + dirname + "/"+ fname, encoding = 'UTF-8') as f: 
                terkirim = f.read()
                words = ekstrak_kata(terkirim)
                unsup_sentences.append(TaggedDocument(words, [dirname + "/" + fname])) 
# link: http://w.cs.cormeld.edu/people/pabo/movie-review-data 
for dirname in ["txt_sentoken/pos", "txt_sentoken/neg"]:
    for fname in sorted(os.listdir(dirname)) :
        if fname[-4:] == '.txt':
            with open(dirname + '/' +  fname, encoding = 'UTF-8') as f:
                for i, terkirim in enumerate(f):
                    words = ekstrak_kata(terkirim)
                    unsup_sentences.append(TaggedDocument(words, ["%s/%s-%d" % (dirname, fname, i)])) 
#link: https://nip.Stanford.edu/sentiment/, data mentah dari folder Rotten Tomatoes 
with open("stanfordSentimentTreebank/original_rt_snippets.txt", encoding = 'UTF-8') as f:
    for i, line in enumerate(f): 
        words =  ekstrak_kata(terkirim) 
        unsup_sentences.append(TaggedDocument (words, ["rt-%d" % i]))

In [11]:
len(unsup_sentences) 

175325

In [12]:
unsup_sentences[0:1]

[TaggedDocument(words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'hig', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', 

In [13]:
class PermuteSentences(object):
    def __init__(self, terkirim):
        self.terkirim = terkirim
        
    def __iter__(self):
        shuffled = list(self.terkirim)
        random.shuffle(shuffled)
        for terkirim in shuffled:
            yield terkirim

In [14]:
permuter = PermuteSentences(unsup_sentences) 
model = Doc2Vec(permuter, dm=0, hs=1, vector_size=50)

2022-01-27 14:52:20,560 : INFO : collecting all words and their counts
2022-01-27 14:52:20,710 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-01-27 14:52:21,137 : INFO : PROGRESS: at example #10000, processed 1394634 words (3268441/s), 44924 word types, 10000 tags
2022-01-27 14:52:21,581 : INFO : PROGRESS: at example #20000, processed 2867668 words (3321721/s), 61787 word types, 20000 tags
2022-01-27 14:52:22,038 : INFO : PROGRESS: at example #30000, processed 4271806 words (3079743/s), 73346 word types, 30000 tags
2022-01-27 14:52:22,490 : INFO : PROGRESS: at example #40000, processed 5668510 words (3095865/s), 83033 word types, 40000 tags
2022-01-27 14:52:22,956 : INFO : PROGRESS: at example #50000, processed 7073236 words (3022300/s), 91385 word types, 50000 tags
2022-01-27 14:52:24,054 : INFO : PROGRESS: at example #60000, processed 8487804 words (1289903/s), 98618 word types, 60000 tags
2022-01-27 14:52:24,465 : INFO : PROGRESS: at example #70

2022-01-27 14:53:04,472 : INFO : worker thread finished; awaiting finish of 1 more threads
2022-01-27 14:53:04,473 : INFO : worker thread finished; awaiting finish of 0 more threads
2022-01-27 14:53:04,473 : INFO : EPOCH - 1 : training on 24693510 raw words (18753183 effective words) took 31.5s, 595837 effective words/s
2022-01-27 14:53:05,502 : INFO : EPOCH 2 - PROGRESS: at 2.78% examples, 508444 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:53:06,524 : INFO : EPOCH 2 - PROGRESS: at 5.93% examples, 539799 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:53:07,528 : INFO : EPOCH 2 - PROGRESS: at 9.03% examples, 553222 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:53:08,534 : INFO : EPOCH 2 - PROGRESS: at 12.12% examples, 559453 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:53:09,541 : INFO : EPOCH 2 - PROGRESS: at 15.17% examples, 562334 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:53:10,546 : INFO : EPOCH 2 - PROGRESS: at 18.62% examples, 575205 words/s, in_qsize 6, out_qsize 0
202

2022-01-27 14:54:10,116 : INFO : EPOCH 4 - PROGRESS: at 14.82% examples, 546073 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:11,127 : INFO : EPOCH 4 - PROGRESS: at 17.80% examples, 551255 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:12,140 : INFO : EPOCH 4 - PROGRESS: at 21.25% examples, 565350 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:13,152 : INFO : EPOCH 4 - PROGRESS: at 24.65% examples, 574883 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:14,166 : INFO : EPOCH 4 - PROGRESS: at 28.21% examples, 583059 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:54:15,172 : INFO : EPOCH 4 - PROGRESS: at 31.63% examples, 589089 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:16,176 : INFO : EPOCH 4 - PROGRESS: at 34.93% examples, 594182 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:17,178 : INFO : EPOCH 4 - PROGRESS: at 38.40% examples, 598718 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:54:18,180 : INFO : EPOCH 4 - PROGRESS: at 41.73% examples, 600860 words/s, in_qsiz

2022-01-27 14:55:17,392 : INFO : EPOCH 6 - PROGRESS: at 34.63% examples, 588287 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:55:18,413 : INFO : EPOCH 6 - PROGRESS: at 38.11% examples, 592445 words/s, in_qsize 6, out_qsize 1
2022-01-27 14:55:19,421 : INFO : EPOCH 6 - PROGRESS: at 41.64% examples, 595798 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:20,433 : INFO : EPOCH 6 - PROGRESS: at 45.10% examples, 599659 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:21,439 : INFO : EPOCH 6 - PROGRESS: at 48.53% examples, 602587 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:22,453 : INFO : EPOCH 6 - PROGRESS: at 52.06% examples, 605526 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:23,464 : INFO : EPOCH 6 - PROGRESS: at 55.42% examples, 606493 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:24,473 : INFO : EPOCH 6 - PROGRESS: at 58.88% examples, 608173 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:55:25,479 : INFO : EPOCH 6 - PROGRESS: at 62.40% examples, 610525 words/s, in_qsiz

2022-01-27 14:56:25,329 : INFO : EPOCH 8 - PROGRESS: at 56.97% examples, 589857 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:56:26,333 : INFO : EPOCH 8 - PROGRESS: at 60.41% examples, 592065 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:56:27,347 : INFO : EPOCH 8 - PROGRESS: at 63.79% examples, 594506 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:56:28,350 : INFO : EPOCH 8 - PROGRESS: at 67.09% examples, 595284 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:56:29,364 : INFO : EPOCH 8 - PROGRESS: at 70.40% examples, 595235 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:56:30,377 : INFO : EPOCH 8 - PROGRESS: at 73.80% examples, 596465 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:56:31,378 : INFO : EPOCH 8 - PROGRESS: at 77.06% examples, 597049 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:56:32,378 : INFO : EPOCH 8 - PROGRESS: at 80.20% examples, 597656 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:56:33,389 : INFO : EPOCH 8 - PROGRESS: at 83.55% examples, 598499 words/s, in_qsiz

2022-01-27 14:57:32,763 : INFO : EPOCH 10 - PROGRESS: at 74.05% examples, 598107 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:57:33,779 : INFO : EPOCH 10 - PROGRESS: at 77.38% examples, 598909 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:34,782 : INFO : EPOCH 10 - PROGRESS: at 80.76% examples, 599735 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:35,791 : INFO : EPOCH 10 - PROGRESS: at 84.04% examples, 600277 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:36,796 : INFO : EPOCH 10 - PROGRESS: at 87.51% examples, 601651 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:37,798 : INFO : EPOCH 10 - PROGRESS: at 90.89% examples, 602784 words/s, in_qsize 5, out_qsize 0
2022-01-27 14:57:38,806 : INFO : EPOCH 10 - PROGRESS: at 94.40% examples, 604268 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:39,809 : INFO : EPOCH 10 - PROGRESS: at 97.77% examples, 605404 words/s, in_qsize 6, out_qsize 0
2022-01-27 14:57:40,422 : INFO : worker thread finished; awaiting finish of 2 more threa

In [15]:
model.save('reviews.d2v')
# menyimpan model dalam bentuk *.d2v, berfungsi untuk digunakan kembali : model = Doc2Vec.load('reviews.d2v') 

2022-01-27 14:57:40,451 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'reviews.d2v', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-01-27T14:57:40.451503', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22538-SP0', 'event': 'saving'}
2022-01-27 14:57:40,454 : INFO : not storing attribute cum_table
2022-01-27 14:57:41,046 : INFO : saved reviews.d2v


In [16]:
model.infer_vector(ekstrak_kata("This place is not worth your time, let alone Vegas.")) 

array([-0.21008383, -0.09705748,  0.03777231,  0.03344745,  0.29576245,
        0.27663946,  0.21059437,  0.13903672,  0.1214092 ,  0.8699159 ,
       -0.31931582, -0.3278709 , -0.49732885,  0.00762915, -0.28329846,
        0.03643729, -0.02731987, -0.30725703,  0.41931066,  0.2034981 ,
        0.2524002 ,  0.12337118,  0.12887421,  0.01535648, -0.04059077,
        0.24113612, -0.1707031 , -0.17841776, -0.15252005,  0.5027428 ,
        0.38536066, -0.2380162 ,  0.23833673, -0.2370381 ,  0.02904741,
       -0.27599928,  0.47388935, -0.12521774,  0.04452138,  0.13457885,
       -0.04406448, -0.2555967 , -0.2238834 , -0.22984448,  0.11596889,
       -0.12569173, -0.16880333,  0.06180951,  0.39734617,  0.11797499],
      dtype=float32)

In [17]:
cosine_similarity(
    [model.infer_vector(ekstrak_kata("This place is not worth your time, let alone Vegas."))],
    [model.infer_vector(ekstrak_kata("Service sucks."))])

array([[0.40617573]], dtype=float32)

In [18]:
cosine_similarity(
    [model.infer_vector(ekstrak_kata("Highly recommended."))],
    [model.infer_vector(ekstrak_kata("Service sucks."))])

array([[0.25987488]], dtype=float32)

In [19]:
sentences = []
sentvecs = []
sentiments = []
for fname in ["yelp", "amazon_cells", "imdb"]: 
    with open("sentiment labelled sentences/%s_labelled.txt" % fname, encoding='UTF-8') as f:
        for i, line in enumerate(f):
            line_split = line.strip().split('\t')
            sentences.append(line_split[0])
            words = ekstrak_kata(line_split[0])
            sentvecs.append(model.infer_vector(words, epochs=10)) # create a vector for this document
            sentiments.append(int(line_split[1]))
            
# membuat fungsi acak dengan kombinasi setences, setvecs, dam stiments
combined = list(zip(sentences, sentvecs, sentiments))
random.shuffle(combined)
sentences, sentvecs, sentiments = zip(*combined)

In [20]:
clf = KNeighborsClassifier(n_neighbors=9)
clfrf = RandomForestClassifier()

In [21]:
skor = cross_val_score(clf, sentvecs, sentiments, cv=5)
np.mean(skor), np.std(skor) 

(0.7863333333333333, 0.020477630071210232)

In [22]:
skor = cross_val_score(clfrf, sentvecs, sentiments, cv=5)
np.mean(skor), np.std(skor)

(0.7899999999999999, 0.014337208778404385)

In [23]:
# kompile pembandingan kata dengan RadomForestClassifier
pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier())

In [24]:
skor = cross_val_score(clfrf, sentvecs, sentiments, cv=5)
np.mean(skor), np.std(skor)

(0.7966666666666667, 0.01773258143769383)