In [2]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))

documents = dataset.data

print(len(documents))

documents[1]



11314


"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [3]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

def clean_text(d):
    pattern = r'[^a-zA-Z\s]'
    text = re.sub(pattern,'',d)
    return text
def clean_stopword(d):
    stop_words = stopwords.words('english')
    return ' '.join(w.lower() for w in d.split() if w not in stop_words and len(w)>3)

def tokenize(d):
    return word_tokenize(d)


    



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bitcamp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
import pandas as pd

news_df = pd.DataFrame({'article':documents})
len(news_df)

11314

In [5]:
news_df.replace("", float("NaN"), inplace=True)
news_df.dropna(inplace=True)
len(news_df)

11096

In [6]:
news_df['article'] = news_df['article'].apply(clean_text)
news_df['article']


0        Well im not sure about the story nad it did se...
1        \n\n\n\n\n\n\nYeah do you expect people to rea...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well I will have to change the scoring on my p...
                               ...                        
11309    Danny Rubenstein an Israeli journalist will be...
11310                                                   \n
11311    \nI agree  Home runs off Clemens are always me...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          \nNo arg...
Name: article, Length: 11096, dtype: object

In [7]:
news_df['article'] = news_df['article'].apply(clean_stopword)
news_df['article']



0        well sure story seem biased what disagree stat...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: article, Length: 11096, dtype: object

In [8]:
tokenized_news = news_df['article'].apply(tokenize)
tokenized_news = tokenized_news.to_list()

In [9]:
import numpy as np

drop_news = [index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1]
news_texts = np.delete(tokenized_news, drop_news, axis=0)
print(len(news_texts))

10945


  return array(a, dtype, copy=False, order=order)


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

news_2000 = news_texts[:2000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_2000)

idx2word = {value:key for key, value in tokenizer.word_index.items()}
sequences = tokenizer.texts_to_sequences(news_2000)



In [11]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

29769


In [12]:
print(sequences[1])

[1263, 457, 2, 60, 119, 419, 61, 1374, 22, 69, 3498, 397, 6874, 412, 1173, 373, 2256, 458, 59, 12478, 458, 1900, 3850, 397, 22, 10, 4325, 8749, 177, 303, 136, 154, 664, 12479, 316, 12480, 15, 12481, 4, 790, 12482, 12483, 4917, 8750]


In [13]:
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in sequences[:10]]

In [14]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]))
    


inhuman(8747), mqefgcmbprwaqimakjwqzql(25818) -> 0
austria(4324), what(34) -> 1
israels(3496), hyperion(15830) -> 0
letter(663), nilsson(4273) -> 0
government(50), quarrel(7032) -> 0


In [15]:
print(len(skip_grams))
print(len(pairs))
print(len(labels))

10
2420
2420


In [16]:
skip_grams = [skipgrams(seq, vocabulary_size=vocab_size, window_size=10) for seq in sequences]


In [17]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

In [18]:
embed_size = 50

In [22]:
def word2vec():
    target_inputs = Input(shape=(1,), dtype='int32')
    target_embedding = Embedding(vocab_size, embed_size)(target_inputs)
    
    context_inputs = Input(shape=(1,), dtype='int32')
    context_embedding = Embedding(vocab_size, embed_size)(context_inputs)
    
    dot_product = Dot(axes=2)([target_embedding, context_embedding])
    dot_product = Reshape((1,), input_shape=(1,1))(dot_product)
    output = Activation('sigmoid')(dot_product)
    
    model = Model(inputs=[target_inputs, context_inputs], outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [23]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 50)        1488450     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 50)        1488450     ['input_2[0][0]']                
                                                                                              

In [25]:
for epoch in range(1, 11):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)
        
    print('Epoch:', epoch, 'Loss:', loss)

Epoch: 1 Loss: 692.9964980855584
Epoch: 2 Loss: 665.7974811606109
Epoch: 3 Loss: 633.2912906017154
Epoch: 4 Loss: 596.6879737945274
Epoch: 5 Loss: 557.836424717214
Epoch: 6 Loss: 519.0104164713994
Epoch: 7 Loss: 481.7197163195815
Epoch: 8 Loss: 447.2171741516795
Epoch: 9 Loss: 415.839994093054
Epoch: 10 Loss: 387.882243967877


In [27]:
import gensim

f = open('skipgram.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgram = gensim.models.KeyedVectors.load_word2vec_format('skipgram.txt', binary=False)

In [28]:
skipgram.most_similar(positive=['soldier'])

[('indiscriminately', 0.8995411396026611),
 ('booby', 0.8690037131309509),
 ('retalliates', 0.8514440655708313),
 ('ammunitions', 0.8333958983421326),
 ('traps', 0.8318977355957031),
 ('sneak', 0.8028048276901245),
 ('patrols', 0.7218192219734192),
 ('occupying', 0.7067608833312988),
 ('financially', 0.6982219815254211),
 ('israeli', 0.6947730779647827)]

In [29]:
skipgram.most_similar(positive=['world'])

[('elmwood', 0.604546308517456),
 ('communitys', 0.5979717373847961),
 ('diccon', 0.5949234962463379),
 ('tzeghagron', 0.5835188031196594),
 ('argumet', 0.5729897022247314),
 ('nobody', 0.5566871762275696),
 ('inadmissable', 0.5526425242424011),
 ('directed', 0.5524103045463562),
 ('races', 0.5495313405990601),
 ('betraying', 0.5352721214294434)]

In [31]:
def skipgram2cbow(skipgrams):
    cbows = []
    flag = 0
    
    for n in skip_grams:
        temp1 = []
        for t in n:
            if flag==1 :
                flag = 0
                temp1.append(t)
            else:
                flag = 1
                temp2 = []
                for x in t:
                    temp2.append([x[1], x[0]])
                temp1.append(temp2)
            cbows.append(temp1)
    return cbows


            
            

In [32]:
cbows = skipgram2cbow(skip_grams)

In [34]:
pairs, labels = cbows[0][0], cbows[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]))

reports(1012), report(627) -> 1
ruin(12474), sure(64) -> 1
yapilamayacak(14474), report(627) -> 0
shame(3221), reason(203) -> 1
received(387), makes(208) -> 1


In [35]:
print(len(cbows))
print(len(pairs))
print(len(labels))

4000
2420
2420


In [36]:
model = word2vec()
model.summary()
plot_model(model, show_shapes=True, show_layer_names=True)

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 50)        1488450     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 50)        1488450     ['input_4[0][0]']                
                                                                                            

In [40]:
for epoch in range(1, 11):
    loss = 0
    for _, elem in enumerate(cbows):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)
           
    print('Epoch:', epoch, 'Loss:', loss)

Epoch: 1 Loss: 774.9314250665484
Epoch: 2 Loss: 701.2680347991482
Epoch: 3 Loss: 642.014219972807
Epoch: 4 Loss: 593.6540243061663
Epoch: 5 Loss: 553.5183973427438
Epoch: 6 Loss: 520.0043771396104
Epoch: 7 Loss: 491.66066822369976
Epoch: 8 Loss: 467.50972097968975
Epoch: 9 Loss: 446.6312315133209
Epoch: 10 Loss: 428.4711104764431


In [41]:
import gensim

f = open('cbow.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

cbow = gensim.models.KeyedVectors.load_word2vec_format('cbow.txt', binary=False)

In [42]:
cbow.most_similar(positive=['soldier'])


[('rodney', 0.8361040353775024),
 ('telegraph', 0.8254186511039734),
 ('angered', 0.8183834552764893),
 ('retalliates', 0.8049345016479492),
 ('miraculously', 0.8015758991241455),
 ('alleen', 0.8004842400550842),
 ('indiscriminately', 0.7989708185195923),
 ('mqwuwtwwuwwuwatweuwae', 0.7916582822799683),
 ('pubamigaincomingimagine', 0.7880551815032959),
 ('grandson', 0.7875582575798035)]

In [43]:
cbow.most_similar

<bound method WordEmbeddingsKeyedVectors.most_similar of <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000001F5E741F518>>