In [1]:
import pandas as pd
import numpy as np
import gensim
import warnings
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the preprocessed news corpus and pull the "text" column out as a plain list.
dataset = pd.read_csv("News_dataset_preprocessed.csv")
articles = list(dataset["text"])

# Tokenize every article with gensim. Each entry is coerced with str() first,
# so non-string cells (presumably missing values — TODO confirm) are tokenized
# from their string form instead of raising.
tokenized_words = [
    gensim.utils.simple_preprocess(str(article)) for article in articles
]

In [3]:
# Fit a Keras Tokenizer on the tokenized articles (a list of token lists built
# in the previous cell) so that `tokenizer.word_index` is populated.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_words)

In [4]:
# Display the fitted vocabulary: a mapping from each word to its integer index.
tokenizer.word_index

{'said': 1,
 'trump': 2,
 'would': 3,
 'president': 4,
 'us': 5,
 'state': 6,
 'people': 7,
 'one': 8,
 'republican': 9,
 'year': 10,
 'also': 11,
 'new': 12,
 'reuters': 13,
 'government': 14,
 'donald': 15,
 'house': 16,
 'clinton': 17,
 'time': 18,
 'obama': 19,
 'could': 20,
 'say': 21,
 'united': 22,
 'told': 23,
 'like': 24,
 'american': 25,
 'white': 26,
 'election': 27,
 'country': 28,
 'right': 29,
 'campaign': 30,
 'party': 31,
 'two': 32,
 'last': 33,
 'news': 34,
 'official': 35,
 'first': 36,
 'washington': 37,
 'former': 38,
 'make': 39,
 'even': 40,
 'law': 41,
 'group': 42,
 'get': 43,
 'hillary': 44,
 'many': 45,
 'security': 46,
 'day': 47,
 'national': 48,
 'week': 49,
 'made': 50,
 'may': 51,
 'vote': 52,
 'want': 53,
 'political': 54,
 'court': 55,
 'police': 56,
 'million': 57,
 'since': 58,
 'know': 59,
 'percent': 60,
 'going': 61,
 'support': 62,
 'think': 63,
 'image': 64,
 'back': 65,
 'take': 66,
 'way': 67,
 'leader': 68,
 'presidential': 69,
 'bill': 70,
 

In [5]:
def vocab_embedding(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-blank line of the file is read as a token followed by
    whitespace-separated numeric components. For each token that appears in
    ``word_index``, the first ``embedding_dim`` components are written to the
    row of the result selected by ``word_index[token]``.

    Parameters
    ----------
    filepath : str or path-like
        Path to the UTF-8 encoded embeddings text file.
    word_index : Mapping[str, int]
        Maps each vocabulary word to its row index in the returned matrix.
    embedding_dim : int
        Number of leading vector components to keep per word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows whose
        word never appears in the file are left as zeros.

    Raises
    ------
    ValueError
        If a line for a vocabulary word carries fewer than ``embedding_dim``
        components, or a kept component is not a valid number.
    """
    # One more row than there are words, so that index len(word_index) is
    # addressable when the indices start at 1 rather than 0.
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to parse.
                continue

            word, vector = fields[0], fields[1:]
            if word not in word_index:
                continue

            if len(vector) < embedding_dim:
                # Without this check a one-component row would be silently
                # broadcast across the whole embedding row.
                raise ValueError(
                    "embedding for %r has %d components, expected at least %d"
                    % (word, len(vector), embedding_dim)
                )

            # Truncate before converting so only the kept components are parsed.
            embedding_matrix_vocab[word_index[word]] = np.asarray(
                vector[:embedding_dim], dtype=np.float32
            )

    return embedding_matrix_vocab

In [6]:
# Width of each word vector; the file name below suggests 50-d GloVe vectors.
embedding_dim = 50

# One row per vocabulary index, filled from the GloVe file where available.
embedding_matrix_vocab = vocab_embedding(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [7]:
# For every article, look each token up in the tokenizer vocabulary and collect
# the matching row of the embedding matrix. The result is a list (one entry per
# article) of lists (one embedding row per token, in token order).
vectors = [
    [embedding_matrix_vocab[tokenizer.word_index[token]] for token in article_tokens]
    for article_tokens in tokenized_words
]

In [8]:
# Save the vectors to disk. The context manager closes the file handle even if
# pickling fails part-way through, so a failed dump no longer leaks it.
with open("glove_vectors.pickle", "wb") as pickle_out:
    pickle.dump(vectors, pickle_out)