## Reading the preprocessed files

In [2]:
import pandas as pd

# Load the cleaned Quora question pairs produced by the preprocessing step.
clean_train_df = pd.read_csv(
    filepath_or_buffer='../storage/datasets/q2b/preprocessed/train_quora_clean.csv'
)
clean_test_df = pd.read_csv(
    filepath_or_buffer='../storage/datasets/q2b/preprocessed/test_quora_clean.csv'
)

## Creating the embedding matrix

In [3]:
def form_embedding_matrix(clean_train_df, clean_test_df, columns, w2v=None, stop_words=None, seed=None):
    """Index-encode the question columns and build the matching embedding matrix.

    Every word of every question in ``columns`` is mapped to an integer index
    (index 0 is reserved for ``'<unk>'``). The vocabulary is shared between the
    train and test frames, with train processed first.

    Parameters
    ----------
    clean_train_df, clean_test_df : pandas.DataFrame
        Frames holding the question text in the given ``columns``.
    columns : sequence of str
        Names of the question columns to encode, e.g. ``['Q1', 'Q2']``.
    w2v : optional
        Pretrained word vectors supporting ``word in w2v``, ``w2v[word]`` and
        (optionally) a ``vector_size`` attribute. When ``None`` the
        ``'word2vec-google-news-300'`` model is loaded through
        ``gensim.downloader``.
    stop_words : iterable of str, optional
        Stop words to consider for removal. When ``None`` the NLTK English
        stop-word list is downloaded and used.
    seed : int, optional
        Seed for the random initialisation of words without a pretrained
        vector. When ``None`` the global NumPy random state is used.

    Returns
    -------
    embeddings : numpy.ndarray
        Array of shape ``(len(vocabulary) + 1, embedding_dim)``. Row 0 is all
        zeros; rows of words known to ``w2v`` hold the pretrained vector, the
        remaining rows keep their random normal initialisation.
    numb_represantation_train, numb_represantation_test : list
        One entry per data-frame row; each entry is a list with one list of
        word indices per column in ``columns``.
    """
    import numpy as np

    if w2v is None:
        # Imported lazily so callers that inject their own vectors do not need gensim.
        import gensim.downloader
        w2v = gensim.downloader.load('word2vec-google-news-300')

    if stop_words is None:
        import nltk
        from nltk.corpus import stopwords
        nltk.download('stopwords')
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Take the dimensionality from the vectors themselves; fall back to the
    # size of the default Google News model if the object does not expose it.
    embedding_dim = getattr(w2v, 'vector_size', 300)

    vocabulary = dict()
    inverse_vocabulary = ['<unk>']

    def encode_question(text):
        """Return the list of vocabulary indices for one question."""
        # Missing questions come back from read_csv as NaN (a float); treat
        # anything that is not a string as an empty question.
        if not isinstance(text, str):
            return []

        indices = []
        for word in text.split():
            # Stop words are dropped only when the pretrained model has no
            # vector for them; stop words the model knows are kept.
            # `word in w2v` is used instead of `w2v.vocab`, which is no longer
            # available in gensim >= 4.
            if word in stop_words and word not in w2v:
                continue

            if word not in vocabulary:
                vocabulary[word] = len(inverse_vocabulary)
                inverse_vocabulary.append(word)
            indices.append(vocabulary[word])
        return indices

    def text_to_vec(df):
        """Encode every row of ``df`` as one index list per question column."""
        column_values = [df[column].tolist() for column in columns]
        return [
            [encode_question(values[row]) for values in column_values]
            for row in range(len(df))
        ]

    # Train first, then test, so indices are assigned in a stable order.
    numb_represantation_train = text_to_vec(clean_train_df)
    numb_represantation_test = text_to_vec(clean_test_df)

    # Random normal initialisation; row 0 ('<unk>') is zeroed out.
    rng = np.random if seed is None else np.random.RandomState(seed)
    embeddings = rng.randn(len(vocabulary) + 1, embedding_dim)
    embeddings[0] = 0

    # Overwrite the random rows with pretrained vectors where available.
    for word, index in vocabulary.items():
        if word in w2v:
            embeddings[index] = w2v[word]

    return embeddings, numb_represantation_train, numb_represantation_test

# Build the embedding matrix and the index-encoded question pairs for both splits.
(
    embeddings,
    numb_represantation_train,
    numb_represantation_test,
) = form_embedding_matrix(clean_train_df, clean_test_df, ['Q1', 'Q2'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mikev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Storing the embedding matrix and numeric representations

In [5]:
# Imported here so this cell also runs on a fresh kernel: no visible cell
# imports `np` or `pickle` at notebook scope.
import os
import pickle

import numpy as np

# Make sure the output directory exists before writing into it.
os.makedirs('../storage/datasets/q2b/word_embeddings', exist_ok=True)

# Save the embeddings to disk
np.save('../storage/datasets/q2b/word_embeddings/embeddings_matrix.npy', embeddings)

# Save the index-encoded representations of the train and test questions
with open('../storage/datasets/q2b/word_embeddings/numb_represantation_train.pkl', 'wb') as fp:
    pickle.dump(numb_represantation_train, fp)
with open('../storage/datasets/q2b/word_embeddings/numb_represantation_test.pkl', 'wb') as fp:
    pickle.dump(numb_represantation_test, fp)