In [None]:
import numpy as np
import utils
import spacy
import pandas as pd
import tqdm

In [None]:
# Number of rows kept from each split. The notebook works on a tiny sample;
# raise this (or drop the .head() calls) for a full run.
SAMPLE_ROWS = 5

train_df = pd.read_csv('data/train.csv')
valid_df = pd.read_csv('data/valid.csv')
test_df = pd.read_csv('data/test.csv')

# .copy() makes each sample an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[['review_id', 'text', 'stars']].head(SAMPLE_ROWS).copy()
valid_df = valid_df[['review_id', 'text', 'stars']].head(SAMPLE_ROWS).copy()
# The test split is loaded without the 'stars' column.
test_df = test_df[['review_id', 'text']].head(SAMPLE_ROWS).copy()

print(train_df.head())
print(valid_df.head())
print(test_df.head())

In [None]:
def create_word_embed_vec(corpus, maxlen=128, embed_features=96, nlp=None):
    """Embed every document of ``corpus`` as a fixed-size matrix of word vectors.

    Each text is passed through ``utils.tokenize`` and ``utils.stem``; every
    resulting token is then embedded with ``nlp(token).vector``. Documents
    longer than ``maxlen`` tokens are truncated, shorter ones are zero-padded.

    Parameters
    ----------
    corpus : iterable of str
        The documents to embed (e.g. a pandas Series of review texts).
    maxlen : int, default 128
        Maximum number of tokens kept per document.
    embed_features : int, default 96
        Width of a single token vector. Must match the size of
        ``nlp(token).vector`` for the pipeline in use.
    nlp : callable, optional
        A loaded spaCy pipeline (anything where ``nlp(text).vector`` works).
        When omitted, ``en_core_web_sm`` is loaded; pass a pipeline explicitly
        to avoid reloading the model on every call.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(corpus), maxlen, embed_features)``.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    texts = list(corpus)
    word_embed_vec = np.zeros(shape=(len(texts), maxlen, embed_features))

    # `total` lets tqdm render a real progress bar; enumerate() hides the length.
    for doc_index, text in tqdm.tqdm(enumerate(texts), total=len(texts)):
        # Walk the token sequence itself. Joining the tokens back into a single
        # string before looping would iterate characters, not words.
        # NOTE(review): assumes utils.stem returns an iterable of token
        # strings -- confirm against utils.
        tokens = utils.stem(utils.tokenize(text))
        if isinstance(tokens, str):
            # Defensive: a plain string would otherwise be iterated per character.
            tokens = tokens.split()

        for token_index, word in enumerate(tokens):
            if token_index >= maxlen:
                break
            word_embed_vec[doc_index, token_index] = nlp(word).vector
    return word_embed_vec
    

In [None]:
# Embed the training texts; result has shape (n_docs, maxlen, embed_features).
train_word_embed_vec = create_word_embed_vec(train_df['text'])
print(train_word_embed_vec.shape)
# Flatten each document's matrix into one list per row. The row count comes from
# the array itself, so this keeps working when the sample size changes.
train_df['word_embed'] = train_word_embed_vec.reshape(len(train_word_embed_vec), -1).tolist()
train_df.to_csv('data/train_word2.csv', index=False)

In [None]:
# Embed the validation texts; result has shape (n_docs, maxlen, embed_features).
valid_word_embed_vec = create_word_embed_vec(valid_df['text'])
print(valid_word_embed_vec.shape)
# Flatten each document's matrix into one list per row. The row count comes from
# the array itself, so this keeps working when the sample size changes.
valid_df['word_embed'] = valid_word_embed_vec.reshape(len(valid_word_embed_vec), -1).tolist()
valid_df.to_csv('data/valid_word2.csv', index=False)

In [None]:
# Embed the test texts; result has shape (n_docs, maxlen, embed_features).
test_word_embed_vec = create_word_embed_vec(test_df['text'])
print(test_word_embed_vec.shape)
# Flatten each document's matrix into one list per row. The row count comes from
# the array itself, so this keeps working when the sample size changes.
test_df['word_embed'] = test_word_embed_vec.reshape(len(test_word_embed_vec), -1).tolist()
test_df.to_csv('data/test_word2.csv', index=False)