In [102]:
import pandas as pd
import numpy as np
# import tensorflow as tf
import os
from collections import defaultdict, Counter
import re
from tqdm import tqdm
from pyemd import emd
from gensim.models import KeyedVectors
from sklearn.externals import joblib
from nltk.tokenize import word_tokenize

In [None]:
# Load pretrained vectors stored in binary word2vec format. The path is
# relative to the notebook's working directory (one level up).
model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin',
                                         binary=True)

In [104]:
class TextReader:
    """
    Load labelled text files, optionally clean them, and turn the corpus
    into frequency-rank features and per-word embedding vectors.

    Every file named in ``suffix_labels`` is read line by line; each line is
    one document and carries that file's class label.

    Attributes:
        path: Directory holding the data files; ``ranks.npy`` is written here.
        data_files: Mapping of full file path -> class label.
        raw_labeled_data: Mapping of class label -> list of (cleaned) lines.
        word_fequency: ``Counter`` of tokens over the whole corpus, or
            ``None`` until ``prepare_data`` has run. The attribute keeps its
            original spelling so existing callers keep working.
        max_text_length: Token count of the longest line seen.
        ranks: Array of vocabulary words, most frequent first.
        X, y: Cached result of ``get_ranked_features`` (``None`` until built).
    """
    def __init__(self, data_dir, suffix_labels):
        """
        Args:
            data_dir: Directory containing the data files.
            suffix_labels: Mapping of file name (relative to ``data_dir``)
                to the class label shared by every line of that file.

        Raises:
            IOError: If a listed file is missing or is not a regular file.
        """
        self.path = data_dir
        self.ranks = None
        # (ranks array, {word: rank}) pair; rebuilt whenever ``ranks`` changes.
        self._rank_cache = None
        self.raw_labeled_data = defaultdict(list)
        self.word_fequency = None
        self.max_text_length = 0
        self.data_files = {}
        self.X = None
        self.y = None
        for file, label in suffix_labels.items():
            file_path = os.path.join(data_dir, file)
            # isfile() is already False for a path that does not exist.
            if not os.path.isfile(file_path):
                raise IOError(f'Data files are not found in {data_dir}')
            self.data_files[file_path] = label

    def clean_text(self, text, stopwords):
        """
        Tokenise ``text`` and keep the purely alphabetic, non-stopword tokens.

        A token is dropped when either its original or its lower-cased form
        is in ``stopwords``, so a lower-case stopword list also removes
        capitalised occurrences such as "The".

        Args:
            text: Raw text of one document.
            stopwords: Container of words to drop (tested with ``in``).

        Returns:
            The surviving tokens, lower-cased and joined by single spaces.
        """
        kept = []
        for token in word_tokenize(text):
            lowered = token.lower()
            if (token.isalpha()
                    and token not in stopwords
                    and lowered not in stopwords):
                kept.append(lowered)
        return " ".join(kept)

    def prepare_data(self, clean=True, **kwargs):
        """
        Read every data file and rebuild the corpus statistics and ranking.

        All per-corpus state (``raw_labeled_data``, ``max_text_length``, the
        cached features) is reset first, so calling this method again
        replaces the previous corpus instead of appending to it.

        Args:
            clean: When true each line goes through ``clean_text``;
                otherwise lines are stored exactly as read.
            **kwargs:
                stopwords: Iterable of words to drop when ``clean`` is true
                    (default: none).
                max_vocab: Number of most frequent words kept in the ranking
                    (default: all).

        Returns:
            True once the ranking has been stored.
        """
        # Hoisted out of the read loop; a frozenset gives O(1) membership.
        stopwords = frozenset(kwargs.get('stopwords') or ())

        self.raw_labeled_data = defaultdict(list)
        self.max_text_length = 0
        self.X = None
        self.y = None

        all_words = []
        for file_path, class_label in self.data_files.items():
            with open(file_path, 'r', encoding='latin-1') as infile:
                for line in infile:
                    if clean:
                        cleaned_line = self.clean_text(line, stopwords)
                    else:
                        cleaned_line = line
                    tokens = cleaned_line.split()
                    self.max_text_length = max(self.max_text_length, len(tokens))
                    all_words.extend(tokens)
                    self.raw_labeled_data[class_label].append(cleaned_line)

        self.word_fequency = Counter(all_words)
        return self.store_ranking(kwargs.get('max_vocab'))

    def store_ranking(self, max_vocab=None):
        """
        Rank the vocabulary by frequency, save it as ``ranks.npy`` inside
        ``self.path`` and keep it in memory.

        Args:
            max_vocab: Keep only this many most frequent words
                (``None`` keeps all).

        Returns:
            True.
        """
        ranks = [word for word, _ in self.word_fequency.most_common(max_vocab)]
        np.save(os.path.join(self.path, 'ranks'), ranks)
        # Keep the in-memory copy in step with the file, and drop everything
        # derived from the previous ranking.
        self.ranks = np.asarray(ranks)
        self._rank_cache = None
        self.X = None
        self.y = None
        return True

    def _rank_lookup(self):
        """Return a ``{word: 1-based rank}`` dict for the current ``ranks``."""
        if self.ranks is None:
            self.ranks = np.load(os.path.join(self.path, 'ranks.npy'))
        # getattr: instances created without __init__ have no cache attribute.
        cached = getattr(self, '_rank_cache', None)
        if cached is None or cached[0] is not self.ranks:
            table = {}
            for position, word in enumerate(self.ranks, start=1):
                # setdefault keeps the first position if a word repeats.
                table.setdefault(str(word), position)
            cached = (self.ranks, table)
            self._rank_cache = cached
        return cached[1]

    def get_rank(self, token):
        """
        Return the 1-based frequency rank of ``token`` (1 = most frequent),
        or 0 when the token is not in the stored ranking.

        The ranking is read from ``ranks.npy`` if it is not yet in memory.
        """
        return self._rank_lookup().get(token, 0)

    def get_ranked_features(self):
        """
        Encode every stored document as a fixed-length vector of word ranks.

        Documents shorter than ``max_text_length`` are padded on both sides
        with -1 (the extra slot goes to the right when the gap is odd);
        tokens missing from the ranking are encoded as 0.

        Returns:
            ``(X, y)``: integer arrays holding one row of ranks and one label
            per document. The result is cached on the instance and reused
            until ``prepare_data`` or ``store_ranking`` runs again.
        """
        if self.X is not None and self.y is not None:
            return self.X, self.y
        X = []
        y = []
        for label, corpus in self.raw_labeled_data.items():
            for doc in tqdm(corpus):
                doc_ranks = np.array([self.get_rank(token) for token in doc.split()],
                                     dtype=int)
                missing = self.max_text_length - len(doc_ranks)
                pad_left = missing // 2
                pad_right = missing - pad_left
                X.append(np.pad(doc_ranks, pad_width=(pad_left, pad_right),
                                mode='constant', constant_values=(-1, -1)))
                y.append(label)
        self.X = np.array(X, dtype=int)
        self.y = np.array(y, dtype=int)
        return self.X, self.y

    def get_embedding_vector(self, model, seed=None):
        """
        Yield ``(word, vector)`` for every word counted by ``prepare_data``.

        Words known to ``model`` get the model's vector; all others get a
        random vector drawn uniformly from [-0.25, 0.25).

        Args:
            model: Embedding lookup supporting ``word in model``,
                ``model[word]`` and ``model.vector_size`` (for example
                pretrained word vectors such as Google News).
            seed: Optional seed for the random fallback vectors. When
                ``None`` the global NumPy random state is used.
        """
        sampler = np.random if seed is None else np.random.RandomState(seed)
        for word in self.word_fequency:
            if word in model:
                yield word, model[word]
            else:
                yield word, sampler.uniform(-0.25, 0.25, model.vector_size)
        

In [105]:
# Map each data file in the working directory to its class label
# (1 for the .pos file, 0 for the .neg file). The constructor raises
# IOError if either file is missing.
tr = TextReader(data_dir='./', suffix_labels={'rt-polarity.pos': 1, 'rt-polarity.neg': 0})

In [106]:
# Show the full-path -> label mapping built by the constructor.
tr.data_files

{'./rt-polarity.pos': 1, './rt-polarity.neg': 0}

In [107]:
# Read and clean both files, count the tokens and write ranks.npy.
# No `stopwords` or `max_vocab` is passed, so no stopword list is applied
# and every word is ranked.
tr.prepare_data(clean=True)

True

In [97]:
# Rank-encoded, padded feature rows and their labels; one progress bar is
# shown per class label.
X, y = tr.get_ranked_features()

100%|██████████| 5331/5331 [00:16<00:00, 318.09it/s]
100%|██████████| 5331/5331 [00:16<00:00, 327.16it/s]


In [108]:
# get_embedding_vector is a generator function: nothing is computed here,
# and `wv` can be iterated only once.
wv = tr.get_embedding_vector(model)

In [110]:
# Build the word -> vector table from a fresh generator so that re-running
# this cell always gives the full table. A generator that has already been
# iterated (such as `wv` after one pass) would silently produce an empty dict.
word_vectors = dict(tr.get_embedding_vector(model))

In [114]:
# orient='index' makes each word a row label and each vector component a column.
word_vectors_df = pd.DataFrame.from_dict(word_vectors, orient='index')

In [115]:
# Preview the first rows of the embedding table.
word_vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
the,0.080078,0.10498,0.049805,0.053467,-0.067383,-0.120605,0.035156,-0.118652,0.043945,0.030151,...,-0.071289,-0.030151,-0.013,0.016357,-0.018311,0.014832,0.005005,0.003662,0.047607,-0.068848
rock,0.106445,0.00592,-0.018433,0.186523,-0.06543,-0.115723,-0.009827,-0.392578,0.01001,0.091797,...,0.097168,0.171875,-0.300781,0.014771,-0.120605,-0.008057,0.071289,0.026367,0.081543,0.091797
is,0.00705,-0.073242,0.171875,0.022583,-0.132812,0.198242,0.112793,-0.10791,0.071777,0.020874,...,-0.233398,-0.036377,-0.09375,0.182617,0.0271,0.12793,-0.02478,0.01123,0.164062,0.106934
destined,0.197266,0.462891,0.004974,0.168945,-0.040527,0.053467,0.212891,-0.09668,0.090332,0.3125,...,-0.031738,-0.112305,0.045166,-0.012573,0.291016,-0.078125,-0.103516,-0.206055,0.195312,-0.136719
to,-0.047481,-0.14257,-0.067143,-0.128437,-0.142369,-0.221884,-0.118976,0.207664,0.056873,0.023665,...,0.038359,-0.100123,0.022912,-0.199677,0.00107,-0.07889,-0.024419,0.075134,0.204857,-0.236962


In [116]:
# Write the table to the working directory; index=True keeps the words
# (the row labels) as the first CSV column.
word_vectors_df.to_csv('./word_vectors.csv', index=True)

In [117]:
# Shell escape: list the contents of the working directory.
!ls

CNN_4_RNN.ipynb  ranks.npy  rt-polarity.neg  word_vectors.csv
nohup.out	 README.md  rt-polarity.pos
