In [1]:
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, re

In [2]:
# Characters stripped from every raw review line before it is parsed; they are
# wrapped in [...] so that re.sub treats them as a single character class.
characters_to_remove = '!()#@~"'
pattern = "[{}]".format(characters_to_remove)

# Product categories: each one is a sub-directory of ./data/SA/.
categories = [
    'books',
    'dvd',
    'kitchen',
    'electronics',
]

# Base names of the labelled '<name>.review' files read for every category.
files = [
    'positive',
    'negative',
]

In [3]:
def read_file(category, file):
    """Parse one '.review' file into its terms, per-line term counts and tags.

    Every line is cleaned with the module-level ``pattern`` and is then expected
    to consist of space-separated ``term:count`` tokens followed by a trailing
    ``label:<polarity>`` marker (presumably ``#label#:<polarity>`` in the raw
    file, since '#' is among the removed characters -- confirm against the data).

    Args:
        category: name of the sub-directory of ``./data/SA/`` to read from.
        file: base name of the review file, without the ``.review`` extension.

    Returns:
        A list ``[words, docs, tags]``: ``words`` holds every term encountered
        (repetitions included), ``docs`` holds one ``{term: count}`` dict per
        line, and ``tags`` holds 1 for lines labelled ``positive`` and 0 for
        every other line.

    Raises:
        ValueError / IndexError: if a token is not of the form ``term:<int>``.
    """
    words = []
    docs = []
    tags = []
    path = './data/SA/' + category + '/' + file + '.review'
    # A separate name for the handle keeps the ``file`` argument intact.
    with open(path, encoding='ISO-8859-1') as handle:
        for line in handle:
            cleaned = re.sub(pattern, "", line.strip('\n').strip('\x1a'))
            # Split on the LAST 'label:' only. A feature token such as
            # 'record_label:1' also contains 'label:'; splitting on every
            # occurrence would cut the document short at that token.
            parts = cleaned.rsplit('label:', 1)
            tags.append(1 if parts[-1] == 'positive' else 0)
            # The token after the final space (normally the empty string left
            # in front of the label marker) is dropped.
            tokens = parts[0].split(' ')[:-1]
            counts = {}
            for token in tokens:
                pieces = token.split(':')
                words.append(pieces[0])
                counts[pieces[0]] = int(pieces[1])
            docs.append(counts)
    return [words, docs, tags]

def build_dataset(category):
    """Collect the labelled and unlabelled data of one category.

    Every name in the module-level ``files`` is read through ``read_file``,
    followed by the category's 'unlabeled' file.

    Args:
        category: category name forwarded to ``read_file``.

    Returns:
        A list ``[dictionary, documents, tags, unlabeled_docs, unlabeled_tags]``.
        ``dictionary`` is a NumPy string array with the unique terms of the
        labelled files; ``documents``/``tags`` are the labelled documents and
        their tags in reading order; the last two entries are the documents and
        tags returned for the 'unlabeled' file.
    """
    vocabulary = []
    documents = []
    tags = []
    for name in files:
        file_words, file_docs, file_tags = read_file(category, name)
        # zip keeps each document paired with its own tag.
        for doc, tag in zip(file_docs, file_tags):
            documents.append(doc)
            tags.append(tag)
        vocabulary.extend(np.unique(file_words))
    # The word list of the unlabelled file is not used for the vocabulary.
    _, unlabeled_docs, unlabeled_tags = read_file(category, 'unlabeled')
    unique_terms = np.unique(vocabulary)
    return [np.array(unique_terms, dtype='str'), documents, tags, unlabeled_docs, unlabeled_tags]
  
def purge_dataset(dictionary, documents):
    """Drop every term whose total count over all documents is exactly 1.

    Args:
        dictionary: sequence (typically a NumPy string array) of terms.
        documents: iterable of ``{term: count}`` dicts.

    Returns:
        A NumPy array equal to ``dictionary`` without the terms whose counts
        sum to exactly 1. Terms that never occur (total 0) are kept.
    """
    # One pass over the documents accumulates the totals. This replaces a
    # vocabulary x documents nested loop whose bare ``except`` hid any error,
    # not just the expected missing-key case.
    totals = {}
    for doc in documents:
        for term, count in doc.items():
            totals[term] = totals.get(term, 0) + count
    freq = np.array([totals.get(term, 0) for term in dictionary], dtype=float)
    # flatnonzero always yields integer indices, also when nothing is removed.
    to_remove = np.flatnonzero(freq == 1)
    return np.delete(dictionary, to_remove)

def build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, file, boolean):
    """Build a term-by-document matrix with the document tags as the last row.

    Args:
        dictionary: sequence of terms; row ``i`` of the result belongs to
            ``dictionary[i]``.
        documents, tags: labelled ``{term: count}`` dicts and their tags, used
            when ``file == 'labeled'``.
        unlabeled_docs, unlabeled_tags: documents and tags used for any other
            value of ``file``.
        file: ``'labeled'`` selects the labelled data, anything else the
            unlabelled data.
        boolean: ``'bool'`` yields a ``np.bool_`` presence matrix, anything
            else an ``np.int8`` count matrix.

    Returns:
        An array of shape ``(len(dictionary) + 1, number_of_documents)``. Terms
        that are not in ``dictionary`` are ignored; the last row holds the tags.
        Counts outside the int8 range are saturated to -128/127 instead of
        wrapping around (or raising on recent NumPy versions).
    """
    if file == 'labeled':
        docs, labels = documents, tags
    else:
        docs, labels = unlabeled_docs, unlabeled_tags
    as_bool = boolean == 'bool'

    # Term -> row lookup built once, instead of scanning the whole dictionary
    # for every term of every document. setdefault keeps the FIRST row of a
    # repeated term, which is the row the previous np.where(...)[0][0] picked.
    row_of = {}
    for row, term in enumerate(np.asarray(dictionary).tolist()):
        row_of.setdefault(term, row)

    limits = np.iinfo(np.int8)
    m = np.zeros((len(dictionary) + 1, len(docs)), dtype=np.bool_ if as_bool else np.int8)
    for col, document in enumerate(docs):
        for key, count in document.items():
            row = row_of.get(key)
            if row is None:
                continue
            if as_bool:
                m[row, col] = True
            else:
                m[row, col] = max(limits.min, min(count, limits.max))
        m[-1, col] = labels[col]
    return m

In [4]:
# For every category: build its vocabulary, prune it with purge_dataset, save it,
# then save the four matrices (labelled / unlabelled, boolean / count).
# Each entry is (file suffix, data subset, matrix mode), listed in save order.
matrix_outputs = (
    ('/bool-labeled.npy', 'labeled', 'bool'),
    ('/unlabeled.npy', 'unlabeled', 'not-bool'),
    ('/labeled.npy', 'labeled', 'not-bool'),
    ('/bool-unlabeled.npy', 'unlabeled', 'bool'),
)
for cat in tqdm(categories):
    dictionary, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset(cat)
    dictionary = purge_dataset(dictionary, documents)
    np.save('./data/SA/'+cat+'/dictionary.npy', dictionary)
    for suffix, subset, mode in matrix_outputs:
        ans = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, subset, mode)
        np.save('./data/SA/'+cat+suffix, ans)

100%|██████████| 4/4 [54:30<00:00, 817.72s/it]   


In [4]:
# Merge the saved per-category vocabularies into one de-duplicated vocabulary.
dictionary_paths = (
    './data/SA/books/dictionary.npy',
    './data/SA/dvd/dictionary.npy',
    './data/SA/electronics/dictionary.npy',
    './data/SA/kitchen/dictionary.npy',
)
dictionary = np.concatenate([np.load(path) for path in dictionary_paths], axis = 0)
# np.unique removes the terms shared between categories.
dictionary = np.array(np.unique(dictionary),dtype='str')
print(dictionary.shape)

(100801,)


In [5]:
# Create the combined ("all") matrices from the 'books' category.
# The rows follow the global `dictionary` (which must already be defined), not
# the per-category `dictionary1` returned here.
dictionary1, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset('books')
for path, subset, mode in (
    ('./data/SA/all/bool-labeled.npy', 'labeled', 'bool'),
    ('./data/SA/all/bool-unlabeled.npy', 'unlabeled', 'bool'),
    ('./data/SA/all/labeled.npy', 'labeled', 'not-bool'),
    ('./data/SA/all/unlabeled.npy', 'unlabeled', 'not-bool'),
):
    np.save(path, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, subset, mode))

In [6]:
# Append the 'dvd' documents as extra columns to the combined matrices.
# NOTE: every file is loaded, extended and written back to the same path, so
# running this cell a second time appends the same columns again.
dictionary1, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset('dvd')
for path, subset, mode in (
    ('./data/SA/all/bool-labeled.npy', 'labeled', 'bool'),
    ('./data/SA/all/bool-unlabeled.npy', 'unlabeled', 'bool'),
    ('./data/SA/all/labeled.npy', 'labeled', 'not-bool'),
    ('./data/SA/all/unlabeled.npy', 'unlabeled', 'not-bool'),
):
    m = np.load(path)
    extra = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, subset, mode)
    np.save(path, np.concatenate((m, extra), axis = 1))

In [7]:
# Append the 'electronics' documents as extra columns to the combined matrices.
# NOTE: every file is loaded, extended and written back to the same path, so
# running this cell a second time appends the same columns again.
dictionary1, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset('electronics')
for path, subset, mode in (
    ('./data/SA/all/bool-labeled.npy', 'labeled', 'bool'),
    ('./data/SA/all/bool-unlabeled.npy', 'unlabeled', 'bool'),
    ('./data/SA/all/labeled.npy', 'labeled', 'not-bool'),
    ('./data/SA/all/unlabeled.npy', 'unlabeled', 'not-bool'),
):
    m = np.load(path)
    extra = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, subset, mode)
    np.save(path, np.concatenate((m, extra), axis = 1))

In [14]:
# Append the 'kitchen' documents as extra columns to the combined matrices.
# NOTE: every file is loaded, extended and written back to the same path, so
# running this cell a second time appends the same columns again.
dictionary1, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset('kitchen')
for path, subset, mode in (
    ('./data/SA/all/bool-labeled.npy', 'labeled', 'bool'),
    ('./data/SA/all/bool-unlabeled.npy', 'unlabeled', 'bool'),
    ('./data/SA/all/labeled.npy', 'labeled', 'not-bool'),
    ('./data/SA/all/unlabeled.npy', 'unlabeled', 'not-bool'),
):
    m = np.load(path)
    extra = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, subset, mode)
    np.save(path, np.concatenate((m, extra), axis = 1))

In [16]:
# Persist the vocabulary currently bound to `dictionary` next to the combined matrices.
np.save(file='./data/SA/all/dictionary.npy', arr=dictionary)

In [18]:
# Sanity check: reload each combined matrix and print its shape.
for path in (
    './data/SA/all/bool-labeled.npy',
    './data/SA/all/bool-unlabeled.npy',
    './data/SA/all/labeled.npy',
    './data/SA/all/unlabeled.npy',
):
    m = np.load(path)
    print(m.shape)

(100802, 8000)
(100802, 19677)
(100802, 8000)
(100802, 19677)
