In [1]:
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, re

In [2]:
files = ['positive', 'negative']
categories = ['books', 'dvd', 'kitchen', 'electronics']
characters_to_remove = '!()#@~"'
pattern = "[" + characters_to_remove + "]"

In [3]:
neg_score = {}
pos_score = {}
with open('./data/EN_Lexicons/SentiWordNet_3.0.0.txt') as file:
    for i, line in enumerate(file):
        if line[0] == '#':
            continue
        else:
            try:
                splitted = line.split('\t')[:5]
                pos = {splitted[-1].split('#')[0]: float(splitted[2])}
                neg = {splitted[-1].split('#')[0]: float(splitted[3])}
                neg_score.update(neg)
                pos_score.update(pos)
            except:
                pass

In [7]:
ANFII = {}
with open('./data/EN_Lexicons/AFINN-111.txt') as file:
    for line in file:
        item = line.strip('\n').split('\t')
        d = {item[0]: int(item[1])}
        ANFII.update(d)

In [40]:
word_stat_pos = []
word_stat_neg = []
neg_words = False
pos_words = False
with open('./data/EN_Lexicons/WordStat Sentiments.txt') as file:
    for i, line in enumerate(file):
        if line.strip('\t').replace(' ', '_').replace('\n', '') == 'NEGATIVE_WORDS':
            neg_words = True
        if neg_words:
            word_stat_neg.append(line.strip('\t').split(' ')[0].replace(' ','_').lower())
        if line.strip('\t').replace(' ', '_').replace('\n', '') == 'POSITIVE_WORDS':
            neg_words = False
            pos_words = True
        if pos_words:
            word_stat_pos.append(line.strip('\t').split(' ')[0].replace(' ','_').lower())

In [44]:
%run ./data/EN_Lexicons/senticnet5.py

In [53]:
def read_file(category, file):
    words = []
    docs = []
    tags = []
    with open('./data/SA/'+category+'/'+file+'.review', encoding='ISO-8859-1') as file:
        for line in file:
            words_and_freq = re.sub(pattern, "", line.strip('\n').strip('\x1a')).split('label:')
            if words_and_freq[-1] == 'positive':
                tag = int(1)
            else:
                tag = int(0)
            tags.append(tag)
            words_and_freq = words_and_freq[0].split(' ')
            d = {}
            for term in words_and_freq[:-1]:
                split = term.split(':')
                words.append(split[0])
                x = {split[0]:int(split[1])}
                d.update(x)
            docs.append(d)
    return [words, docs, tags]

def build_dataset(category):
    dictionary = []
    documents = []
    tags = []
    for file in files:
        [words_temp, docs_temp, tags_temp] = read_file(category, file)
        words_temp = np.unique(words_temp)
        for term in zip(docs_temp, tags_temp):
            documents.append(term[0])
            tags.append(term[1])
        for word in words_temp:
            dictionary.append(word)
    temp, unlabeled_docs, unlabeled_tags = read_file(category, 'unlabeled')
    return [np.array(np.unique(dictionary),dtype='str'), documents, tags, unlabeled_docs, unlabeled_tags]
  
def purge_dataset(dictionary, documents):
    freq = np.zeros(len(dictionary))
    for i, word in enumerate(dictionary):
        for doc in documents:
            try:
                freq[i] = freq[i] + doc[word]
            except:
                pass
    to_remove = []
    for i, occurence in enumerate(zip(dictionary, freq)):
        if occurence[1] == 1:
            to_remove.append(i)
    dictionary = np.delete(dictionary, to_remove)
    return dictionary

def build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, file, boolean):
    if file == 'labeled':
        m = np.zeros((len(dictionary) + 1, len(documents)), dtype = np.bool_ if boolean == 'bool' else np.int8)
        for i, document in enumerate(documents):
            for key in document.keys():
                index = np.where(dictionary == key)[0]
                if len(index) != 0:
                    m[index[0],i] = True if boolean == 'bool' else document[key]
            m[-1, i] = tags[i]
    else:
        m = np.zeros((len(dictionary) + 1, len(unlabeled_docs)), dtype = np.bool_ if boolean == 'bool' else np.int8)
        for i, document in enumerate(unlabeled_docs):
            for key in document.keys():
                index = np.where(dictionary == key)[0]
                if len(index) != 0:
                    m[index[0],i] = True if boolean == 'bool' else document[key]
            m[-1, i] = unlabeled_tags[i]
    return m

def set_values(pos_count, neg_count, polarity_last, pos_sum, neg_sum, ANFII_pos_count, ANFII_neg_count, 
                      ANFII_pos_score, ANFII_neg_score, stat_pos, stat_neg, polarity_senticnet, file):
    if neg_count == 0:
        div = 0
    else:
        div = pos_count/neg_count
    if file == 'positive':
        tag = 1
    else:
        tag = 0
    return np.array([pos_count, neg_count, polarity_last, pos_sum, neg_sum, ANFII_pos_count, ANFII_neg_count, 
                      ANFII_pos_score, ANFII_neg_score, stat_pos, stat_neg, polarity_senticnet, tag])

def create_lexicon_vector(doc, file):
    neg_count = pos_count = neg_sum = pos_sum = 0
    ANFII_pos_count = ANFII_neg_count = ANFII_pos_score = ANFII_neg_score = 0
    stat_pos = stat_neg = polarity_senticnet = 0
    last_word = True; word_in_lex = False
    words = list(doc.keys())
    polarity_last = 0
    for word in reversed(words):
        list_words = word.split('_')
        for list_word in list_words:
            if list_word in neg_score.keys():
                if neg_score[list_word] > 0:
                    neg_count += 1
                    neg_sum += neg_score[list_word]
                    word_in_lex = True
                if pos_score[list_word] > 0:
                    pos_count += 1
                    pos_sum += pos_score[list_word]
                    word_in_lex = True
                if word_in_lex and last_word:
                    polarity_last = (neg_score[list_word]*-1) + (pos_score[list_word])
                    last_word = False
            if list_word in ANFII.keys():
                if ANFII[list_word] > 0:
                    ANFII_pos_count += 1
                    ANFII_pos_score += ANFII[list_word]
                else:
                    ANFII_neg_count += 1
                    ANFII_neg_score += ANFII[list_word]
            if list_word in word_stat_pos:
                stat_pos += 1
            elif list_word in word_stat_neg:
                stat_neg += 1
            if list_word in list(senticnet.keys()):
                polarity_senticnet += float(senticnet[list_word][7])
    return set_values(pos_count, neg_count, polarity_last, pos_sum, neg_sum, ANFII_pos_count, ANFII_neg_count, 
                      ANFII_pos_score, ANFII_neg_score, stat_pos, stat_neg, polarity_senticnet, file)

In [4]:
for cat in tqdm(categories):
    [dictionary, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset(cat)
    dictionary = purge_dataset(dictionary, documents)
    np.save('./data/SA/'+cat+'/dictionary.npy', dictionary)
    ans = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'bool')
    np.save('./data/SA/'+cat+'/bool-labeled.npy',ans)
    ans = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'not-bool')
    np.save('./data/SA/'+cat+'/unlabeled.npy',ans)
    ans = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'not-bool')
    np.save('./data/SA/'+cat+'/labeled.npy',ans)
    ans = build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'bool')
    np.save('./data/SA/'+cat+'/bool-unlabeled.npy',ans)

100%|██████████| 4/4 [54:30<00:00, 817.72s/it]   


In [4]:
dictionary = np.concatenate((np.load('./data/SA/books/dictionary.npy'),
                            np.load('./data/SA/dvd/dictionary.npy'),
                            np.load('./data/SA/electronics/dictionary.npy'),
                            np.load('./data/SA/kitchen/dictionary.npy')), axis = 0)
dictionary = np.array(np.unique(dictionary),dtype='str')
print(dictionary.shape)

(100801,)


In [5]:
[dictionary1, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset('books')
np.save('./data/SA/all/bool-labeled.npy', build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'bool'))
np.save('./data/SA/all/bool-unlabeled.npy', build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'bool'))
np.save('./data/SA/all/labeled.npy', build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'not-bool'))
np.save('./data/SA/all/unlabeled.npy', build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'not-bool'))

In [6]:
[dictionary1, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset('dvd')
m = np.load('./data/SA/all/bool-labeled.npy')
np.save('./data/SA/all/bool-labeled.npy', 
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/bool-unlabeled.npy')
np.save('./data/SA/all/bool-unlabeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/labeled.npy')
np.save('./data/SA/all/labeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'not-bool')), axis = 1))
m = np.load('./data/SA/all/unlabeled.npy')
np.save('./data/SA/all/unlabeled.npy', 
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'not-bool')), axis = 1))

In [7]:
[dictionary1, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset('electronics')
m = np.load('./data/SA/all/bool-labeled.npy')
np.save('./data/SA/all/bool-labeled.npy', 
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/bool-unlabeled.npy')
np.save('./data/SA/all/bool-unlabeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/labeled.npy')
np.save('./data/SA/all/labeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'not-bool')), axis = 1))
m = np.load('./data/SA/all/unlabeled.npy')
np.save('./data/SA/all/unlabeled.npy', 
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'not-bool')), axis = 1))

In [14]:
[dictionary1, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset('kitchen')
m = np.load('./data/SA/all/bool-labeled.npy')
np.save('./data/SA/all/bool-labeled.npy', 
       np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/bool-unlabeled.npy')
np.save('./data/SA/all/bool-unlabeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'bool')), axis = 1))
m = np.load('./data/SA/all/labeled.npy')
np.save('./data/SA/all/labeled.npy',
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'labeled', 'not-bool')), axis = 1))
m = np.load('./data/SA/all/unlabeled.npy')
np.save('./data/SA/all/unlabeled.npy', 
        np.concatenate((m, build_matrix(dictionary, documents, tags, unlabeled_docs, unlabeled_tags, 'unlabeled', 'not-bool')), axis = 1))

In [16]:
np.save('./data/SA/all/dictionary.npy', dictionary)

In [56]:
for cat in tqdm(['books', 'dvd', 'electronics', 'kitchen']):
    m = np.zeros((1,13))
    for file in ['positive', 'negative']:
        words, docs, tags = read_file('books', file)
        for doc in docs:
            n = np.reshape(create_lexicon_vector(doc, file), (1,13))
            m = np.append(m, n, axis=0)
    m[~np.all(m == 0, axis=1)]
    np.save('./data/SA/'+cat+'/lexicon-labeled.npy', np.transpose(m[1:,:]))
    words, docs, tags = read_file('books', 'unlabeled')
    m = np.zeros((1,13))
    for doc in docs:
        n = np.reshape(create_lexicon_vector(doc, file), (1,13))
        m = np.append(m, n, axis=0)
    m[~np.all(m == 0, axis=1)]
    np.save('./data/SA/'+cat+'/lexicon-unlabeled.npy', np.transpose(m[1:,:]))

100%|██████████| 4/4 [10:50:08<00:00, 9752.08s/it]  


In [57]:
np.save('./data/SA/all/lexicon-labeled.npy', np.concatenate((np.load('./data/SA/books/lexicon-labeled.npy'),
                                                             np.load('./data/SA/dvd/lexicon-labeled.npy'),
                                                             np.load('./data/SA/electronics/lexicon-labeled.npy'),
                                                             np.load('./data/SA/kitchen/lexicon-labeled.npy')), 
                                                             axis=1))

np.save('./data/SA/all/lexicon-unlabeled.npy', np.concatenate((np.load('./data/SA/books/lexicon-unlabeled.npy'),
                                                             np.load('./data/SA/dvd/lexicon-unlabeled.npy'),
                                                             np.load('./data/SA/electronics/lexicon-unlabeled.npy'),
                                                             np.load('./data/SA/kitchen/lexicon-unlabeled.npy')), 
                                                             axis=1))