# HW02 - Sentiment Analysis

In [4]:
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, re

In [5]:
# Polarity names; read_file() opens '<name>.review' for each of them.
files = [
    'positive',
    'negative',
]

# Product categories; each is a sub-directory name under ./data/SA/.
categories = [
    'books',
    'dvd',
    'kitchen',
    'electronics',
]

# Characters deleted from the raw review text, wrapped into a regex
# character class so a single re.sub() call can remove all of them.
characters_to_remove = '!()#@~"'
pattern = "".join(["[", characters_to_remove, "]"])

In [6]:
def read_file(category, file):
    """Parse one ``.review`` file into words, per-document counts and labels.

    Every non-empty line is expected to look like
    ``word:count word:count ... #label#:positive`` (or another label).

    Args:
        category: Sub-directory of ``./data/SA/`` to read from.
        file: Base name of the file, without the ``.review`` extension.

    Returns:
        A list ``[words, docs, tags]``:
        ``words`` holds every feature occurrence in file order (duplicates
        kept), ``docs`` holds one ``{word: count}`` dict per line, and
        ``tags`` holds 1 for lines labelled ``positive`` and 0 otherwise.

    Raises:
        ValueError: If a feature token has no ``:`` separator or its count
            is not an integer.
    """
    words = []
    docs = []
    tags = []
    path = './data/SA/' + category + '/' + file + '.review'
    # Use a distinct name for the handle so the ``file`` argument is not
    # shadowed inside the function.
    with open(path, encoding='ISO-8859-1') as handle:
        for line in handle:
            line = line.strip('\n').strip('\x1a')
            if not line:
                # Nothing left after removing the newline / \x1a padding;
                # do not emit an empty document for it.
                continue
            # Split off the label BEFORE cleaning: the module-level
            # ``pattern`` removes '#', which would erase the '#label#:'
            # delimiter and leave every document tagged 0.
            features, _, label = line.partition('#label#:')
            tags.append(1 if label.strip() == 'positive' else 0)
            d = {}
            for term in re.sub(pattern, "", features).split():
                # The count is whatever follows the LAST ':' so that a
                # feature containing ':' keeps its full text.
                word, sep, count = term.rpartition(':')
                if not sep:
                    raise ValueError(
                        'malformed feature token {!r} in {}'.format(term, path))
                words.append(word)
                d[word] = int(count)
            docs.append(d)
    return [words, docs, tags]

def build_dataset(category):
    """Collect the vocabulary and documents of one product category.

    Reads every file named in the module-level ``files`` list plus the
    ``unlabeled`` file through ``read_file``.

    Args:
        category: Category name, forwarded to ``read_file``.

    Returns:
        A list ``[dictionary, documents, tags, unlabeled_docs,
        unlabeled_tags]``. ``dictionary`` is a string array of the unique
        words seen in the ``files`` reviews only (words of the unlabeled
        file are not added); ``documents``/``tags`` are those reviews and
        their labels in reading order; the last two items come from the
        ``unlabeled`` file.
    """
    vocabulary = []
    documents = []
    tags = []
    for polarity in files:
        polarity_words, polarity_docs, polarity_tags = read_file(category, polarity)
        for doc, tag in zip(polarity_docs, polarity_tags):
            documents.append(doc)
            tags.append(tag)
        # De-duplicate per file first so the running list stays small.
        vocabulary.extend(np.unique(polarity_words))
    # The word list of the unlabeled file is intentionally discarded.
    _, unlabeled_docs, unlabeled_tags = read_file(category, 'unlabeled')
    unique_words = np.array(np.unique(vocabulary), dtype='str')
    return [unique_words, documents, tags, unlabeled_docs, unlabeled_tags]

def get_bow_model(param, dictionary, model):
    """Build the bag-of-words vector of one document over ``dictionary``.

    Args:
        param: Object exposing a ``document`` attribute that maps each word
            to its count (the callers in this file pass a DataFrame row).
        dictionary: Vocabulary; compared element-wise against each word, so
            it is expected to be a NumPy array of strings.
        model: ``'bool'`` for a presence/absence vector; any other value
            yields a vector of counts.

    Returns:
        A 1-D array with one slot per dictionary entry: ``np.bool_`` for the
        boolean model, ``np.float16`` otherwise. Words that are not in the
        dictionary are ignored.
    """
    is_bool = model == 'bool'
    vector = np.zeros(len(dictionary), dtype=np.bool_ if is_bool else np.float16)
    for word, count in param.document.items():
        positions = np.where(dictionary == word)[0]
        if len(positions) != 0:
            # Only the first matching slot is written.
            vector[positions[0]] = 1 if is_bool else count
    return vector

def create_dataframe(category):
    """Vectorize one category and pickle its labeled/unlabeled DataFrames.

    Each frame gets a ``document`` and a ``tags`` column plus two
    bag-of-words columns computed with ``get_bow_model`` against the
    category's own dictionary: ``bool-bow`` and ``bow``. The frames are
    written to ``labeled.pkl`` and ``unlabeled.pkl`` inside the category
    directory. Nothing is returned.

    Args:
        category: Category name, forwarded to ``build_dataset`` and used as
            the output sub-directory.
    """
    dictionary, documents, tags, unlabeled_docs, unlabeled_tags = build_dataset(category)
    df_labeled = pd.DataFrame(data={'document': documents, 'tags': tags})
    df_unlabeled = pd.DataFrame(data={'document': unlabeled_docs, 'tags': unlabeled_tags})

    # Vectorize both frames completely before anything is written to disk.
    for frame in (df_labeled, df_unlabeled):
        for column, mode in (('bool-bow', 'bool'), ('bow', 'not-bool')):
            frame[column] = frame.apply(get_bow_model, axis=1, args=[dictionary, mode])

    for frame, filename in ((df_labeled, '/labeled.pkl'), (df_unlabeled, '/unlabeled.pkl')):
        frame.to_pickle('./data/SA/' + category + filename)

In [None]:
# Build and pickle the labeled/unlabeled DataFrames of every category.
for cat in categories:
    create_dataframe(cat)

In [None]:
# Save one vocabulary per category, then merge them into a single
# de-duplicated vocabulary shared by all categories.
for cat in categories:
    [dictionary, documents, tags, unlabeled_docs, unlabeled_tags] = build_dataset(cat)
    np.save('./data/SA/'+cat+'/dictionary.npy', dictionary)

# np.save does not create missing directories; make sure the output folder
# exists so the merge cannot fail after the expensive loop above.
os.makedirs('./data/SA/all', exist_ok=True)

# Load the per-category files by iterating `categories` instead of repeating
# each path by hand, so the merge always covers exactly what was saved.
dictionary = np.concatenate(
    [np.load('./data/SA/' + name + '/dictionary.npy') for name in categories])
dictionary = np.unique(dictionary)
np.save('./data/SA/all/dictionary.npy', dictionary)
print(len(dictionary))

In [7]:
# Re-vectorize the labeled reviews of every category against the merged
# vocabulary and save the per-document vectors, once as counts and once as
# booleans.
dictionary = np.load('./data/SA/all/dictionary.npy')
for cat in tqdm(categories):
    df = pd.read_pickle('./data/SA/' + cat + '/labeled.pkl')
    # Count model first, then boolean model; each pass gets its own message
    # so the progress log shows which of the two has just finished.
    for mode, suffix, label in (('not-bool', '_labeled.npy', 'labeled'),
                                ('bool', '_bool_labeled.npy', 'bool labeled')):
        bow_model = np.array(df.apply(get_bow_model, axis=1, args=[dictionary, mode]))
        np.save('./data/SA/all/' + cat + suffix, bow_model)
        print(str(cat) + ' ' + label + ' finished')

  0%|          | 0/4 [00:00<?, ?it/s]

bookslabeled finished


 25%|██▌       | 1/4 [2:02:21<6:07:05, 7342.00s/it]

bookslabeled finished
dvdlabeled finished


 50%|█████     | 2/4 [4:01:58<4:01:29, 7244.74s/it]

dvdlabeled finished
kitchenlabeled finished


 75%|███████▌  | 3/4 [5:09:20<1:36:22, 5782.30s/it]

kitchenlabeled finished
electronicslabeled finished


100%|██████████| 4/4 [6:28:22<00:00, 5825.57s/it]  

electronicslabeled finished





In [None]:
# Re-vectorize the unlabeled reviews against the merged vocabulary and save
# the per-document vectors, once as counts and once as booleans.
# Uses the `dictionary` variable left in scope by an earlier cell.
# NOTE(review): 'books' is not in this list - presumably it was processed
# separately; confirm before relying on books_unlabeled.npy existing.
for cat in tqdm(['dvd', 'kitchen', 'electronics']):
    df = pd.read_pickle('./data/SA/' + cat + '/unlabeled.pkl')
    # Count model first, then boolean model; each pass gets its own message
    # so the progress log shows which of the two has just finished.
    for mode, suffix, label in (('not-bool', '_unlabeled.npy', 'unlabeled'),
                                ('bool', '_bool_unlabeled.npy', 'bool unlabeled')):
        bow_model = np.array(df.apply(get_bow_model, axis=1, args=[dictionary, mode]))
        np.save('./data/SA/all/' + cat + suffix, bow_model)
        print(str(cat) + ' ' + label + ' finished')

  0%|          | 0/3 [00:00<?, ?it/s]