# HW02 - Sentiment Analysis

In [1]:
from bs4 import BeautifulSoup
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np

In [2]:
# Category names; each one is used as a sub-directory of ./data/SA/.
categories = [
    'books',
    'dvd',
    'electronics',
    'kitchen',
]
# Review files read for every category. create_category_dataset indexes the
# loaded frames by position, so the two labeled sets must stay first and the
# unlabeled set last.
files = [
    'positive',
    'negative',
    'unlabeled',
]

In [3]:
def _load_reviews(base_dir, file):
    """ Parses one ``<file>.review`` file into a DataFrame.

    Args:
        base_dir (str): directory of the category, with trailing slash.
        file (str): file name without the ``.review`` extension.

    Returns:
        pandas.core.frame.DataFrame: column ``text`` with one row per
            ``review_text`` element; for the 'positive' and 'negative' files
            also a ``tag`` column (1.0 for positive, 0.0 for negative).
    """
    with open(base_dir + file + '.review', encoding='ISO-8859-1') as fp:
        soup = BeautifulSoup(fp, "html.parser")
    text = [review.text.strip('\n') for review in soup.find_all("review_text")]
    # The dict is built once per file, outside any per-review loop, so a file
    # without reviews yields an empty frame instead of leaving the data
    # undefined or stale.
    d = {'text': text}
    if file in ['positive', 'negative']:
        d['tag'] = [float(1) if file == 'positive' else float(0)] * len(text)
    return pd.DataFrame(data=d)


def create_category_dataset(category):
    """ Creates and saves the dataset of the specified category.

    Reads every file listed in the module-level ``files`` from
    ``./data/SA/<category>/``, builds a gensim dictionary from the labeled
    reviews and adds bag-of-words columns to the labeled and the unlabeled
    reviews. Both frames are then pickled into the category directory.

    Args:
        category (str): category from which to build the dataset.

    Returns:
        (list): two pandas.core.frame.DataFrame, ``[labeled, unlabeled]``.
                The labeled frame has the columns text, tag, bow and bool-bow;
                the unlabeled frame has text, bow and bool-bow.
    """
    base_dir = './data/SA/' + category + '/'
    # Positional order matters: files[0] and files[1] are the labeled sets,
    # files[2] is the unlabeled one.
    dfs = [_load_reviews(base_dir, file) for file in files]
    df = pd.concat([dfs[0], dfs[1]], axis=0)
    df_unlabeled = dfs[2]

    p = PorterStemmer()
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # The vocabulary comes from the labeled reviews only; the unlabeled
    # reviews are encoded against that same dictionary.
    dictionary = corpora.Dictionary(
        [process(p, tokenizer, doc) for doc in df['text']])

    for frame in (df, df_unlabeled):
        frame['bow'] = frame.apply(
            bow_model, axis=1, args=('not-bool', dictionary, p, tokenizer))
        frame['bool-bow'] = frame.apply(
            bow_model, axis=1, args=('bool', dictionary, p, tokenizer))

    # NOTE: the files are pickles despite the '.csv' extension; the paths are
    # kept unchanged so existing readers keep working.
    df.to_pickle(base_dir + 'labeled.csv')
    # Save the unlabeled frame here (the labeled frame used to be written
    # to this path by mistake).
    df_unlabeled.to_pickle(base_dir + 'unlabeled.csv')
    return [df, df_unlabeled]
        
def process(p, tokenizer, text):
    """ Runs the standard pre-processing pipeline on a text.

    The text is lower-cased, stripped of stopwords with gensim's
    ``remove_stopwords``, stemmed with ``p.stem_sentence`` and finally split
    into tokens with ``tokenizer.tokenize``.

    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer object.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer object.
        text (str): text to preprocess.

    Returns:
        list: tokens of the preprocessed text.
    """
    stemmed = p.stem_sentence(remove_stopwords(text.lower()))
    return tokenizer.tokenize(stemmed)

def bow_model(param, boolean, dictionary, p, tokenizer):
    """ Builds the dense bag-of-words vector of one review.

        Args:
            param (<class 'pandas.core.series.Series'>): row of the DataFrame; only its ``text`` field is read.
            boolean (str): 'bool' for a presence vector (numpy.bool_); any other
                value for a count vector (numpy.float16).
            dictionary (gensim.corpora.Dictionary): vocabulary used for ``doc2bow``;
                its length is the length of the returned vector.
            p (gensim.parsing.porter.PorterStemmer): stemmer object passed to ``process``.
            tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer object passed to ``process``.

        Returns:
        numpy.ndarray: one entry per dictionary id, True / count where the
            token occurs in the text and False / 0 elsewhere.
    """
    as_bool = boolean == 'bool'
    tokens = process(p, tokenizer, param.text)
    vector = np.zeros(len(dictionary), dtype=np.bool_ if as_bool else np.float16)
    # doc2bow yields (token_id, count) pairs; scatter them into the dense vector.
    for token_id, count in dictionary.doc2bow(tokens):
        if as_bool:
            vector[token_id] = True
        else:
            vector[token_id] = float(count)
    return np.array(vector)

In [4]:
# Build and pickle the 'books' dataset; rebinds the notebook-level frames.
df, df_unlabeled = create_category_dataset('books')

In [5]:
# Build and pickle the 'electronics' dataset; rebinds the notebook-level frames.
df, df_unlabeled = create_category_dataset('electronics')

In [6]:
# Build and pickle the 'dvd' dataset; rebinds the notebook-level frames.
df, df_unlabeled = create_category_dataset('dvd')

In [7]:
# Build and pickle the 'kitchen' dataset; rebinds the notebook-level frames.
df, df_unlabeled = create_category_dataset('kitchen')