# HW02 - Sentiment Analysis

In [1]:
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [2]:
# Base names of the review files (presumably one '<name>.review' file per
# category directory -- confirm against the data folder layout).
files = [
    'positive',
    'negative',
    'unlabeled',
]
# Product categories (presumably the sub-directory names under ./data/SA).
categories = [
    'books',
    'dvd',
    'kitchen',
    'electronics',
]

In [23]:
# Parse the positive book reviews into per-document term-frequency dicts.
words = []  # every feature token seen, in file order (duplicates kept)
freq = []   # not filled in this cell; kept in case other cells reference it
docs = []   # one {token: count} dict per review line
with open('./data/SA/books/positive.review') as file:
    for line in file:
        if not line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no review.
            continue
        # Keep only the feature part that precedes the '#label#:' marker.
        features = line.rstrip('\n').partition('#label#:')[0]
        d = {}
        # split() with no argument ignores the space in front of the marker
        # and any repeated spaces, so no real feature is dropped or mis-parsed.
        for term in features.split():
            # Split on the LAST colon so tokens that themselves contain ':'
            # keep their full text; a term without a colon raises ValueError.
            token, count = term.rsplit(':', 1)
            words.append(token)
            d[token] = int(count)
        docs.append(d)

In [15]:
def process(p, tokenizer, text):
    """ Normalizes a raw text and splits it into stemmed tokens.

    The text is lower-cased, stripped of stopwords, stemmed and finally
    tokenized, in that order.

    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer object.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer object.
        text (str): text to preprocess.

    Returns:
        list: tokens of the preprocessed text.
    """
    # Stopwords are removed on the lower-cased text, before stemming.
    without_stopwords = remove_stopwords(text.lower())
    stemmed = p.stem_sentence(without_stopwords)
    return tokenizer.tokenize(stemmed)

def bow_model(param, boolean, dictionary, p, tokenizer):
    """ Builds the dense bag-of-words vector of a single document.

    Args:
        param: object exposing the document under a ``text`` attribute
            (a DataFrame row when called through ``DataFrame.apply``).
        boolean (str): 'bool' yields a presence/absence vector; any other
            value yields a vector of term counts.
        dictionary: vocabulary providing ``doc2bow`` and ``len``.
        p: stemmer forwarded to ``process``.
        tokenizer: tokenizer forwarded to ``process``.

    Returns:
        numpy.ndarray: vector with one entry per dictionary term, of dtype
            bool in 'bool' mode and float16 otherwise.
    """
    presence_only = boolean == 'bool'
    tokens = process(p, tokenizer, param.text)
    term_counts = dictionary.doc2bow(tokens)

    vector = np.zeros(
        len(dictionary),
        dtype=np.bool_ if presence_only else np.float16,
    )
    # doc2bow lists only the terms present in the document; all other
    # positions keep their zero / False initial value.
    for term_id, count in term_counts:
        if presence_only:
            vector[term_id] = True
        else:
            vector[term_id] = float(count)
    return np.array(vector)

def process_df(df, boolean):
    """ Adds a bag-of-words column to a DataFrame of documents.

    A vocabulary is built from the preprocessed ``text`` column, then each
    row is turned into a dense vector through ``bow_model``.

    Args:
        df (pandas.DataFrame): frame with a ``text`` column; modified in place.
        boolean (str): 'bow' adds a 'bow' column of term counts, 'bool-bow'
            adds a 'bool-bow' column of presence flags. Any other value
            leaves the frame without a new column.

    Returns:
        pandas.DataFrame: the same ``df`` object that was passed in.
    """
    stemmer = PorterStemmer()
    word_tokenizer = nltk.RegexpTokenizer(r'\w+')

    # The vocabulary is built over the whole corpus before any row is vectorized.
    tokenized_docs = [process(stemmer, word_tokenizer, text) for text in df['text']]
    vocabulary = corpora.Dictionary(tokenized_docs)

    if boolean == 'bow':
        column, mode = 'bow', 'not-bool'
    elif boolean == 'bool-bow':
        column, mode = 'bool-bow', 'bool'
    else:
        # Unknown mode: nothing to add.
        return df

    df[column] = df.apply(
        bow_model,
        axis=1,
        args=[mode, vocabulary, stemmer, word_tokenizer],
    )
    return df