# HW02 - Sentiment Analysis

In [1]:
from bs4 import BeautifulSoup
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [2]:
def create_category_dataset(categories):
    """Build and save the labeled and unlabeled review datasets of a category.

    Parses the ``positive.review``, ``negative.review`` and
    ``unlabeled.review`` files found under ``./data/SA/<category>/`` and
    collects the text of every ``review_text`` element. Reviews from the
    positive file get tag ``1.0``, reviews from the negative file get tag
    ``0.0``; unlabeled reviews carry no tag column.

    Args:
        categories (str): a single category directory name, or ``'all'`` to
            merge the 'books', 'dvd', 'electronics' and 'kitchen' categories.

    Returns:
        None. The data is written to ``./data/SA/<categories>/labeled.csv``
        (columns ``text``, ``tag``) and ``./data/SA/<categories>/unlabeled.csv``
        (column ``text``), replacing any previous file. Rows are indexed
        0..n-1 in the saved files.

    Raises:
        FileNotFoundError: if one of the ``.review`` source files is missing.
    """
    if categories == 'all':
        category = ['books', 'dvd', 'electronics', 'kitchen']
    else:
        category = [categories]

    # Rows are gathered in plain lists and turned into a DataFrame once,
    # instead of growing a DataFrame one row at a time.
    labeled_rows = []
    unlabeled_rows = []
    for file in ('positive', 'negative', 'unlabeled'):
        for category_instance in category:
            source = './data/SA/' + category_instance + '/' + file + '.review'
            with open(source, encoding='ISO-8859-1') as fp:
                soup = BeautifulSoup(fp, "html.parser")
            texts = [review.text.strip('\n')
                     for review in soup.find_all('review_text')]
            if file == 'unlabeled':
                unlabeled_rows.extend({'text': text} for text in texts)
            else:
                tag = 1.0 if file == 'positive' else 0.0
                labeled_rows.extend({'text': text, 'tag': tag} for text in texts)

    out_dir = './data/SA/' + categories
    # The 'all' directory holds no source files, so it may not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    # to_csv overwrites an existing file, so stale output needs no prior
    # removal and a first run (no previous output) works too.
    pd.DataFrame(labeled_rows, columns=['text', 'tag']).to_csv(out_dir + '/labeled.csv')
    pd.DataFrame(unlabeled_rows, columns=['text']).to_csv(out_dir + '/unlabeled.csv')
        
def process(p, tokenizer, text):
    """Lower-case a text, drop its stopwords, stem it and tokenize it.

    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer; its
            ``stem_sentence`` method is applied to the text.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer; its
            ``tokenize`` method produces the final result.
        text (str): text to pre-process.

    Returns:
        list: tokens of the pre-processed text, as returned by
        ``tokenizer.tokenize``.
    """
    # Stopwords are removed after lower-casing and before stemming.
    without_stopwords = remove_stopwords(text.lower())
    stemmed = p.stem_sentence(without_stopwords)
    tokens = tokenizer.tokenize(stemmed)
    return tokens

def bow_model(param, boolean, dictionary, p, tokenizer):
    """Build the dense bag-of-words vector of one document.

    Args:
        param (pandas.core.series.Series): row whose ``text`` attribute holds
            the document to encode.
        boolean (str): ``'bool'`` for a presence vector of ``numpy.bool_``;
            any other value gives a ``numpy.float16`` vector of counts.
        dictionary: object providing ``doc2bow`` and ``len`` (the caller in
            this file passes a ``gensim.corpora.Dictionary``).
        p: stemmer forwarded to ``process``.
        tokenizer: tokenizer forwarded to ``process``.

    Returns:
        numpy.ndarray: vector of length ``len(dictionary)``; positions of the
        ids returned by ``doc2bow`` hold ``True`` or the count, all other
        positions are zero.
    """
    is_boolean = boolean == 'bool'
    tokens = process(p, tokenizer, param.text)
    vector = np.zeros(len(dictionary),
                      dtype=np.bool_ if is_boolean else np.float16)
    # doc2bow yields (token id, count) pairs; scatter them into the vector.
    for token_id, count in dictionary.doc2bow(tokens):
        if is_boolean:
            vector[token_id] = True
        else:
            vector[token_id] = float(count)
    return np.array(vector)

def process_df(path_to_df, output_path=None):
    """Add bag-of-words columns to a saved dataset and pickle the result.

    Reads the CSV at ``path_to_df`` (first column used as index), builds a
    gensim dictionary from the pre-processed ``text`` column and adds two
    columns computed row by row with ``bow_model``: ``bow`` (count vector)
    and ``bool-bow`` (presence vector).

    Args:
        path_to_df (str): path of the CSV file to read.
        output_path (str, optional): where to write the pickled DataFrame.
            Defaults to ``path_to_df``; in that case the CSV file is replaced
            by a pickle and must afterwards be loaded with
            ``pandas.read_pickle`` rather than ``pandas.read_csv``. Pass a
            different path to keep the CSV intact.

    Returns:
        None. The DataFrame is written with ``DataFrame.to_pickle``.
    """
    if output_path is None:
        # Historical behavior: the pickle overwrites the input file.
        output_path = path_to_df
    df = pd.read_csv(path_to_df, index_col=0)
    p = PorterStemmer()
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # The dictionary maps every token of the corpus to an integer id.
    tokenized_docs = [process(p, tokenizer, doc) for doc in df['text']]
    dictionary = corpora.Dictionary(tokenized_docs)
    df['bow'] = df.apply(bow_model, axis=1,
                         args=('not-bool', dictionary, p, tokenizer))
    df['bool-bow'] = df.apply(bow_model, axis=1,
                              args=('bool', dictionary, p, tokenizer))
    df.to_pickle(output_path)

In [None]:
# Rebuild the CSV datasets of each category and of the merged 'all' set.
for cat in tqdm(['books', 'electronics', 'dvd', 'kitchen', 'all']):
    create_category_dataset(cat)

# Paths of the dataset files: labeled then unlabeled, for each category.
path_list = [
    './data/SA/{}/{}.csv'.format(name, kind)
    for name in ('books', 'dvd', 'kitchen', 'electronics', 'all')
    for kind in ('labeled', 'unlabeled')
]

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Add the bag-of-words columns to every dataset. A failure on one file is
# reported and the loop moves on, instead of being silently discarded.
for df_path in tqdm(path_list):
    try:
        process_df(df_path)
    except Exception as exc:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write('process_df failed for {}: {!r}'.format(df_path, exc))

In [None]:
# Preview the first rows of every dataset file. A file that cannot be read
# as CSV is reported and skipped rather than ignored silently.
for df_path in path_list:
    try:
        df = pd.read_csv(df_path, index_col=0)
    except Exception as exc:
        print('Could not read {} as CSV: {!r}'.format(df_path, exc))
    else:
        print(df.head())