In [None]:
import glob
import numpy as np
import pandas as pd

def read_df(class_):
    """Load every review file of one polarity class into a Series.

    Parameters
    ----------
    class_ : str
        Name of the sub-directory under
        ``./nonmovies/review_polarity/txt_sentoken/`` to read
        (the notebook calls this with ``'pos'`` and ``'neg'``).

    Returns
    -------
    pandas.Series
        One element per matching ``*.txt`` file, holding that file's full
        text. Empty when nothing matches the pattern.
    """
    pattern = "./nonmovies/review_polarity/txt_sentoken/{}/*.txt".format(class_)
    documents = []
    # glob returns paths in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(glob.glob(pattern)):
        # The context manager closes each handle right away instead of
        # leaving it open until garbage collection.
        with open(filename) as handle:
            documents.append(handle.read())
    return pd.Series(documents)

In [None]:
# Read the positive reviews first, then the negative ones.
pos, neg = read_df('pos'), read_df('neg')

In [None]:
# Stack the positive reviews on top of the negative ones in a single-column frame.
# The per-class index values are kept as they are, so index labels repeat.
texts = pd.DataFrame(
    data=pd.concat(objs=[pos, neg]),
)

In [None]:
# Persist the combined reviews, one review per row, without the index.
# header=None is relied on to suppress the header row; the file is read back
# with header=None further down in this notebook.
texts.to_csv(
    "./nonmovies/review_polarity/whole.csv",
    header=None,
    index=False,
)

# 20 Newsgroups posts (20news-18828)

In [None]:
import glob
import numpy as np
import pandas as pd

def read_df():
    """Load every file two levels below ``./nonmovies/20news-18828`` into a Series.

    Files are opened with ``errors='ignore'``, so bytes that cannot be decoded
    are silently dropped rather than raising.

    Returns
    -------
    pandas.Series
        One element per path matching ``./nonmovies/20news-18828/*/*``,
        holding that file's full text. Empty when nothing matches.
    """
    documents = []
    # glob returns paths in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(glob.glob("./nonmovies/20news-18828/*/*")):
        # The context manager closes each handle right away; the corpus is
        # large enough that leaking handles can exhaust the descriptor limit.
        with open(filename, errors='ignore') as handle:
            documents.append(handle.read())
    return pd.Series(documents)

In [None]:
# Raw posts, one element per file, as returned by the read_df defined in the cell above.
all_ = read_df()

In [None]:
# Flatten each post to a single line: tabs become spaces, empty lines are dropped,
# and lines starting with one of the listed prefixes are discarded (sender/subject
# headers and quoted-reply markers, judging by the prefixes). The surviving lines
# are joined with single spaces.
all_ = all_.apply(
    lambda post: ' '.join(
        line
        for line in post.replace('\t', ' ').split('\n')
        if line and not line.startswith(('From:', 'Subject: ', '>', '|>', 'In article'))
    )
)

In [None]:
# The preprocessing helpers are otherwise only imported in a later cell of this
# notebook; importing them here keeps the cell runnable on a fresh kernel
# (Restart & Run All) instead of failing with a NameError.
from utilities.text_preprocessing import preprocess_sentiment_df
from utilities.text_preprocessing import preprocess_other_dataset

# Wrap the cleaned posts in a one-column frame named 'text', the same column
# name the other datasets in this notebook use before preprocessing.
all_ = pd.DataFrame(all_)
all_.columns = ['text']
news = preprocess_sentiment_df(all_)
# NOTE(review): the meaning of 5044 and the role of the .npy path are defined by
# preprocess_other_dataset, which is not visible here; the same 5044 is
# passed for every dataset in this notebook.
news = preprocess_other_dataset(news, './nonmovies/news_e.npy', 5044)

# Reuters

In [None]:
from bs4 import BeautifulSoup,SoupStrainer

In [None]:

def read_df():
    """Extract the text of every ``text`` element from the Reuters SGML files.

    Each file matching ``./nonmovies/reuters21578/reut2-*.sgm`` is read with
    ``errors='ignore'`` and parsed with BeautifulSoup's ``lxml`` parser. For
    every ``text`` element found, tabs are replaced by spaces and newlines are
    collapsed into single spaces.

    Returns
    -------
    pandas.Series
        One element per extracted ``text`` element, across all files.
    """
    articles = []
    # glob returns paths in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(glob.glob("./nonmovies/reuters21578/reut2-*.sgm")):
        # Close the handle as soon as the content is read.
        with open(filename, errors='ignore') as handle:
            soup = BeautifulSoup(handle.read(), "lxml")
        # Appending to one list is linear; the previous sum(list_of_lists, [])
        # re-copied the accumulated list for every file.
        for node in soup.find_all('text'):
            articles.append(' '.join(node.text.replace('\t', ' ').split('\n')))
    return pd.Series(articles)


In [None]:
# Reuters article texts, one element per extracted element, from the read_df defined just above.
df = read_df()

In [None]:
# The preprocessing helpers are otherwise only imported in a later cell of this
# notebook; importing them here keeps the cell runnable on a fresh kernel
# (Restart & Run All) instead of failing with a NameError.
from utilities.text_preprocessing import preprocess_sentiment_df
from utilities.text_preprocessing import preprocess_other_dataset

# Wrap the article texts in a one-column frame named 'text' before preprocessing.
reuters = pd.DataFrame(df)
reuters.columns = ['text']
reuters = preprocess_sentiment_df(reuters)
# NOTE(review): the meaning of 5044 and the role of the .npy path are defined by
# preprocess_other_dataset, which is not visible here.
reuters = preprocess_other_dataset(reuters, './nonmovies/reuters_e.npy', 5044)

# Customer review data

In [None]:
import glob
import numpy as np
import pandas as pd


def read_df():
    """Read the customer-review files and split each one on the ``[t]`` marker.

    Returns
    -------
    list of list of str
        One inner list per file matching
        ``./nonmovies/customer review data/*.txt``. Each inner list is the
        file's text split on the literal ``'[t]'``, so element 0 is whatever
        precedes the first marker.
    """
    per_file_chunks = []
    # glob returns paths in arbitrary order; sorting makes the result reproducible.
    for filename in sorted(glob.glob("./nonmovies/customer review data/*.txt")):
        # The context manager closes each handle instead of leaking it.
        with open(filename) as handle:
            per_file_chunks.append(handle.read().split('[t]'))
    return per_file_chunks

In [None]:
# One list of '[t]'-separated chunks per customer-review file.
files = read_df()

In [None]:
# Skip element 0 of every split (the text before the first '[t]' marker) and
# flatten the remaining per-file chunks into one Series, one element per review.
reviews = pd.Series(
    np.concatenate([chunks[1:] for chunks in files])
)

In [None]:
# Capture the sentence text that follows each '##' up to the end of its line.
# The result is a DataFrame (capture group in column 0) with one row per match.
# NOTE(review): this rebinds `reviews` from a Series to a DataFrame, so the cell
# cannot be re-run on its own output.
reviews = reviews.str.extractall(pat='##(.*?)\\n')

In [None]:
# Prefix every sentence with a space so that concatenating the sentences of a
# review (done by the string sum in a later cell) keeps them separated.
# NOTE(review): mutates column 0 in place; re-running this cell adds another space.
reviews[0] = ' ' + reviews[0]

In [None]:
# Re-assemble each review from its sentences: after reset_index the first index
# level shows up as column 'level_0', so grouping on it and summing the strings
# concatenates the sentences that share that value. The trailing [0] keeps only
# the concatenated text as a Series.
reviews_processed = (
    reviews
    .reset_index()[['level_0', 0]]
    .groupby('level_0')[0]
    .sum()
    .reset_index()[0]
)

In [None]:
# Persist the re-assembled reviews, one per row. header=None and index=None are
# relied on to suppress the header row and the index column; the file is read
# back with header=None further down in this notebook.
reviews_processed.to_csv(
    "./nonmovies/customer review data/whole.csv",
    index=None,
    header=None,
)

In [None]:
# Legend for the markup used in the customer-review files, shown as this cell's
# output. Written as a raw string so the backslash in `\##` is kept literally
# without triggering an invalid-escape-sequence warning; the resulting text is
# unchanged.
r'''
Symbols used in the annotated reviews:

  [t]: the title of the review: Each [t] tag starts a review.
       We did not use the title information in our papers.
  xxxx[+|-n]: xxxx is a product feature.
      [+n]: Positive opinion, n is the opinion strength: 3 strongest,
            and 1 weakest. Note that the strength is quite subjective.
            You may want ignore it, but only considering + and -
      [-n]: Negative opinion
  \##  : start of each sentence. Each line is a sentence.
  [u] : feature not appeared in the sentence.
  [p] : feature not appeared in the sentence. Pronoun resolution is needed.
  [s] : suggestion or recommendation.
  [cc]: comparison with a competing product from a different brand.
  [cs]: comparison with a competing product from the same brand.
'''


# Further preprocessing

In [None]:
import pandas as pd
from utilities.text_preprocessing import preprocess_sentiment_df
from utilities.text_preprocessing import preprocess_other_dataset

In [None]:
# Reload the two intermediate CSVs written earlier in this notebook. Both were
# saved without a header row, so header=None makes pandas treat the first line
# as data and number the single column 0.
crd = pd.read_csv(
    "./nonmovies/customer review data/whole.csv",
    header=None,
    index_col=None,
)
rp = pd.read_csv(
    "./nonmovies/review_polarity/whole.csv",
    header=None,
    index_col=None,
)

In [None]:
# Name the single column 'text' on both frames before handing them to the helpers.
rp.columns = ['text']
crd.columns = ['text']
# NOTE(review): statement order is left untouched on purpose; the helpers come
# from utilities.text_preprocessing and any side effects they have are not
# visible from this notebook.
crd2 = preprocess_sentiment_df(crd)
rp2 = preprocess_sentiment_df(rp)
# NOTE(review): the meaning of 5044 and the role of the .npy paths are defined by
# preprocess_other_dataset; confirm against its implementation.
crd3 = preprocess_other_dataset(crd2, './nonmovies/crd_e.npy', 5044)
rp3 = preprocess_other_dataset(rp2, './nonmovies/rp_e.npy', 5044)

# Main dataset

In [None]:
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

def read_df(set_, class_):
    """Load every review file of one split and class into a Series.

    Parameters
    ----------
    set_ : str
        Split directory under ``./movies/aclImdb/`` (the notebook uses
        ``'train'`` and ``'test'``).
    class_ : str
        Class directory inside the split (the notebook uses ``'pos'`` and
        ``'neg'``).

    Returns
    -------
    pandas.Series
        One element per matching ``*.txt`` file, holding that file's full
        text. Empty when nothing matches the pattern.
    """
    pattern = "./movies/aclImdb/{}/{}/*.txt".format(set_, class_)
    documents = []
    # glob returns paths in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(glob.glob(pattern)):
        # The context manager closes each handle right away; the original
        # comprehension left every file open until garbage collection.
        with open(filename) as handle:
            documents.append(handle.read())
    return pd.Series(documents)

In [None]:
# Read the four (split, class) groups: positives for both splits first, then negatives.
train_pos, test_pos = read_df('train', 'pos'), read_df('test', 'pos')
train_neg, test_neg = read_df('train', 'neg'), read_df('test', 'neg')
# Stack them as train_pos, train_neg, test_pos, test_neg; the label and split
# columns added later in this notebook are positional and rely on this order.
texts = pd.DataFrame(
    pd.concat([train_pos, train_neg, test_pos, test_neg])
)

In [None]:
# Replace the repeated per-group index labels with a fresh 0..n-1 range and
# give the single column its name.
texts = texts.reset_index(drop=True)
texts.columns = ['text']

In [None]:
# Positional labels for the stacked frame. The slices below assume the row order
# train_pos, train_neg, test_pos, test_neg with 12,500 rows in each group;
# assigning the 50,000-element array raises if the frame has a different length.
# The builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
class_ = np.zeros(50000, dtype=int)
class_[:12500] = 1
class_[25000:37500] = 1
texts['class'] = class_
# Build the split column in one assignment. The previous chained form
# texts['set'][25000:] = 'test' triggers SettingWithCopyWarning and does not
# modify the frame at all once pandas copy-on-write is active.
texts['set'] = ['train'] * 25000 + ['test'] * 25000
texts.to_csv('./movies/whole.csv', index=False)

# Pass texts through embedding

In [None]:
import pandas as pd
from utilities.text_preprocessing import preprocess_sentiment_df
from utilities.text_preprocessing import get_occurences

In [None]:
# Reload the combined movie-review table saved earlier in this notebook
# (columns written there: text, class, set).
texts = pd.read_csv('./movies/whole.csv')

In [None]:
# Run the shared preprocessing helper from utilities.text_preprocessing.
# NOTE(review): this rebinds `texts` to the helper's output; what the helper adds
# or changes is not visible from this notebook.
texts = preprocess_sentiment_df(texts)

In [None]:
# NOTE(review): presumably token occurrence statistics used to build the
# embedding input; confirm against get_occurences in utilities.text_preprocessing.
for_embedding = get_occurences(texts)