# Input sentence

In [1]:
import re
import pandas as pd
import csv
import spacy

PATH = 'PATH/to/IMDB_Dataset.csv'
df = pd.read_csv(PATH)
df['sentiment'] = df['sentiment'].map(lambda x: 1 if x== 'positive' else 0)

In [3]:
def preprocess(text):
    
    text = text.lower()
    
    #Replace html
    text = re.sub('(<.*?\>)', ' ', text)
    
    #Convert www.* or https?://* to URL
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)

    #Convert @username to AT_USER
    text = re.sub('@[^\s]+','AT_USER',text)

    #Remove additional white spaces
    text = re.sub('[\s]{2,}', ' ', text)

    #Replace #word with word
    text = re.sub(r'#([^\s]+)', r'\1', text)
    
    #trim
    text = text.strip('\'"')
    
    return text

In [4]:
df['review'] = df['review'].map(preprocess)

- tokenize

In [43]:
nlp = spacy.load('en_core_web_sm')
nlp.remove_pipe('ner')

def tokenize(paragraph):
    obj_ = ''
    doc = nlp(paragraph)
    tokens = [tok.text for tok in doc]
    for sent in doc.sents:
        obj_ = ' <sos> ' + ' '.join(tok for tok in tokens[sent.start: sent.end] if tok is not None).lower() + ' <eos>'
        if len(obj_.split()) > 4:
            yield obj_.strip()

In [46]:
sent_res = []  # sentence list
senti_res = [] # sentiment list
for row in df.itertuples():
    # iter original sentence
    rev = row[1]
    senti = row[2]
    for split_sent in tokenize(rev):
        sent_res.append(split_sent)
        senti_res.append(senti)

In [50]:
df_sent = pd.DataFrame({'review': sent_res, 'sentiment': senti_res})
df_sent.sample(20)

Unnamed: 0,review,sentiment
558050,"<sos> oh well .. i guess it 's the whole "" <eos>",0
266170,"<sos> the dialogue was at best mediocre , and ...",0
643529,<sos> only one word can describe mr magoo - sl...,0
173809,<sos> this movie is just filthy ! <eos>,1
186011,"<sos> the japanese may think "" horror shapes ""...",1
202706,<sos> there 's not much to say about him that ...,0
240512,<sos> the submarine used was not varangian ! '...,1
603403,<sos> but i had no idea who was on who 's side...,0
297252,<sos> this film helps you to know through the ...,1
250455,<sos> when me and my gf went to see this film ...,0


In [55]:
df_sent['len'] = df_sent.apply(lambda x: len(x['review'].split()), axis=1)
df_sent.describe()

Unnamed: 0,sentiment,len
count,669880.0,669880.0
mean,0.485406,21.980858
std,0.499787,13.088196
min,0.0,5.0
25%,0.0,13.0
50%,0.0,19.0
75%,1.0,28.0
max,1.0,314.0


In [51]:
fn = 'PATH/TO/SAVE/sentence_dataset.json'
df_sent.to_json(fn, orient='records', lines=True)

- check dataset

In [40]:
sent_res

["<sos> one of the other reviewers has mentioned that after watching just 1 oz episode you 'll be hooked . <eos>",
 '<sos> they are right , as this is exactly what happened with me . <eos>',
 '<sos> the first thing that struck me about oz was its brutality and unflinching scenes of violence , which set in right from the word go . <eos>',
 '<sos> trust me , this is not a show for the faint hearted or timid . <eos>',
 '<sos> this show pulls no punches with regards to drugs , sex or violence . <eos>',
 '<sos> its is hardcore , in the classic use of the word . <eos>',
 '<sos> it is called oz as that is the nickname given to the oswald maximum security state penitentary . <eos>',
 '<sos> it focuses mainly on emerald city , an experimental section of the prison where all the cells have glass fronts and face inwards , so privacy is not high on the agenda . <eos>',
 '<sos> em city is home to many .. aryans , muslims , gangstas , latinos , christians , italians , irish and more .... <eos>',
 '<

In [41]:
senti_res

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [10]:
df['review'].iloc[:4].map(tokenize)

"<sos> one of the other reviewers has mentioned that after watching just 1 oz episode you 'll be hooked . <eos> <sos> they are right , as this is exactly what happened with me . <eos> <sos> the first thing that struck me about oz was its brutality and unflinching scenes of violence , which set in right from the word go . <eos> <sos> trust me , this is not a show for the faint hearted or timid . <eos> <sos> this show pulls no punches with regards to drugs , sex or violence . <eos> <sos> its is hardcore , in the classic use of the word . <eos> <sos> it is called oz as that is the nickname given to the oswald maximum security state penitentary . <eos> <sos> it focuses mainly on emerald city , an experimental section of the prison where all the cells have glass fronts and face inwards , so privacy is not high on the agenda . <eos> <sos> em city is home to many .. aryans , muslims , gangstas , latinos , christians , italians , irish and more .... <eos> <sos> so scuffles , death stares , dod

In [5]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,"bad plot, bad dialogue, bad acting, idiotic di...",0
49997,i am a catholic taught in parochial elementary...,0
49998,i'm going to have to disagree with the previou...,0
