In [1]:
import pandas as pd
import glob

In [2]:
def pandas_loader(type, folder):
    if folder == "pos":
        num = 1
    elif folder == "neg":
        num = 0
    folder_path = type + "/" + folder + "/*"
    file_list = glob.glob(folder_path)
    with open(file_list[0]) as f:
        first_string = f.readlines()
    f.close()
    data = {'Review_Text': first_string, 'isPos': num}
    df = pd.DataFrame(data)

    for i in range(1, len(file_list)):
        with open(file_list[i]) as f:
            temp_string = f.readlines()
        f.close()
        temp_data = {'Review_Text': temp_string, 'isPos': num}
        temp_df = pd.DataFrame(temp_data)
        df = pd.concat([df, temp_df], axis=0)
    
    return df
    
    

In [3]:
train_pos_df = pandas_loader("train", "pos")
train_neg_df = pandas_loader("train", "neg")

test_pos_df = pandas_loader("test", "pos")
test_neg_df = pandas_loader("test", "neg")

In [4]:
train_pos_df

Unnamed: 0,Review_Text,isPos
0,My comment is limited generally to the first s...,1
0,"From the writer of ""What Ever Happened to Baby...",1
0,I was curious to know how critics responded to...,1
0,"I have to agree with MR. Caruso Jr Lanza,s was...",1
0,This movie is about a fictional soap opera. It...,1
...,...,...
0,When I first heard about this series on AnimeT...,1
0,I liked this movie because it basically did mo...,1
0,I first saw this movie in the theater when I w...,1
0,This begins a wager between Edgar Allen Poe an...,1


In [5]:
train_neg_df

Unnamed: 0,Review_Text,isPos
0,"I found this to be an utter waste of time, eff...",0
0,I have been using IMDb for years and I never w...,0
0,This is the kind of movie that wants to be goo...,0
0,Good grief I can't even begin to describe how ...,0
0,i saw this movie last night and even after a c...,0
...,...,...
0,I loved the first season. The quality went dow...,0
0,While the overall idea of Escape from Atlantis...,0
0,"Man, this movie sucked big time! I didn't even...",0
0,"Encompassing virtual reality, the potential of...",0


In [6]:
test_pos_df

Unnamed: 0,Review_Text,isPos
0,"The movie was excellent, save for some of the ...",1
0,Take a look at those faces alongside the entra...,1
0,A wonderful story that should be seen by all f...,1
0,"almost 4 years after the events of 911, if ask...",1
0,"This is a pretty clever, well-acted version of...",1
...,...,...
0,This is one of the best TV movies I have ever ...,1
0,This review is based on the Producer's Cut: <b...,1
0,I wonder why I haven't heard of this movie bef...,1
0,Highlighting the acting of Sidney Poitier and ...,1


In [7]:
test_neg_df

Unnamed: 0,Review_Text,isPos
0,...But it definitely still only deserves 4/10 ...,0
0,"Unlike the other spaghetti Westerns, this one ...",0
0,"Nothing but the void, a pleasant one for those...",0
0,It's difficult to express how bad this movie i...,0
0,The word honor should be erased from the vocab...,0
...,...,...
0,I was a fan of the AMERICAN WEREWOLF IN LONDON...,0
0,"This is absolute drivel, designed to shock and...",0
0,"The MTV sci-fi animated series ""Æon Flux"" is b...",0
0,"Actually, Son of the Mask did make me laugh a ...",0


In [8]:
training_df = pd.concat([train_pos_df, train_neg_df], axis=0)
testing_df = pd.concat([test_pos_df, test_neg_df], axis=0)

In [9]:
training_df = training_df.replace(r'\r+|\n+|\t+',' ', regex=True)
testing_df = testing_df.replace(r'\r+|\n+|\t+',' ', regex=True)

In [10]:
import nltk
from nltk.corpus import stopwords

In [11]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/null/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
stopwords = stopwords.words("english")

In [29]:
def remove_punc(text):
    final = "".join(u for u in text if u not in ("?", ".", ";", ":", "!", '"', ",", "(", ")", "-", "<", "/><br", "br", "/>br", "'", "'s"))
    return final


In [30]:
training_df["Review_Text"] = training_df["Review_Text"].apply(remove_punc)
training_df["Review_Text"] = training_df["Review_Text"].apply(str.lower)
training_df["Review_Text"] = training_df["Review_Text"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
training_df.head()

Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,comment limited generally first season 195960b...,1,comment limit gener first season 195960br />br...
0,writer ever happened baby jane hush hush sweet...,1,writer ever happen babi jane hush hush sweet c...
0,curious know critics responded rousing inspiri...,1,curiou know critic respond rous inspir film we...
0,agree mr caruso jr lanzas finest voice god off...,1,agre mr caruso jr lanza finest voic god offer ...
0,movie fictional soap opera fast funny say anyt...,1,movi fiction soap opera fast funni say anyth e...


In [31]:
testing_df["Review_Text"] = testing_df["Review_Text"].apply(remove_punc)
testing_df["Review_Text"] = testing_df["Review_Text"].apply(str.lower)
testing_df["Review_Text"] = testing_df["Review_Text"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
testing_df.head()

Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,movie excellent save scenes esposito enjoyed b...,1,movi excel save scene esposito enjoy brought t...
0,take look faces alongside entrance jail theyre...,1,take look face alongsid entranc jail they'r fa...
0,wonderful story seen families story acting pro...,1,wonder stori seen famili stori act product val...
0,almost 4 years events 911 asked comes mind day...,1,almost 4 year event 911 ask come mind day peop...
0,pretty clever wellacted version modern 30s wom...,1,pretti clever wellact version modern 30 woman'...


In [32]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [33]:
def stemmed_review(review):
    tokens = review.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [34]:
training_df["Stemmed_Review_Text"] = training_df["Review_Text"].apply(stemmed_review)

In [35]:
testing_df["Stemmed_Review_Text"] = testing_df["Review_Text"].apply(stemmed_review)

In [36]:
training_df.head()

Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,comment limited generally first season 195960b...,1,comment limit gener first season 195960br />br...
0,writer ever happened baby jane hush hush sweet...,1,writer ever happen babi jane hush hush sweet c...
0,curious know critics responded rousing inspiri...,1,curiou know critic respond rous inspir film we...
0,agree mr caruso jr lanzas finest voice god off...,1,agre mr caruso jr lanza finest voic god offer ...
0,movie fictional soap opera fast funny say anyt...,1,movi fiction soap opera fast funni say anyth e...


In [37]:
testing_df.head()

Unnamed: 0,Review_Text,isPos,Stemmed_Review_Text
0,movie excellent save scenes esposito enjoyed b...,1,movi excel save scene esposito enjoy brought t...
0,take look faces alongside entrance jail theyre...,1,take look face alongsid entranc jail theyr fac...
0,wonderful story seen families story acting pro...,1,wonder stori seen famili stori act product val...
0,almost 4 years events 911 asked comes mind day...,1,almost 4 year event 911 ask come mind day peop...
0,pretty clever wellacted version modern 30s wom...,1,pretti clever wellact version modern 30 woman ...


In [38]:
print(training_df["Stemmed_Review_Text"][0])

0    comment limit gener first season 195960br />br...
0    writer ever happen babi jane hush hush sweet c...
0    curiou know critic respond rous inspir film we...
0    agre mr caruso jr lanza finest voic god offer ...
0    movi fiction soap opera fast funni say anyth e...
                           ...                        
0    love first season qualiti went littl bit secon...
0    overal idea escap atlanti intrigu found film f...
0    man movi suck big time even manag see hole thi...
0    encompass virtual realiti potenti comput commu...
0    1973 remak classic 1944 billi wilder film doub...
Name: Stemmed_Review_Text, Length: 25000, dtype: object


In [39]:
print(testing_df["Stemmed_Review_Text"][0])

0    movi excel save scene esposito enjoy brought t...
0    take look face alongsid entranc jail theyr fac...
0    wonder stori seen famili stori act product val...
0    almost 4 year event 911 ask come mind day peop...
0    pretti clever wellact version modern 30 woman ...
                           ...                        
0    fan american werewolf london movi curiou wheth...
0    absolut drivel design shock titil 60 mindset a...
0    mtv scifi anim seri æon flux brought life char...
0    actual son mask make laugh timesmostli due car...
0    complet disagre previou review movi amus momen...
Name: Stemmed_Review_Text, Length: 25000, dtype: object


In [40]:
testing_df.to_csv("testing_corpus.csv", index=False)
training_df.to_csv("training_corpus.csv", index=False)