In [1]:
import pandas as pd

dataset = pd.read_csv(r"movie_reviews.csv.bz2")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null object
dtypes: object(2)
memory usage: 781.3+ KB


In [2]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
import re
contractions_dict = {
    'didn\'t': 'did not',
    'don\'t': 'do not',
    "aren't": "are not",
    "can't": "cannot",
    "cant": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "didnt": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont" : "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i had",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'm": "i am",
    "im": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
    }

contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

def expand_contractions(s, contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)


def pre_process_corpus(docs):
    norm_docs = []
    for doc in tqdm.tqdm(docs):
        doc = strip_html_tags(doc)
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        # lower case and remove special characters\whitespaces
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()  
        norm_docs.append(doc)
  
    return norm_docs

In [3]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
reviews = dataset['review'].values
sentiment = dataset['sentiment'].values

In [5]:
train_Reviews = reviews[:35000]
train_sentiments = sentiment[:35000]

test_reviews = reviews[35000:]
test_sentiments = sentiment[35000:]


In [6]:
%%time 
norm_train_reviews = pre_process_corpus(train_Reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

100%|███████████████████████████████████████████████████████████████████████████| 35000/35000 [00:41<00:00, 840.76it/s]
100%|███████████████████████████████████████████████████████████████████████████| 15000/15000 [00:17<00:00, 861.89it/s]

Wall time: 59.5 s





In [7]:
%%time 
from sklearn.feature_extraction.text import CountVectorizer

#build BOW 
cv = CountVectorizer(binary = False, min_df = 5,max_df = 1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

Wall time: 43.5 s


In [8]:
cv_test_features = cv.transform(norm_test_reviews)

In [9]:
%%time 

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty = 'l2',max_iter = 500,C= 1,solver = 'lbfgs')

lr.fit(cv_train_features,train_sentiments)

Wall time: 1min 20s




In [11]:
lr_predictions = lr.predict(cv_test_features)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(test_sentiments,lr_predictions)

0.9042

In [14]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative','positive']
print(classification_report(test_sentiments,lr_predictions))

              precision    recall  f1-score   support

    negative       0.90      0.90      0.90      7490
    positive       0.90      0.90      0.90      7510

   micro avg       0.90      0.90      0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [16]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentiments,lr_predictions),index = labels,columns = labels)

Unnamed: 0,negative,positive
negative,6769,721
positive,716,6794


In [None]:
tv = tfidfVectorizer(use_idf = True,min_df = 5,max_df = 1.0,ngram_range=(1,2))

In [None]:
tv_train_features = tv.fit_transform(norm_train_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [None]:
lr.fit(tv_train_features,train_sentiments)

In [None]:
lr_predictions = lr.predict(cv_test_features)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative','positive']
print(classification_report(test_sentiments,lr_predictions))

In [None]:
labels = ['negative','positive']
pd.DataFrame(confusion_matrix(test_sentiments,lr_predictions),index = labels,columns = labels)

In [17]:
## Try with random forest