# Sentiment Analysis Movie Reviews
This project analyzes whether a movie review is positive or negative, using IMDB reviews. The goal is to build a model that takes a piece of text and classifies it as negative or positive.

## Import necessary dependencies and settings

In [3]:
import os
#import numpy as np
#import tensorflow as tf

#Using Matplot to draw the Wordcloud
import matplotlib.pyplot as plt

import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_classif

#from tensorflow.python.keras import models
#from tensorflow.python.keras.layers import Dense
#from tensorflow.python.keras.layers import Dropout




## Loading the Data
There are 5 files:
1. Train Texts
2. Train Labels
3. Test Texts
4. Test Labels
5. Dev/Unsupervised

In [4]:
# Root folder of the extracted review files. The loop below reads
# <imdb_path>/<train|test>/<pos|neg>/*.txt, one review per file.
imdb_path = '/Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject/txtImdb'

# Load the dataset
train_texts = []
train_labels = []
test_texts = []
test_labels = []
for dset in ['train', 'test']:
    # Choose the destination lists once per split instead of once per file.
    if dset == 'train':
        texts, labels = train_texts, train_labels
    else:
        texts, labels = test_texts, test_labels
    for cat in ['pos', 'neg']:
        dset_path = os.path.join(imdb_path, dset, cat)
        # Negative reviews are labelled 0, positive reviews 1.
        label = 0 if cat == 'neg' else 1
        # Sorted listing keeps the row order stable across runs.
        for fname in sorted(os.listdir(dset_path)):
            if fname.endswith('.txt'):
                # Decode explicitly so the result does not depend on the
                # platform's default locale encoding.
                # NOTE(review): UTF-8 is assumed for the review files - confirm.
                with open(os.path.join(dset_path, fname), encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)
                    


In [5]:
#Development

# NOTE(review): this is the same 'train' folder whose pos/neg sub-folders are
# read by the loading cell above; only '.txt' files sitting directly inside it
# are collected here. Confirm this is where the dev/unsupervised reviews live.
dev_imdb_path = '/Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject/txtImdb/train'

dev_texts = []
for fname in sorted(os.listdir(dev_imdb_path)):
    if fname.endswith('.txt'):
        # Decode explicitly so the result does not depend on the platform's
        # default locale encoding.
        # NOTE(review): UTF-8 is assumed for these files - confirm.
        with open(os.path.join(dev_imdb_path, fname), encoding='utf-8') as f:
            dev_texts.append(f.read())



In [6]:
# Pair every training review with its 0/1 sentiment label in a DataFrame
# with the columns 'text' and 'label'.
data_train = pd.DataFrame({'text': train_texts, 'label': train_labels})

# Build the matching frame for the test split. `dict_new` is kept as a
# module-level name holding the test columns.
dict_new = dict(text=test_texts, label=test_labels)
data_test = pd.DataFrame(dict_new)


In [7]:
# Strip links ("http" followed by any run of non-space characters) from the
# raw reviews; the result seeds the `clean_text` column of both frames.
url_pattern = re.compile(r'http\S+')

for frame in (data_train, data_test):
    frame['clean_text'] = frame['text'].apply(lambda review: url_pattern.sub('', review))

In [8]:
# Characters deleted from the reviews. Apostrophes, commas and full stops are
# not in this list, so they survive this step.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

# Translation table built once; deleting via str.translate avoids rebuilding
# set(punctuation) for every single character of every review.
punct_table = str.maketrans('', '', punctuation)


def normalize_text(series):
    """Apply the notebook's text normalisation to a Series of strings.

    Steps, in order:
      1. delete every character listed in `punctuation`
      2. convert to lowercase
      3. replace each digit with a space
      4. collapse runs of whitespace into single spaces and trim the ends

    Returns a new Series; the input is not modified.
    """
    return (
        series
        .apply(lambda review: review.translate(punct_table))
        .str.lower()
        # regex=True is explicit: with a literal match (the default in newer
        # pandas releases) the pattern "[0-9]" would never match a digit.
        .str.replace("[0-9]", " ", regex=True)
        .apply(lambda review: ' '.join(review.split()))
    )


data_train['clean_text'] = normalize_text(data_train['clean_text'])
data_test['clean_text'] = normalize_text(data_test['clean_text'])

In [9]:
import string

def count_punct(text):
    """Return the percentage of non-space characters in `text` that are punctuation.

    Punctuation means any character in `string.punctuation`. The ratio is
    rounded to three decimals before being scaled to a percentage.

    Returns 0.0 when `text` is empty or consists only of spaces, instead of
    raising ZeroDivisionError.
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Nothing to measure: avoid dividing by zero.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count/non_space, 3)*100

# Add two descriptive features computed from the raw (uncleaned) review text:
#   len    - number of characters that are not a space
#   punct% - value returned by count_punct for the review
for frame in (data_train, data_test):
    raw_reviews = frame['text']
    frame['len'] = raw_reviews.apply(lambda review: len(review) - review.count(" "))
    frame['punct%'] = raw_reviews.apply(count_punct)


## Tokenizing the clean text

In [10]:
import re

def tokenize(text):
    """Split `text` into tokens on runs of non-word characters.

    Returns a list of non-empty strings; an empty `text` yields an empty list.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    pieces = re.split(r'\W+', text)
    # re.split produces '' when the text is empty or starts/ends with a
    # delimiter; those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

# Replace each cleaned review string with its list of tokens.
# This overwrites `clean_text` in place, so the cell cannot be re-run:
# the column then holds lists, which have no .lower().
for frame in (data_train, data_test):
    frame['clean_text'] = frame['clean_text'].apply(lambda review: tokenize(review.lower()))


## Removing Stop words

In [11]:
# English stop-word list provided by NLTK.
stopword = nltk.corpus.stopwords.words('english')
# Frozen copy used for membership tests: the filter below runs once per token
# of every review, and a set lookup avoids scanning the whole list each time.
stopword_set = frozenset(stopword)
ps = nltk.PorterStemmer()  # NOTE(review): not used in the visible cells


def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not English stop words.

    Token order is preserved; the input list is not modified.
    """
    return [word for word in tokenized_list if word not in stopword_set]


data_train['clean_text'] = data_train['clean_text'].apply(remove_stopwords)
data_test['clean_text'] = data_test['clean_text'].apply(remove_stopwords)

In [12]:
# Display ten randomly drawn training rows to eyeball the cleaning steps.
data_train.sample(n=10)

Unnamed: 0,text,label,clean_text,len,punct%
19933,I have been a huge Lynn Peterson fan ever sinc...,0,"[huge, lynn, peterson, fan, ever, since, break...",660,13.3
1853,Being the prototype of the classical Errol Fly...,1,"[prototype, classical, errol, flynn, adventure...",277,5.1
10777,Was'nt really bad for Raw's first PPV of 006. ...,1,"[nt, really, bad, raw, first, ppv, ending, rea...",2735,5.5
22588,So far Nightmares and Dreamscapes has been err...,0,"[far, nightmares, dreamscapes, erratic, disapp...",1125,4.2
5962,"Most people, when they think of expressionist ...",1,"[people, think, expressionist, cinema, look, b...",2134,5.2
8494,You think you've had it tough? You should chec...,1,"[think, tough, check, film, carl, brashear, ep...",516,4.3
4369,"Franco proves, once again, that he is the prin...",1,"[franco, proves, prince, surreal, erotic, cine...",670,2.5
9266,A linear travel within a non-linear structure....,1,"[linear, travel, within, nonlinear, structure,...",2874,7.5
24144,"i should qualify that title, now that i think ...",0,"[qualify, title, think, checkout, entirely, wo...",1309,3.7
21777,This movie never made it to theaters in our ar...,0,"[movie, never, made, theaters, area, became, a...",620,6.8


In [21]:
# numpy is used below, but its import at the top of the notebook is commented
# out; import it here so the cell runs on a fresh kernel.
import numpy as np
import pandas as pd

# Converting to np.array
train = np.array(data_train)
test = np.array(data_test)
dev = np.array(dev_texts)

# Dump the train/test arrays as CSV into the current working directory.
# Going through np.array drops the column names, so the files carry integer
# column headers.
# NOTE(review): no dev.txt is written here, although the corpus-loading cell
# asks for one - confirm where that file comes from.
pd.DataFrame(train).to_csv("train.txt")
pd.DataFrame(test).to_csv("test.txt")

In [22]:
#Loading the clean data and corpus

from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

# define columns
# Maps a column index to the name handed to load_column_corpus below.
# NOTE(review): train.txt / test.txt are produced earlier with
# DataFrame.to_csv (comma-separated, with an index column and a header row);
# confirm that this layout is what the column loader expects.
columns = {0: 'clean_text', 1: 'label', 2: 'len', 3: 'punct'}

# this is the folder in which train, test and dev files reside
# NOTE(review): the CSV files are written with relative paths, so they only
# end up in this folder if it is the notebook's working directory; no visible
# cell writes dev.txt - confirm both.
data_folder = '/Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject'
# retrieve corpus using column format, data folder and the names of the train, dev and test files
corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(data_folder, columns,
                                                              train_file='train.txt',
                                                              test_file='test.txt',
                                                              dev_file='dev.txt')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
2019-04-17 14:35:05,315 Reading data from /Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject
2019-04-17 14:35:05,316 Train: /Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject/train.txt
2019-04-17 14:35:05,316 Dev: /Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject/dev.txt
2019-04-17 14:35:05,317 Test: /Users/jairomelo/Desktop/ML/YORK/ML1010/FinalProject/test.txt


In [23]:
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
from typing import List

# Step 2: name of the tag layer the tagger is asked to predict.
# NOTE(review): the corpus column mapping names its columns 'clean_text',
# 'label', 'len' and 'punct' - none of them is 'ner'; confirm this tag type.
tag_type = 'ner'

# Step 3: build the tag dictionary for that tag type from the corpus.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# Step 4: GloVe word embeddings are the only entry in the embedding stack.
embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

In [None]:
# Imports needed by this cell.
from pathlib import Path

from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import EvaluationMetric

# Output folder and settings shared by both training calls below.
model_dir = 'resources/taggers/example-ner'
train_options = dict(learning_rate=0.1,
                     mini_batch_size=32,
                     max_epochs=150,
                     checkpoint=True)

# Step 5: sequence tagger on top of the stacked embeddings.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

# Steps 6-7: create the trainer and start training.
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train(model_dir, EvaluationMetric.MICRO_F1_SCORE, **train_options)

# Step 8: training can be stopped at any point.

# Step 9: rebuild the trainer from the saved checkpoint and train again with
# the same settings.
trainer = ModelTrainer.load_from_checkpoint(
    Path('resources/taggers/example-ner/checkpoint.pt'), 'SequenceTagger', corpus
)
trainer.train(model_dir, EvaluationMetric.MICRO_F1_SCORE, **train_options)

2019-04-17 14:35:51,980 ----------------------------------------------------------------------------------------------------
2019-04-17 14:35:51,981 Evaluation method: MICRO_F1_SCORE
2019-04-17 14:35:51,983 ----------------------------------------------------------------------------------------------------
2019-04-17 14:39:16,312 epoch 1 - iter 0/1 - loss 16444.18750000
2019-04-17 14:39:16,331 ----------------------------------------------------------------------------------------------------
2019-04-17 14:39:16,332 EPOCH 1 done: loss 16444.1875 - lr 0.1000 - bad epochs 0
2019-04-17 14:39:26,277 DEV  : loss 4853.42578125 - f-score 0.0000 - acc 0.0000
2019-04-17 14:39:35,853 TEST : loss 4942.63085938 - f-score 0.0000 - acc 0.0000
2019-04-17 14:39:40,303 ----------------------------------------------------------------------------------------------------
2019-04-17 14:43:04,148 epoch 2 - iter 0/1 - loss 4786.60937500
2019-04-17 14:43:04,165 ------------------------------------------------

2019-04-17 15:28:28,625 EPOCH 15 done: loss -42.6562 - lr 0.0500 - bad epochs 0
2019-04-17 15:28:37,709 DEV  : loss -0.42187500 - f-score 0.0000 - acc 0.0000
2019-04-17 15:28:47,102 TEST : loss 8.98437500 - f-score 0.0000 - acc 0.0000
2019-04-17 15:28:51,375 ----------------------------------------------------------------------------------------------------
2019-04-17 15:31:52,043 epoch 16 - iter 0/1 - loss 36.12500000
2019-04-17 15:31:52,059 ----------------------------------------------------------------------------------------------------
2019-04-17 15:31:52,059 EPOCH 16 done: loss 36.1250 - lr 0.0500 - bad epochs 0
2019-04-17 15:32:01,543 DEV  : loss -0.09375000 - f-score 0.0000 - acc 0.0000
2019-04-17 15:32:10,664 TEST : loss 37.59375000 - f-score 0.0000 - acc 0.0000
2019-04-17 15:32:12,828 ----------------------------------------------------------------------------------------------------
2019-04-17 15:35:09,250 epoch 17 - iter 0/1 - loss 25.51562500
2019-04-17 15:35:09,268 -----

In [None]:
# Plotter is not imported in any of the visible cells, so bring it in here.
from flair.visual.training_curves import Plotter

# 10. find learning rate
# Called on the `trainer` instance created in the training cell: invoked on
# the ModelTrainer class itself, the first string would be bound to `self`.
learning_rate_tsv = trainer.find_learning_rate('resources/taggers/example-ner',
                                               'learning_rate.tsv')

# 11. plot the learning rate finder curve
plotter = Plotter()
plotter.plot_learning_rate(learning_rate_tsv)