# Load dataset

The dataset has IMDB movie reviwes, already divided with 25K reviews for train and test and has the following structure:

* train
    * pos -> the positive classified reviews in the train set
    * neg -> the negative classified reviews in the train set

* test
    * pos -> the positive classified reviews in the test set
    * neg -> the negative classified reviews in the test set
    
So the first step is to import the data

## Load train data

In [65]:
import glob as glob
import io as io
import csv

files = [file for file in glob.glob("./dataset/train/pos/*")]
index = 1

for file_name in files:
    # open file eoth review
    with io.open(file_name, 'rb') as image_file:
        content = image_file.read()
    # open csv file to store data
    with open('train.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        # pos -> 1; neg -> 0
        if index == 1:
            writer.writerow(['review', 'sentiment'])
        writer.writerow([content, 1])
    index = index + 1
    
files = [file for file in glob.glob("./dataset/train/neg/*")]
index = 1

for file_name in files:
    with io.open(file_name, 'rb') as image_file:
        content = image_file.read()
    with open('train.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        # pos -> 1; neg -> 0
        writer.writerow([content, 0])
    index = index + 1
    

## Load test data

In [66]:
import glob as glob
import io as io
import csv

files = [file for file in glob.glob("./dataset/test/pos/*")]
index = 1

for file_name in files:
    with io.open(file_name, 'rb') as image_file:
        content = image_file.read()
    with open('test.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        # pos -> 1; neg -> 0
        if index == 1:
            writer.writerow(['review', 'sentiment'])
        writer.writerow([content, 1])
    index = index + 1
    
files = [file for file in glob.glob("./dataset/test/neg/*")]
index = 1

for file_name in files:
    with io.open(file_name, 'rb') as image_file:
        content = image_file.read()
    with open('test.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        # pos -> 1; neg -> 0
        writer.writerow([content, 0])
    index = index + 1

## Import the data

In [39]:
import pandas as pd

train = pd.read_csv("./dataset/train.csv")
test = pd.read_csv("./dataset/test.csv")

print(train)
print(test)

                                                  review  sentiment
0      b'Bromwell High is a cartoon comedy. It ran at...          1
1      b'Homelessness (or Houselessness as George Car...          1
2      b'Brilliant over-acting by Lesley Ann Warren. ...          1
3      b'This is easily the most underrated film inn ...          1
4      b'This is not the typical Mel Brooks film. It ...          1
...                                                  ...        ...
24995  b"Towards the end of the movie, I felt it was ...          0
24996  b'This is the kind of movie that my enemies co...          0
24997  b"I saw 'Descent' last night at the Stockholm ...          0
24998  b"Some films that you pick up for a pound turn...          0
24999  b"This is one of the dumbest films, I've ever ...          0

[25000 rows x 2 columns]
                                                  review  sentiment
0      b"I went and saw this movie last night after b...          1
1      b'Actor turned 

## Pre-processing

### Initial text pre-processing 

* Remove 'b string and \<br /> tag in each review
* Lowercasing each review
* Remove special characters (", ', \\, (, ))

In [40]:
import re

def parseText(text):
    '''
    text = text.replace("b'", "")
    text = text.replace("b\"", "")
    text = text.replace("'", "")
    text = text.replace("\"", "")
    text = text.replace("\\", "")
    text = text.replace("<br />", "")
    text = text.replace(")", "")
    text = text.replace("(", "")
    text = text.replace("  ", "")
    text = text.lower()
    '''
    
    text = text.replace("b'", "")
    text = text.replace("b\"", "")
    text = text.replace("<br />", "")
    text = text.replace("<br/>", "")
    text = text.replace("<br >", "")
    text = text.replace("<br>", "")
    text = text.replace("'", " ")
    text = text.replace("  ", " ")
    text = re.sub('[^A-Za-z0-9 ]+', '', text).lower()
    return text

In [41]:
for i in range(0, len(train['review']) ):
    train.loc[i, 'review'] = parseText(train.loc[i, 'review'])
    
for i in range(0, len(test['review']) ):
    test.loc[i, 'review'] = parseText(test.loc[i, 'review'])
    
deep_train = train
deep_test = test
print(train)

                                                  review  sentiment
0      bromwell high is a cartoon comedy it ran at th...          1
1      homelessness or houselessness as george carlin...          1
2      brilliant overacting by lesley ann warren best...          1
3      this is easily the most underrated film inn th...          1
4      this is not the typical mel brooks film it was...          1
...                                                  ...        ...
24995  towards the end of the movie i felt it was too...          0
24996  this is the kind of movie that my enemies cont...          0
24997  i saw descent last night at the stockholm film...          0
24998  some films that you pick up for a pound turn o...          0
24999  this is one of the dumbest films i ve ever see...          0

[25000 rows x 2 columns]


## Create values to train and test

In [16]:
# X_train = train.loc[:24999, 'review'].values
# y_train = train.loc[:24999, 'sentiment'].values.astype(int)    

# X_test = test.loc[:24999, 'review'].values
# y_test = test.loc[:24999, 'sentiment'].values.astype(int) 

## Remove stop words

In [42]:
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))

def load_dataset(dataset):
    x_data = dataset['review']   
    y_data = dataset['sentiment']

#     x_data = x_data.replace({'<.*?>': ''}, regex = True)
#     x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])
#     x_data = x_data.apply(lambda review: [w.lower() for w in review])
    return x_data, y_data


X_train, y_train = load_dataset(train)
X_test, y_test = load_dataset(test)



# X_test, y_test = load_dataset(twitter_data)
# X_test, y_test = load_dataset(small_reviews)
# X_test, y_test = load_dataset(small_train)


# print('Reviews')
# print(X_train, '\n')
# print('Sentiment')
# print(y_train)

## Tokenize and Padd

In [43]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import numpy as np

def get_max_length():
    review_length = []
    for review in X_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))



token = Tokenizer(lower=False)
token.fit_on_texts(X_train)
token.fit_on_texts(X_test)
X_train = token.texts_to_sequences(X_train)
X_test = token.texts_to_sequences(X_test)

max_length = get_max_length()

X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', X_train, '\n')
print('Encoded X Test\n', X_test, '\n')
print('Maximum review length: ', max_length)


Encoded X Train
 [[30526   206   905 ...     0     0     0]
 [13250  2581   349 ...  5183   219   280]
 [  382  3574 16181 ...     0     0     0]
 ...
 [ 1307  3051    45 ...    14     1    17]
 [   29  1038  6393 ...     0     0     0]
 [    3  6731    29 ...     0     0     0]] 

Encoded X Test
 [[  283    97     1 ...     0     0     0]
 [   19 11465   414 ...   834    63   187]
 [  357  3411   794 ...   357    20   310]
 ...
 [10556  2454   690 ...    20    12    43]
 [  547    79  3765 ...     0     0     0]
 [ 5342  1487  7453 ...    32 78023   898]] 

Maximum review length:  118


# Classification with Naive Bayes

## Tokenization

In [26]:
from nltk.tokenize import TweetTokenizer

# Instantiate a tweet tokenizer that will preserve each word (or token) as it is
tweet_tokenizer = TweetTokenizer(
    preserve_case = True,
    reduce_len    = False,
    strip_handles = False)

# deep_train
# deep_test

classification_train = deep_train.copy()
classification_test = deep_test.copy()

classification_train['review'] = [tweet_tokenizer.tokenize(n) for n in classification_train['review']]
classification_test['review'] = [tweet_tokenizer.tokenize(n) for n in classification_test['review']]

# Example review
print(classification_train.loc[0])
print(classification_test.loc[0])

review       [bromwell, high, is, a, cartoon, comedy, it, r...
sentiment                                                    1
Name: 0, dtype: object
review       [i, went, and, saw, this, movie, last, night, ...
sentiment                                                    1
Name: 0, dtype: object


# Normalization and Lemmatization

In [27]:
from nltk.tag import pos_tag    # Part-of-speech tagger

classification_train['review'] = [pos_tag(p) for p in classification_train['review']]
classification_test['review'] = [pos_tag(p) for p in classification_test['review']]

# Example review tagged
print(classification_train.loc[0])
print(classification_test.loc[0])


# LEMMANIZATION

from nltk.stem.wordnet import WordNetLemmatizer

# All we need is to know the type (Noun, Verb, or others) of each word
def _tag2type(tag):
    '''
    Take a tag and return a type.
    return 'n' for noun, 'v' for verb, and 'a' for any
    '''
    if tag.startswith('NN'):
        return 'n'
    elif tag.startswith('VB'):
        return 'v'
    else:
        return 'a'

lemmatizer = WordNetLemmatizer()

classification_train['review'] = [[lemmatizer.lemmatize(word, _tag2type(tag)) for (word, tag) in tags] for tags in classification_train['review']]
classification_test['review'] = [[lemmatizer.lemmatize(word, _tag2type(tag)) for (word, tag) in tags] for tags in classification_test['review']]

# Example review
print(classification_train.loc[0])
print(classification_test.loc[0])

review       [(bromwell, RB), (high, JJ), (is, VBZ), (a, DT...
sentiment                                                    1
Name: 0, dtype: object
review       [(i, JJ), (went, VBD), (and, CC), (saw, VBD), ...
sentiment                                                    1
Name: 0, dtype: object
review       [bromwell, high, be, a, cartoon, comedy, it, r...
sentiment                                                    1
Name: 0, dtype: object
review       [i, go, and, saw, this, movie, last, night, af...
sentiment                                                    1
Name: 0, dtype: object


## Noise reduction

In [28]:
# NOISE REDUCTION

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# print
stopwords[:10]

import re
from string import punctuation

def _is_noise(word):
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(@[A-Za-z0-9_]+)'
    return word in punctuation \
        or word.lower() in stopwords \
        or re.search(pattern, word, re.IGNORECASE) != None

classification_train['review'] = [[p.lower() for p in _list if not _is_noise(p)] for _list in classification_train['review']]
classification_test['review'] = [[p.lower() for p in _list if not _is_noise(p)] for _list in classification_test['review']]

# Example review
print(classification_train.loc[0])
print(classification_test.loc[0])

review       [bromwell, high, cartoon, comedy, run, time, p...
sentiment                                                    1
Name: 0, dtype: object
review       [go, saw, movie, last, night, coax, friend, mi...
sentiment                                                    1
Name: 0, dtype: object


In [29]:
reviews_train = classification_train['review']
sentiments_train = classification_train['sentiment']

reviews_test = classification_test['review']
sentiments_test = classification_test['sentiment']

save_train = classification_train.copy()
save_test = classification_test.copy()

## Preparing tran and test datasets

In [30]:
def get_tweets_for_model(tokens_list):
    '''
    Generator function that associates a boolean 'True' to each token in a list of tokens,
    which represents the label of each token.
    This step is required by the NLTK classifier we'll be using:
    - Documentation: https://www.nltk.org/book/ch06.html
    
    @arg tokens_list a 2-D list of (preferably cleaned) tokens
    @return A 2-D list of tuples (original_token, True) containing the unaltered token and a boolean label
    '''
    for tweet_tokens in tokens_list:
        yield dict([token, True] for token in tweet_tokens)

tokens_for_model_train = get_tweets_for_model(reviews_train)
tokens_for_model_test = get_tweets_for_model(reviews_test)

print(tokens_for_model_train)
print(tokens_for_model_test)

<generator object get_tweets_for_model at 0x000001FDCB043E48>
<generator object get_tweets_for_model at 0x000001FDD750D9C8>


In [31]:
# Convert 1/0 to Positive/Negative
def get_sentiment(sentiment):
    if sentiment == 1:
        return "Positive"
    else:
        return "Negative"

# Append tokens_for_model to sentiment value
count = 0
prepared_model_train = []
for value in tokens_for_model_train:
    prepared_model_train.append((value, sentiments_train[count]))
    count = count + 1
    
count = 0
prepared_model_test = []
for value in tokens_for_model_test:
    prepared_model_test.append((value, sentiments_test[count]))
    count = count + 1
    
print(prepared_model_train[0])
print(prepared_model_test[0])

({'bromwell': True, 'high': True, 'cartoon': True, 'comedy': True, 'run': True, 'time': True, 'program': True, 'school': True, 'life': True, 'teacher': True, '35': True, 'year': True, 'teaching': True, 'profession': True, 'lead': True, 'believe': True, 'satire': True, 'much': True, 'close': True, 'reality': True, 'scramble': True, 'survive': True, 'financially': True, 'insightful': True, 'student': True, 'see': True, 'right': True, 'pathetic': True, 'pomp': True, 'pettiness': True, 'whole': True, 'situation': True, 'remind': True, 'knew': True, 'saw': True, 'episode': True, 'repeatedly': True, 'try': True, 'burn': True, 'immediately': True, 'recall': True, 'classic': True, 'line': True, 'inspector': True, 'sack': True, 'one': True, 'welcome': True, 'expect': True, 'many': True, 'adult': True, 'age': True, 'think': True, 'far': True, 'fetch': True, 'pity': True}, 1)
({'go': True, 'saw': True, 'movie': True, 'last': True, 'night': True, 'coax': True, 'friend': True, 'mine': True, 'admit'

## Train and Test

In [38]:
train_data = prepared_model_train
test_data = prepared_model_test

from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_data)

# Training Accuracy
print("Train Accuracy: {}\n".format(classify.accuracy(classifier, train_data)))
# Testing Accuracy
print("Test Accuracy: {}\n".format(classify.accuracy(classifier, test_data)))

Train Accuracy: 0.94712

Test Accuracy: 0.83708



In [33]:
def classify(tweet):
    '''
    Wrapper function for the pre-processing and classification steps previously performed.
    
    @arg tweet: String representing a tweet
    @return String representing a polarity. (Positive or Negative)
    '''
    tokens = tweet_tokenizer.tokenize(tweet)
    tokens = [
        lemmatizer.lemmatize(word, _tag2type(tag)).lower()
        for word, tag in pos_tag(tokens)
        if not _is_noise(word)
    ]
    
    return tokens, classifier.classify(dict([token, True] for token in tokens))

## Manual test

In [36]:
example_tweet = "this movie was a complete waste of time"
tokens, polarity = classify(example_tweet)

print("Denoised tokens: {}\nPolarity: {}\n".format(tokens, polarity))

Denoised tokens: ['movie', 'complete', 'waste', 'time']
Polarity: 0



# Deep learning (LSTM with tensorfow and keras)

* In this fhase, we are going to write a few model arquitectures to decide what is the best aproach

* All the models have at least 3 layers:
    * The first one is always an embedding layer to create the reviews embedding matrix to feed de network
    * The middle layer is an Reccurent layer (LSTM/GRU) with different numbers of nodes for test porpuses (32, 64, etc,...)
    * The last layer is a FC layer with one output node with sigmoidal activation
    
    
* The loss function, the optimizer and the selected metric were the same in all aproaches to compare results

# Aproach 1

* Add the Dropout layer in the middle of the main layers to prevent train data overfit

## Build Model

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout, Bidirectional
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

# model.add(Bidirectional(LSTM(32, dropout=0.2)))

print('Buil Model...')

EMBED_DIM = 32

model_1 = Sequential()
model_1.add(Embedding(input_dim=total_words, output_dim=EMBED_DIM, input_length=max_length))
model_1.add(Dropout(0.2))
model_1.add(GRU(32, dropout=0.2))
model_1.add(Dense(units=1000, activation='relu'))
model_1.add(Dropout(0.2))
model_1.add(Dense(units=1, activation='sigmoid'))
model_1.summary()
model_1.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])



Buil Model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 118, 32)           4263552   
_________________________________________________________________
dropout (Dropout)            (None, 118, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 1000)              33000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1001      
Total params: 4,305,873
Trainable params: 4,305,873
Non-trainable params: 0
________________________________

# Aproach 2

* Simple aproach with GRU layer instead of LSTM

## Build model

In [44]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from keras.layers.embeddings import Embedding

EMBED_DIM = 32
LSTM_OUT = 64

model_2 = Sequential()
model_2.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model_2.add(LSTM(LSTM_OUT, dropout=0.2))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model_2.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 118, 32)           6628864   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 6,653,761
Trainable params: 6,653,761
Non-trainable params: 0
_________________________________________________________________
None


## Aproach 3

* LSTM simple aproach with some changes in the parameters

### Build model

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from keras.layers.embeddings import Embedding

EMBED_DIM = 100

model_3 = Sequential()

model_3.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model_3.add(LSTM(200, dropout=0.2))
model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model_3.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 118, 100)          13323600  
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 201       
Total params: 13,564,601
Trainable params: 13,564,601
Non-trainable params: 0
_________________________________________________________________
None


## Aproach 4

* This aproach adds layers usually used in CNN to discover some relevant features that could make batter classification

### Buil model

In [93]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding

EMBED_DIM = 32

model_4 = Sequential()

model_4.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model_4.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_4.add(MaxPooling1D(pool_size=2))
model_4.add(LSTM(100))
model_4.add(Dense(1, activation='sigmoid'))

model_4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 118, 32)           4263552   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 118, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 59, 32)            0         
_________________________________________________________________
lstm_26 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_52 (Dense)             (None, 1)                 101       
Total params: 4,319,957
Trainable params: 4,319,957
Non-trainable params: 0
_________________________________________________________________
None


## Train and Test one of the models

* The second model (Aproach 2) is the model with the best accurary so we are going to use that one

In [45]:
model = model_2

### Train model

In [46]:
model.fit(X_train, y_train, batch_size = 128, epochs = 5)

# model.save('./')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1fde9c928c8>

In [7]:
from tensorflow import keras

model = keras.models.load_model('./')

### Test model

* We assume for test porposes that if the result of the output layer (between 0 and 1) is grater tha 0,5 than the review as a positive senitment, otherwise negative

In [47]:
# y_pred = model.predict_classes(X_test, batch_size = 128)

def test():
    y_pred = (model.predict(X_test, batch_size = 128) > 0.5).astype("int32")

    true = 0
    for i, y in enumerate(y_test):
        if y == y_pred[i]:
            true += 1

    print('Correct Prediction: {}'.format(true))
    print('Wrong Prediction: {}'.format(len(y_pred) - true))
    print('Accuracy: {}'.format(true/len(y_pred)*100))
    
test()

Correct Prediction: 20827
Wrong Prediction: 4173
Accuracy: 83.308


## Manual test

* Run this so you can write an review an see what is the senitment of that review (IMDB review)

In [169]:
review=input()
data = [[review, 1]]

  
# Create the pandas DataFrame
type_test = pd.DataFrame(data, columns = ['review', 'sentiment'])

X_type_test, y_type_test = load_dataset(type_test)

token = Tokenizer(lower=False)
token.fit_on_texts(X_type_test)
X_type_test = token.texts_to_sequences(X_type_test)
X_type_test = pad_sequences(X_type_test, maxlen=max_length, padding='post')

y_pred = (model.predict(X_type_test, batch_size = 128) > 0.5).astype("int32")


print(model.predict(X_type_test, batch_size = 128))


if y_pred[0][0] == 0:
    print('Negative sentiment')
else:
    print('Positive sentiment')



 one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the worst movies i ever seen in my life one of the w

[[0.4068194]]
Negative sentiment


## Twitter reviews

* The next step is to use our model to classify twitter movie reviews but the model was trained in imdb review (usually bigger, diferent format, ect,...)

1. Create test dataset with only the reviews that have 250 words or less to check the results 

2. To test the model if it gives good results with twiiter reviwes, we teste with nltk library
    * The nltk ussualy used to preprocessing, has a model of twitter reviwes, so we creare a test set with 1000 positive and 1000 negative reviews to evaluate the model

### Create test dataset with only the reviews that have 250 words or less

* To test the model behavior with reviews that look like Twitter reviews

In [None]:
import pandas as pd

def get_small_reviews_dataframe(frame):
    data = {'review': [], 'sentiment': []}

    aux = pd.DataFrame(data)


    for i in range(0, len(frame['review']) ):
        if len(frame.loc[i, 'review']) < 250:
            new_row = {'review': frame.loc[i, 'review'], 'sentiment': int(frame.loc[i, 'sentiment'])}
            aux = aux.append(new_row, ignore_index=True)

    return aux

small_train = small_train.iloc[0:0]
small_test = small_test.iloc[0:0]
small_train = get_small_reviews_dataframe(train)
small_test = get_small_reviews_dataframe(test)

print(small_train)
print(small_test)

test = test.iloc[0:0]
test = pd.concat([small_train, small_test], ignore_index=True)
# test = small_test

print(test)

## Test with Nltk twitter reviews

In [90]:
import nltk
from nltk.corpus import movie_reviews

# nltk.download('movie_reviews')


def create_word_features(words):
    useful_words = [word for word in words if word not in stopwords.words("english")]
    
twitter_data_pos = pd.DataFrame(data, columns = ['review', 'sentiment'])

for fileid in movie_reviews.fileids('pos'):
    words = movie_reviews.words(fileid)
    row = {'review': ' '.join(list(words)), 'sentiment': 1}
    twitter_data_pos = twitter_data_pos.append(row, ignore_index=True)
    
twitter_data_neg = pd.DataFrame(data, columns = ['review', 'sentiment'])

for fileid in movie_reviews.fileids('neg'):
    words = movie_reviews.words(fileid)
    row = {'review': ' '.join(list(words)), 'sentiment': 0}
    twitter_data_neg = twitter_data_neg.append(row, ignore_index=True)

twitter_data = pd.DataFrame(data, columns = ['review', 'sentiment'])

twitter_data = pd.concat([twitter_data_pos, twitter_data_neg], ignore_index=True) 
print(twitter_data)



                                                review  sentiment
0    films adapted from comic books have had plenty...        1.0
1    every now and then a movie comes along from a ...        1.0
2    you ' ve got mail works alot better than it de...        1.0
3    " jaws " is a rare film that grabs your attent...        1.0
4    moviemaking is a lot like being the general ma...        1.0
..                                                 ...        ...
995  wow ! what a movie . it ' s everything a movie...        1.0
996  richard gere can be a commanding actor , but h...        1.0
997  glory -- starring matthew broderick , denzel w...        1.0
998  steven spielberg ' s second epic film on world...        1.0
999  truman ( " true - man " ) burbank is the perfe...        1.0

[1000 rows x 2 columns]
                                                review  sentiment
0    plot : two teen couples go to a church party ,...        0.0
1    the happy bastard ' s quick movie review damn 

### Nltk twitter results

In [None]:
X_test, y_test = load_dataset(twitter_data)

tokenize_padd()

test()

## Now the real twitter api

In [48]:

def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ').lower()
    return text

def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

def strip_all(text):
    #text = text.replace('  ', ' ')
    #text = re.sub('RT[\s]+', '', text) # Removing RT
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    return text


In [49]:
import os
import tweepy as tw
import pandas as pd
import string

def get_twitter_reviews(movie_name):
    consumer_key=''
    consumer_secret=''
    access_token='1166798786-'
    access_token_secret=''


    auth = tw.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tw.API(auth, wait_on_rate_limit=True)

    search_words = "#" + movie_name + " -filter:retweets"
    print(search_words)
    date_since = "2018-01-01"

    tweets = tw.Cursor(api.search,
                  tweet_mode="extended",  
                  q=search_words,
                  lang="en",
                  since=date_since).items(500)

    count = 0
    data = {'review': [], 'sentiment': []}
    twitter_api_data = pd.DataFrame(data, columns = ['review'])


    clear_tweets = []

    for tweet in tweets:
        row = {'review': strip_all(strip_all_entities(strip_links(tweet.full_text))), 'sentiment': 0}
        twitter_api_data = twitter_api_data.append(row, ignore_index=True)
    #     clear_tweets.append([strip_all(strip_all_entities(strip_links(tweet.full_text)))])


    return twitter_api_data


In [50]:
def rank_movie(movie_name):
    print(get_twitter_reviews(movie_name))
    X_twitter_test, y_twitter_test = load_dataset(get_twitter_reviews(movie_name))
    
    rank = 0

    token = Tokenizer(lower=False)
    token.fit_on_texts(X_twitter_test)
    X_twitter_test = token.texts_to_sequences(X_twitter_test)
    X_twitter_test = pad_sequences(X_twitter_test, maxlen=max_length, padding='post')

    y_pred = (model.predict(X_twitter_test, batch_size = 128) > 0.5).astype("int32")

    neg = 0
    pos = 0

    for review in y_pred:
        if review[0] == 0:
            neg = neg + 1
        if review[0] == 1:
            pos = pos + 1

    rank = 10 * pos / (pos+neg)
    print('negative review', neg)
    print('positive review', pos)
    return rank


## Create server to connect with FE

In [None]:
import flask as flask
from flask import Flask, jsonify, json, request
from flask_restful import Api, Resource, reqparse
from flask_cors import CORS
import random
app = Flask(__name__)
CORS(app)
api = Api(app)

class Api(Resource):
    
    @app.route("/movie", methods=["POST"])
    def movie():
        parser = reqparse.RequestParser()
        parser.add_argument("movie")
        params = parser.parse_args()


        print(params['movie'])
        rank = rank_movie(params['movie'].replace(" ", ""))
        resp = flask.Response('{"rank":' + str(rank) + '}')
#         resp = flask.Response(str(params['movie']))
        resp.headers['Content-Type'] = 'text/plain'
        resp.headers['Access-Control-Allow-Origin'] = 'POST'
        return resp

    if __name__ == '__main__':
        from werkzeug.serving import run_simple
        run_simple('localhost', 9000, app)

 * Running on http://localhost:9000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Apr/2021 20:53:39] "[37mOPTIONS /movie HTTP/1.1[0m" 200 -


justice league
#justiceleague -filter:retweets
                                                review  sentiment
0    its all connected how the arrowverse fulfilled...        0.0
1    i finally was able to sit and take a photo of ...        0.0
2    in there i was understood oswald hides his pai...        0.0
3    dc comics has announced justice league infinit...        0.0
4    Not gonna lie It wasn t good On the other hand...        0.0
..                                                 ...        ...
495                 we fight for https t co s5xiagq0m5        0.0
496                     this song goes with everything        0.0
497       wonder woman cosplayer https t co k28bu1dxkg        0.0
498               here are amp spoiler free reviews of        0.0
499  love lego games by while waiting for check out...        0.0

[500 rows x 2 columns]
#justiceleague -filter:retweets


127.0.0.1 - - [26/Apr/2021 20:54:22] "[37mPOST /movie HTTP/1.1[0m" 200 -


negative review 54
positive review 446
