In [1]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix, f1_score
import fasttext
import datetime
import csv
import nltk

# Reading and Preprocessing the Datasets

## Three datasets: IMDb, Rotten Tomatoes, and a test dataset from Tieto

In [2]:
# Read the IMDb dataset: each line of a file is one review.
# The files are opened in `with` blocks so the handles are closed as soon as
# the reviews have been read.
with open('../data/movie_data/full_train.txt', 'r') as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('../data/movie_data/full_test.txt', 'r') as test_file:
    reviews_test = [line.strip() for line in test_file]

# Positional labels: indices 0..12499 get label 1, indices 12500..24999 get label 0.
# NOTE(review): this presumes each file holds 25000 reviews with the positive
# ones first -- confirm against how the data files were built.
target = [1 if i < 12500 else 0 for i in range(25000)]

In [3]:
# Test dataset from Tieto: one review per file, one sub-directory per class.
# The position of a class name in `classes` is used as its integer label.
path = '../data/movie_review_data'
classes = ['neg', 'pos']
labels    = []
test_data_mine = []
space = ' '
for label_id, class_name in enumerate(classes):
    class_dir = os.path.join(path, class_name)
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of reviews (and labels) identical from run to run.
    for file_name in sorted(os.listdir(class_dir)):
        # Close every file right after reading instead of leaking a handle per review.
        with open(os.path.join(class_dir, file_name)) as review_file:
            comment = review_file.read()
        labels.append(label_id)
        # Flatten each review onto a single line.
        test_data_mine.append(comment.replace('\n', ' '))
len(test_data_mine)

1999

In [4]:
# Rotten Tomato Dataset, only training set has labels
rotten_tomato_train = pd.read_csv('../data/rottenTomatoes/train.tsv', sep = '\t')
print(rotten_tomato_train.head(10))
rotten_tomato_test  = pd.read_csv('../data/rottenTomatoes/test.tsv', sep = '\t')

# Keep only the first row of every SentenceId.
# NOTE(review): in the preview above the first row of a sentence is the full
# sentence and later rows are sub-phrases -- presumably true for all sentences.
rotten_tomato_train = rotten_tomato_train.drop_duplicates(subset = ['SentenceId'], keep='first')
rotten_tomato_test = rotten_tomato_test.drop_duplicates(subset = ['SentenceId'], keep='first')

# Drop rows with Sentiment == 2 and binarise the rest (Sentiment > 2 -> 1, else 0).
# .copy() makes the filtered frame independent, so adding the 'Label' column
# does not write into a slice of the original frame (SettingWithCopyWarning).
rotten_tomato_train = rotten_tomato_train[rotten_tomato_train['Sentiment'] != 2].copy()
rotten_tomato_train['Label'] = (rotten_tomato_train['Sentiment'] > 2).astype(int)

rotten_tomato_train_x = list(rotten_tomato_train['Phrase'])
rotten_tomato_train_y = list(rotten_tomato_train['Label'])

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   
5         6           1  of escapades demonstrating the adage that what...   
6         7           1                                                 of   
7         8           1  escapades demonstrating the adage that what is...   
8         9           1                                          escapades   
9        10           1  demonstrating the adage that what is good for ...   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
5          2  
6          2  
7          2  
8     

In [5]:
# Preprocessing using regular expressions

# Raw strings keep the regex escapes (\. \d \s ...) from being parsed as
# (invalid) Python string escapes, which triggers warnings on current Python.
# The compiled patterns match exactly the same text as before.

# Characters that are deleted outright: . ; : ! ? , " ( ) [ ] and runs of digits.
REPLACE_NO_SPACE = re.compile(r'(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)')
# Sequences that are replaced by one space: a doubled HTML <br /> tag, '-' and '/'.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

# Replace the abbreviation with the complete words
def _replacer(text):
    replacement_patterns = [
        (r'won\'t', 'will not'),
        (r'can\'t', 'cannot'),
        (r'i\'m', 'i am'),
        (r'ain\'t', 'is not'),
        (r'(\w+)\'ll', r'\g<1> will'),
        (r'(\w+)n\'t', r'\g<1> not'),
        (r'(\w+)\'ve', r'\g<1> have'),
        (r'(\w+)\'s', r'\g<1> is'),
        (r'(\w+)\'re', r'\g<1> are'),
        (r'(\w+)\'d', r'\g<1> would')]
    patterns = [(re.compile(regex), repl) for (regex, repl) in replacement_patterns]
    s = text
    for (pattern, repl) in patterns:
        (s, _) = re.subn(pattern, repl, s)
    return s

def preprocess_reviews(reviews):
    """Clean every review and return the cleaned strings as a new list.

    Each review is lower-cased, stripped of the characters matched by
    REPLACE_NO_SPACE, has the matches of REPLACE_WITH_SPACE turned into a
    single space, and finally has its contractions expanded by _replacer.
    """
    def _clean(line):
        without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, line.lower())
        with_spaces = REPLACE_WITH_SPACE.sub(SPACE, without_punctuation)
        return _replacer(with_spaces)

    return [_clean(line) for line in reviews]

In [6]:
# Run the same cleaning over the IMDb train/test reviews and the Rotten Tomatoes phrases.
reviews_train_clean, reviews_test_clean, rotten_tomato_clean = map(
    preprocess_reviews,
    (reviews_train, reviews_test, rotten_tomato_train_x),
)

In [7]:
# nltk is already imported in the first cell; this repeated import is a no-op.
import nltk
# Download the 'punkt' resource (skipped by nltk when it is already up to date).
# NOTE(review): presumably required by nltk.word_tokenize, which is used when
# building the fastText rows below.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/zhangjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
def transform_instance(row, label):
    """Turn one text and its label into a list of fastText tokens.

    The first element is the label prefixed with ``__label__``; the rest
    are the tokens produced by ``nltk.word_tokenize`` on the lower-cased text.
    """
    label_token = "__label__" + str(label)
    word_tokens = nltk.word_tokenize(row.lower())
    return [label_token, *word_tokens]

def preprocess_fasttext(input_file, labels, output_file):
    """Write texts and their labels to ``output_file``, one example per line.

    Each line is the output of ``transform_instance`` for one text: the
    ``__label__<label>`` token followed by the text's tokens, all separated
    by single spaces.

    Args:
        input_file: Sequence of texts (despite the name, not a file object).
        labels: Sequence of labels; ``labels[i]`` belongs to ``input_file[i]``.
        output_file: Path of the file to create or overwrite.

    Raises:
        IndexError: If ``labels`` is shorter than ``input_file``.
    """
    # Lines are joined by hand rather than through csv.writer: CSV quoting
    # would wrap any token containing '"' or a space in extra quote
    # characters, silently changing the tokens written to the file.
    # NOTE(review): UTF-8 is set explicitly on the assumption that fastText
    # expects UTF-8 input -- confirm against the fastText documentation.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for idx, text in enumerate(input_file):
            row_output = transform_instance(text, labels[idx])
            outfile.write(' '.join(row_output) + '\n')

# Hold out 25% of the IMDb training reviews for validation.
# A fixed seed makes the shuffle -- and therefore the exported files and the
# scores computed from them -- reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
    reviews_train_clean, target, train_size = 0.75, random_state = SPLIT_SEED
)

preprocess_fasttext(X_train, y_train, '../data/fasttext/IMDb.train')
preprocess_fasttext(X_val, y_val, '../data/fasttext/IMDb.val')
# NOTE(review): `target` (the positional training labels) is reused for the
# test reviews; this is only correct if the test file has the same size and
# label ordering as the training file -- confirm.
preprocess_fasttext(reviews_test_clean, target, '../data/fasttext/IMDb.test')

In [9]:
# Hyperparameters forwarded as keyword arguments to fasttext.train_supervised.
hyper_params = {"lr": 0.1,
                "epoch": 15,
                "wordNgrams": 3,
                "dim": 256}

print(str(datetime.datetime.now()) + ' START!' )

# Train the model.
model = fasttext.train_supervised(input='../data/fasttext/IMDb.train', **hyper_params)
print("Model trained with the hyperparameter \n {}".format(hyper_params))

# CHECK PERFORMANCE
# Leading space added so the message is no longer glued onto the timestamp.
print(str(datetime.datetime.now()) + ' Training complete.' )

result = model.test('../data/fasttext/IMDb.train')
validation = model.test('../data/fasttext/IMDb.val')
test = model.test('../data/fasttext/IMDb.test')

# DISPLAY ACCURACY OF TRAINED MODEL
# Element [1] of each result is the value reported below.
# NOTE(review): model.test appears to return (n_examples, precision, recall);
# with one label per example precision@1 equals accuracy -- confirm.
test_results = "accuracy:" + str(result[1])  + ",    validation:" + str(validation[1]) + ",    test:" + str(test[1])+ '\n'
print(test_results)

2020-01-27 13:18:23.980914 START!
Model trained with the hyperparameter 
 {'lr': 0.1, 'epoch': 15, 'wordNgrams': 3, 'dim': 256}
2020-01-27 13:18:45.366070Training complete.
accuracy:0.9644266666666667,    validation:0.88528,    test:0.88508



In [10]:
# Test with Tieto Data
# Clean the Tieto reviews with the same pipeline used for the IMDb reviews.
# Note: this rebinds `test`, which previously held the IMDb model.test result.
test = preprocess_reviews(test_data_mine)
# Export the cleaned reviews and their labels in fastText format.
preprocess_fasttext(test, labels, '../data/fasttext/tieto.test')
# Evaluate the IMDb-trained model on the exported file; the returned tuple
# is displayed as the cell output.
model.test('../data/fasttext/tieto.test')


(1999, 0.8664332166083042, 0.8664332166083042)

In [11]:
# Display the first cleaned review of the training split.
X_train[0]

'this has to be the all time best computer animation classic even though most of the animations where experiments they have an artistic quality that has stood the test of time twelve years after it is release i have gone back to watch this video and found some inspiration for new types of computer graphics some of the techniques used in this video have never been full explored'

In [12]:
# Show the fastText-formatted line for the first review of the training split.
# Its label must come from y_train: train_test_split shuffles the data, so
# target[0] (the label of the first *unshuffled* review) is not X_train[0]'s label.
' '.join(transform_instance(X_train[0], y_train[0]))

'__label__1 this has to be the all time best computer animation classic even though most of the animations where experiments they have an artistic quality that has stood the test of time twelve years after it is release i have gone back to watch this video and found some inspiration for new types of computer graphics some of the techniques used in this video have never been full explored'