In [42]:
import numpy as np
import os
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from enum import Enum
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
# Get the positive and negative reviews
def get_reviews_in(path):
    review_file_pattern = re.compile('\d+_\d+.txt')
    files = os.listdir(path)
    # get the review file names
    review_files = [file for file in files if review_file_pattern.match(file)] 
    # get the content of each review file
    reviews = []
    for review_file in review_files:
        review_file_path = os.path.join(path, review_file)
        with open(review_file_path, 'r') as content:
            reviews.append(content.read())

    return reviews

# x data
pos_reviews = get_reviews_in('film_reviews/pos')
neg_reviews = get_reviews_in('film_reviews/neg')
reviews = pos_reviews + neg_reviews

# y data
pos_labels = [0] * len(pos_reviews)
neg_labels = [1] * len(neg_reviews)
labels = pos_labels + neg_labels

Data Splits

In [39]:
# split the data into train/dev/test splits
# 80% train and 20% test
x_train, x_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)
# split test into 50% dev and 50% test
x_dev, x_test, y_dev, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [44]:
class NormTech(Enum):
    FREQ_NORM = 1
    TF_IDF = 2
    PPMI = 3

class MyFeatureGenerator:

    def generateFeatures(self, reviews, 
                         is_lemmatisation = True, 
                         is_lowercase = True, 
                         is_stopwords_removed = True,
                         is_punctuation_removed = False, 
                         n_gram_len = 1, 
                         normalisation_technique = NormTech.FREQ_NORM
                         ):
        # tokenise each review: ['this', 'is', 'a', 'review']
        reviews = self.__tokenise(reviews, is_punctuation_removed, is_stopwords_removed, is_lowercase)

        # do lemmatisation or stemming: ['this', 'is', 'a', 'review']
        reviews = self.__lemmatise(reviews) if is_lemmatisation else self.__stem(reviews)

        # generate n_grams: [[('this', 'is', 'a'), ('is', 'a', 'review'), ('a', 'review', None)]]
        reviews = self.__n_gram(reviews, n_gram_len)

        # normalise
        if (normalisation_technique == NormTech.FREQ_NORM):
            reviews = self.__freq_norm(reviews)
        elif (normalisation_technique == NormTech.TF_IDF):
            reviews = self.__tf_idf(reviews)
        elif (normalisation_technique == NormTech.PPMI):
            reviews = self.__ppmi(reviews)

        return reviews

    def __tokenise(self, reviews, is_punctuation_removed, is_stopwords_removed, is_lowercase):
        tokenised_reviews = []
        # tokenise each review using nltk
        remove_punc_tokeniser = nltk.RegexpTokenizer('\w+')
        stop_words = set(stopwords.words('english'))

        for review in reviews:
            # generate tokens with/out punctuation 
            tokens = remove_punc_tokeniser.tokenize(review) if (is_punctuation_removed) else nltk.word_tokenize(review)
            tokenised_review = []
            for token in tokens:
                # ignoring stopwords
                if (is_stopwords_removed and token.lower() in stop_words):
                        continue
                
                # converting to lowercase
                result_token = token.lower() if (is_lowercase) else token
                
                tokenised_review.append(result_token)
            tokenised_reviews.append(tokenised_review)
        
        return tokenised_reviews
                        
    def __lemmatise(self, reviews):
        lemmatised_reviews = []
        # do lemmatisation on tokenised reviews
        lemmatiser = nltk.WordNetLemmatizer()
        for review in reviews:
            # review is ['a', 'list', 'of', 'words']
            lemmatised_reviews.append([lemmatiser.lemmatize(token) for token in review])
        
        return lemmatised_reviews
    
    def __stem(self, reviews):
        stemmed_reviews = []
        # do stemming on tokenised reviews
        stemmer = nltk.PorterStemmer()
        for review in reviews:
            # review is ['a', 'list', 'of', 'words']
            stemmed_reviews.append([stemmer.stem(token) for token in review])
        
        return stemmed_reviews
    
    def __n_gram(self, reviews, n):
        # default to 1 if not valid length
        if (n < 1): n = 1
        # generate n-gram on processed reviews where
        # a review is ['a', 'list', 'of', 'words']
        return [nltk.ngrams(review, n) for review in reviews]
    
    def __freq_norm(self, reviews):
        return reviews
    
    def __tf_idf(self, reviews):
        return reviews
    
    def __ppmi(self, reviews):
        return reviews

feature_generator = MyFeatureGenerator()
feature_generator.generateFeatures(x_train, n_gram_len=3)

[<generator object ngrams at 0x7fe31c5a5dd0>,
 <generator object ngrams at 0x7fe31c5a5f20>,
 <generator object ngrams at 0x7fe31c3c3ac0>,
 <generator object ngrams at 0x7fe31c3c3f90>,
 <generator object ngrams at 0x7fe31c3c33c0>,
 <generator object ngrams at 0x7fe31c3c3eb0>,
 <generator object ngrams at 0x7fe31c3c3ba0>,
 <generator object ngrams at 0x7fe31c3c3430>,
 <generator object ngrams at 0x7fe31c3c3cf0>,
 <generator object ngrams at 0x7fe31c3c36d0>,
 <generator object ngrams at 0x7fe31c3c3c10>,
 <generator object ngrams at 0x7fe31c3c3b30>,
 <generator object ngrams at 0x7fe31c3c3a50>,
 <generator object ngrams at 0x7fe31c3c3900>,
 <generator object ngrams at 0x7fe31c3ba3c0>,
 <generator object ngrams at 0x7fe31c3ba740>,
 <generator object ngrams at 0x7fe31c3ba6d0>,
 <generator object ngrams at 0x7fe31c3ba660>,
 <generator object ngrams at 0x7fe31c3ba2e0>,
 <generator object ngrams at 0x7fe31c3ba200>,
 <generator object ngrams at 0x7fe31c3ba040>,
 <generator object ngrams at 0x7fe

Naive Bayes

In [7]:
""" evaluate at least three feature sets with classifier """

class MyNaiveBayesClassifier:
    # for sentiment analysis there will be 3 classes
    class_count = 0

    def get_prior_probability(self, labels):
        # movie review labels is [0(pos), 1(neg), 2(neu), 0, 2, 0, ...]
        # array of p(class) where index is class
        prior_probs = np.zeros(max(labels) + 1)
        self.class_count = len(prior_probs)
        for label in labels:
            prior_probs[label] += 1/len(labels) #TODO: log(1/len(labels))
        
        return prior_probs