In [1]:
import os
import re
from sklearn.model_selection import train_test_split
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# my classes
from preprocessor import MyPreprocessor
from feature_generator import NormTech
from feature_generator import MyFeatureGenerator
from naive_bayes import MyNaiveBayesClassifier

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leechilvers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Get the positive and negative reviews
def get_reviews_in(path):
    review_file_pattern = re.compile('\d+_\d+.txt')
    files = os.listdir(path)
    # get the review file names
    review_files = [file for file in files if review_file_pattern.match(file)] 
    # get the content of each review file
    reviews = []
    for review_file in review_files:
        review_file_path = os.path.join(path, review_file)
        with open(review_file_path, 'r') as content:
            reviews.append(content.read())

    return reviews

# x data
pos_reviews = get_reviews_in('film_reviews/pos')
neg_reviews = get_reviews_in('film_reviews/neg')
reviews = pos_reviews + neg_reviews

# y data
pos_labels = [0] * len(pos_reviews)
neg_labels = [1] * len(neg_reviews)
labels = pos_labels + neg_labels

In [3]:
# split the data into train/dev/test splits
# 80% train and 20% test
x_train, x_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)
# split test into 50% dev and 50% test
x_dev, x_test, y_dev, y_test = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

In [4]:
# lemmatisation, lowercase, stopwords removed, punctuation NOT removed, unigrams
preprocessor = MyPreprocessor()
train_revs = preprocessor.preprocess(x_train, n_gram_len=1)
dev_revs = preprocessor.preprocess(x_dev, n_gram_len=1)

# tf-idf
feature_generator = MyFeatureGenerator(preprocessor.vocab)
train_set_1 = feature_generator.generate_features(train_revs, NormTech.TF_IDF)
dev_set_1 = feature_generator.generate_features(dev_revs, NormTech.TF_IDF)

In [None]:
print(train_set_1.shape)
print(dev_set_1.shape)

In [None]:
# lemmatisation, lowercase, stopwords removed, punctuation NOT removed, bigrams
preprocessor = MyPreprocessor()
train_revs = preprocessor.preprocess(x_train, n_gram_len=2)
dev_revs = preprocessor.preprocess(x_dev, n_gram_len=2)

# tf-idf
feature_generator = MyFeatureGenerator(preprocessor.vocab)
train_set_2 = feature_generator.generate_features(train_revs, NormTech.TF_IDF)
dev_set_2 = feature_generator.generate_features(dev_revs, NormTech.TF_IDF)

In [5]:
clf = MultinomialNB()
clf.fit(train_set_1, y_train)
predictions = clf.predict(dev_set_1)
accuracy_score(y_dev, predictions)

0.63

In [11]:
my_clf = MyClassifier()
my_clf.train(train_set_1, y_train)
my_predictions = my_clf.predict(dev_set_1)
accuracy_score(y_dev, my_predictions)

0.7725

In [12]:
my_clf = MyNaiveBayesClassifier()
my_clf.fit(train_set_1, y_train)
my_predictions = my_clf.predict(dev_set_1)
accuracy_score(y_dev, my_predictions)

0.4775