In [1]:
import collections
import os
import nltk
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model, naive_bayes, ensemble

In [2]:
def extract_features(corpus):
    '''Extract TF-IDF features from corpus.

    The documents are tokenized with ``nltk.word_tokenize``, English stop
    words are dropped (except a whitelist of words that can change the
    meaning of a sentence), unigram and bigram counts are collected, and the
    counts are re-weighted with TF-IDF.

    Requires the NLTK stop-word corpus and tokenizer data to be available
    locally (e.g. installed beforehand with ``nltk.download``).

    Args:
        corpus: iterable of raw documents; handed unchanged to
            ``CountVectorizer.fit_transform``.

    Returns:
        The TF-IDF weighted document-term matrix, one row per document.
    '''
    sa_stop_words = nltk.corpus.stopwords.words('english')
    # Words that might invert a sentence's meaning, so they are NOT removed.
    # Stored as a set so the membership test in the filter below is O(1).
    white_list = {
        'what', 'but', 'if', 'because', 'as', 'until', 'against',
        'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should'}
    # Iterate over the NLTK list (not the set) so the resulting order is stable.
    stop_words = [sw for sw in sa_stop_words if sw not in white_list]
    # NOTE(review): the NLTK list also holds contractions (e.g. "don't") that
    # word_tokenize presumably splits into several tokens, so those entries
    # may never match a token -- confirm whether that matters here.
    count_vectorizer = CountVectorizer(
        tokenizer=nltk.word_tokenize,
        # token_pattern is ignored once a custom tokenizer is supplied;
        # passing None makes that explicit and avoids scikit-learn's
        # "token_pattern will not be used" warning.
        token_pattern=None,
        stop_words=stop_words,
        min_df=2,  # Keep only terms that occur in at least two documents
        # The higher ngram range is, the higher the vector space and computing cost
        ngram_range=(1, 2),  # Allows for 1 and 2 word combinations
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    # Can change params of TfidfTransformer if it lowers performance
    processed_corpus = TfidfTransformer().fit_transform(processed_corpus)

    return processed_corpus

In [3]:
# Location of the labelled review corpus, relative to this notebook.
data_directory = '../Section3/txt_sentoken/movie_reviews'

# Read every document under the directory; the shuffle is seeded so the
# document order is the same on every run.
movie_sentiment_data = load_files(
    data_directory,
    shuffle=True,
    random_state=42,
)
print(
    f'{len(movie_sentiment_data.data)} files loaded.',
    f'They contain the following classes: {movie_sentiment_data.target_names}',
    sep='\n',
)

# Turn the raw documents into the TF-IDF matrix.
# NOTE(review): extract_features fits its vectorizers on the whole corpus,
# i.e. before the split below, so the held-out documents also contribute to
# the vocabulary and IDF weights -- confirm this is acceptable for the
# scores reported later.
movie_tfidf = extract_features(movie_sentiment_data.data)

# Hold out 30% of the documents for evaluation (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    movie_tfidf,
    movie_sentiment_data.target,
    test_size=0.3,
    random_state=42,
)

2000 files loaded.
They contain the following classes: ['neg', 'pos']




In [4]:
# Instantiate multiple classifiers, then fit and score each one the same way.

clf1 = linear_model.LogisticRegression()
# SGD shuffles the training data while fitting, so without a fixed seed the
# reported score changes from run to run. 42 matches the seed used for
# loading and splitting the data.
clf2 = linear_model.SGDClassifier(random_state=42)
clf3 = naive_bayes.MultinomialNB()
clf4 = naive_bayes.BernoulliNB()

# (label, estimator) pairs in the order their scores are reported.
for name, model in [
        ('Logistic Regression', clf1),
        ('SGD Classifier', clf2),
        ('Multinomial NB', clf3),
        ('Bernoulli NB', clf4)]:
    model.fit(X_train, y_train)
    # score() is the mean accuracy on the held-out split
    print(f'{name} performance: {model.score(X_test, y_test)}')

Logistic Regression performance: 0.7983333333333333
SGD Classifier performance: 0.8466666666666667
Multinomial NB performance: 0.8083333333333333
Bernoulli NB performance: 0.81


In [5]:
# Create a voting model that combines all four classifiers.
voting_model = ensemble.VotingClassifier(
    estimators=[('lr', linear_model.LogisticRegression()),
                # Seeded so the ensemble's score is reproducible; an unseeded
                # SGDClassifier gives a different fit on every run.
                ('sgd', linear_model.SGDClassifier(random_state=42)),
                ('mnb', naive_bayes.MultinomialNB()),
                ('bnb', naive_bayes.BernoulliNB())],
    # Hard is 1 vote per model, soft is votes based on confidence.
    # With four voters a 2-2 tie is possible; scikit-learn documents that
    # ties are resolved in favour of the class that sorts first.
    voting='hard')
voting_model.fit(X_train, y_train)
print(f'Voting model performance: {voting_model.score(X_test, y_test)}')

Voting model performance: 0.8233333333333334


In [7]:
# Second ensemble: soft voting, where each model's vote is weighted by its
# confidence instead of counting one vote per model (hard voting).
# The SGD classifier is deliberately left out because it provides no
# confidence estimate to vote with in soft mode.
voting_model2 = ensemble.VotingClassifier(
    estimators=[
        ('lr', linear_model.LogisticRegression()),
        ('mnb', naive_bayes.MultinomialNB()),
        ('bnb', naive_bayes.BernoulliNB()),
    ],
    voting='soft',
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
print(f'Voting model2 performance: {voting_model2.score(X_test, y_test)}')

Voting model2 performance: 0.81
