In [1]:
import collections
import os
from random import shuffle
import pprint
import nltk
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    count_vectorizer = CountVectorizer(
        tokenizer=nltk.word_tokenize,
        # Can use nltk.corpus.stopwords.words('english')
        stop_words='english',
        min_df=1, # The word must appear more than once
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    # Can change params of TfidfTransformer if it lowers performance
    processed_corpus = TfidfTransformer().fit_transform(processed_corpus)
    
    return processed_corpus

In [3]:
data_directory = 'txt_sentoken\movie_reviews'

movie_sentiment_data = load_files(data_directory, shuffle=True)
print(f'{len(movie_sentiment_data.data)} files loaded.')
print(f'They contain the following classes: {movie_sentiment_data.target_names}')

2000 files loaded.
They contain the following classes: ['neg', 'pos']


  data_directory = 'txt_sentoken\movie_reviews'


In [4]:
# Create the tf-idf matrix
movie_tfidf = extract_features(movie_sentiment_data.data)



In [5]:
# Create the test train split
# X_train, X_test, y_train, y_test = train_test_split(
#     movie_tfidf, movie_sentiment_data.target, test_size=0.3, random_state=42)
# Data already shuffled, so keep in same order to compare predictions
X_train, X_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size=0.3, shuffle=False)

In [14]:
len(y_train)

1400

In [6]:
# Create the linear model
model = LogisticRegression()
model.fit(X_train, y_train)
print(f'Model performance: {model.score(X_test, y_test)}')

Model performance: 0.7783333333333333


In [16]:
# Show some predictions
y_pred = model.predict(X_test)
sample = 5
train_len = len(y_train)
for i, j in zip(range(sample), range(train_len, train_len+sample)):
    pprint.pprint(f'Review:\n{movie_sentiment_data.data[i]}\n\nCorrect label: {y_test[i]}  Prediction: {y_pred[i]}')

('Review:\n'
 'b"arnold schwarzenegger has been an icon for action enthusiasts , since the '
 "late 80's , but lately his films have been very sloppy and the one-liners "
 "are getting worse . \\nit's hard seeing arnold as mr . freeze in batman and "
 'robin , especially when he says tons of ice jokes , but hey he got 15 '
 "million , what's it matter to him ? \\nonce again arnold has signed to do "
 "another expensive blockbuster , that can't compare with the likes of the "
 'terminator series , true lies and even eraser . \\nin this so called dark '
 'thriller , the devil ( gabriel byrne ) has come upon earth , to impregnate a '
 'woman ( robin tunney ) which happens every 1000 years , and basically '
 'destroy the world , but apparently god has chosen one man , and that one man '
 'is jericho cane ( arnold himself ) . \\nwith the help of a trusty sidekick ( '
 'kevin pollack ) , they will stop at nothing to let the devil take over the '
 'world ! \\nparts of this are actually so abs

In [12]:
for i in range(5):
    pprint.pprint(movie_sentiment_data.data[i])
    pprint.pprint(movie_sentiment_data.target[i])

(b'arnold schwarzenegger has been an icon for action enthusiasts , since the la'
 b"te 80's , but lately his films have been very sloppy and the one-liners are "
 b"getting worse . \nit's hard seeing arnold as mr . freeze in batman and ro"
 b'bin , especially when he says tons of ice jokes , but hey he got 15 million '
 b", what's it matter to him ? \nonce again arnold has signed to do another "
 b"expensive blockbuster , that can't compare with the likes of the terminator "
 b'series , true lies and even eraser . \nin this so called dark thriller , '
 b'the devil ( gabriel byrne ) has come upon earth , to impregnate a woman ( ro'
 b'bin tunney ) which happens every 1000 years , and basically destroy the worl'
 b'd , but apparently god has chosen one man , and that one man is jericho cane'
 b' ( arnold himself ) . \nwith the help of a trusty sidekick ( kevin pollac'
 b'k ) , they will stop at nothing to let the devil take over the world ! \n'
 b'parts of this are actually so absurd , t