In [1]:
import collections
import os
import pprint
import nltk
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    count_vectorizer = CountVectorizer(
        tokenizer=nltk.word_tokenize,
        # Can use nltk.corpus.stopwords.words('english')
        stop_words='english',
        min_df=1, # The word must appear more than once
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    # Can change params of TfidfTransformer if it lowers performance
    processed_corpus = TfidfTransformer().fit_transform(processed_corpus)
    
    return processed_corpus

In [3]:
data_directory = 'txt_sentoken\movie_reviews'

movie_sentiment_data = load_files(data_directory, shuffle=True)
print(f'{len(movie_sentiment_data.data)} files loaded.')
print(f'They contain the following classes: {movie_sentiment_data.target_names}')

2000 files loaded.
They contain the following classes: ['neg', 'pos']


  data_directory = 'txt_sentoken\movie_reviews'


In [4]:
# Create the tf-idf matrix
movie_tfidf = extract_features(movie_sentiment_data.data)



In [5]:
# Create the test train split
# X_train, X_test, y_train, y_test = train_test_split(
#     movie_tfidf, movie_sentiment_data.target, test_size=0.3, random_state=42)

# Data already shuffled, so keep in same order to compare predictions
X_train, X_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size=0.3, shuffle=False)

In [6]:
len(y_train)

1400

In [7]:
# Create the linear model
model = LogisticRegression()
model.fit(X_train, y_train)
print(f'Model performance: {model.score(X_test, y_test)}')

Model performance: 0.7783333333333333


In [8]:
# Show some predictions with original text
y_pred = model.predict(X_test)
sample = 5
train_len = len(y_train)
for i, j in zip(range(sample), range(train_len, train_len+sample)):
    pprint.pprint(f'Review:\n{movie_sentiment_data.data[j]}')
    print(f'Correct label: {y_test[i]}  Prediction: {y_pred[i]}')

('Review:\n'
 "b'ingredients : pouring rain , small flooded town , damn about to burst , "
 'bad guys going after millions of dollars \\nsynopsis : at one point in the '
 'story townsperson karen asks hero tom what happened to her church . \\nhe '
 'replies something like : " the church is flooded but at least the '
 "floodwaters put out the big fire . \\nwell , the fire wasn\\'t that bad , "
 'since , while the church was burning , looters apparently thought it was '
 'safe enough to break through all the priceless stained glass windows . " '
 '\\nin hard rain a small town is nearly deserted due to flooding . '
 "\\neveryone has had to evacuate because it\\'s raining , and now floodwaters "
 'are rising so high that buildings are being submerged and the nearby dam is '
 'about to break . \\nenter a working class smart-alecky new armored car '
 'driver named tom ( christian slater ) . \\nsuddenly his security truck '
 'carrying over three million dollars gets stuck on the flooded stree