In [2]:
import nltk
import random
import numpy as np
from datetime import datetime as dt
import os
import glob
# Current working directory of the kernel process; the "Real Data" cell builds its
# input glob pattern from this path.
cwd = os.getcwd()

### Example Process

##### Setup

In [5]:
from nltk.corpus import movie_reviews

# Fixed seed so the shuffle below (and with it the train/test split and the
# accuracy reported further down) is identical on every Restart & Run All.
SHUFFLE_SEED = 42

# One (word_list, category) tuple per review file, where category is one of
# movie_reviews.categories().
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The comprehension emits the reviews grouped by category, so they must be
# shuffled before being sliced into train/test. A private Random instance is
# used so the global `random` module state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(documents)

In [17]:
# Frequency distribution over every token of the movie_reviews corpus,
# lower-cased first so that case variants of a word are counted together.
all_words = nltk.FreqDist(
    token.lower() for token in movie_reviews.words()
)

##### Feature Extractor

In [18]:
# Feature vocabulary: the 2000 most frequent (lower-cased) tokens in the corpus.
# most_common() sorts by count explicitly. Plain list(all_words)[:2000] only
# produced that ordering on old NLTK releases; with NLTK 3 (Counter-based
# FreqDist) it follows first-seen order and silently selects the wrong words.
word_features = [word for word, _count in all_words.most_common(2000)]

def document_features(document, vocabulary=None):
    """Build a word-presence feature dict for a single document.

    Args:
        document: Iterable of the document's tokens.
        vocabulary: Iterable of words to test for. When omitted, the
            notebook-level ``word_features`` list is used, so existing
            ``document_features(doc)`` calls behave exactly as before.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True``/``False`` for every
        word in the vocabulary, in vocabulary order.

    Note:
        Matching is case-sensitive: the tokens are compared as given, so a
        lower-cased vocabulary will not match capitalised tokens.
    """
    if vocabulary is None:
        # Looked up at call time so a later rebinding of word_features is honoured.
        vocabulary = word_features
    # A set gives constant-time membership checks, unlike scanning a list per word.
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in vocabulary
    }

In [28]:
# Sanity check: print the first 10 feature entries extracted from one review file.
sample_review_words = movie_reviews.words('pos/cv957_8737.txt')
sample_review_features = document_features(sample_review_words)
print(list(sample_review_features.items())[:10])

[('contains(,)', True), ('contains(the)', True), ('contains(.)', True), ('contains(a)', True), ('contains(and)', True), ('contains(of)', True), ('contains(to)', True), ("contains(')", True), ('contains(is)', True), ('contains(in)', True)]


##### Training

In [29]:
# One (feature_dict, category) tuple per review, in the same order as `documents`.
# Each feature_dict holds a True/False 'contains(word)' entry for every word in
# word_features; category is the review's label.
featuresets = [
    (document_features(review_words), label)
    for review_words, label in documents
]

In [39]:
# Display the first (feature_dict, category) pair to confirm the structure.
featuresets[0]

({'contains(,)': True,
  'contains(the)': True,
  'contains(.)': True,
  'contains(a)': True,
  'contains(and)': True,
  'contains(of)': True,
  'contains(to)': True,
  "contains(')": True,
  'contains(is)': True,
  'contains(in)': True,
  'contains(s)': True,
  'contains(")': True,
  'contains(it)': True,
  'contains(that)': True,
  'contains(-)': True,
  'contains())': True,
  'contains(()': True,
  'contains(as)': True,
  'contains(with)': True,
  'contains(for)': True,
  'contains(his)': True,
  'contains(this)': True,
  'contains(film)': False,
  'contains(i)': True,
  'contains(he)': True,
  'contains(but)': True,
  'contains(on)': True,
  'contains(are)': True,
  'contains(t)': True,
  'contains(by)': True,
  'contains(be)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(an)': True,
  'contains(who)': True,
  'contains(not)': True,
  'contains(you)': True,
  'contains(from)': True,
  'contains(at)': False,
  'contains(was)': False,
  'contains(have)': True

In [45]:
# 80/20 split: the first 80% of the feature sets train the model, the rest is held out.
split_index = int(len(featuresets) * 0.8)
train_set = featuresets[:split_index]
test_set = featuresets[split_index:]

In [46]:
classifier = nltk.NaiveBayesClassifier.train(train_set) # fit NLTK's built-in Naive Bayes classifier on the training split

In [47]:
print(nltk.classify.accuracy(classifier, test_set)) # accuracy of the trained classifier on the held-out test split

0.845


In [48]:
classifier.show_most_informative_features(5) # print the 5 most informative features of the trained classifier

Most Informative Features
   contains(outstanding) = True              pos : neg    =     12.5 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.2 : 1.0
        contains(seagal) = True              neg : pos    =      7.3 : 1.0
         contains(damon) = True              pos : neg    =      6.7 : 1.0
        contains(poorly) = True              neg : pos    =      6.3 : 1.0


### Real Data

In [21]:
def _is_90s_year(token):
    """Return True if `token` parses as an integer between 1990 and 1999 inclusive."""
    try:
        return 1990 <= int(token) <= 1999
    except ValueError:
        return False


# List of (tokens, label) tuples, one per successfully loaded file.
texts = []
# os.path.join replaces the hard-coded "\\" separators so the pattern also works
# off Windows. sorted() makes the selection deterministic: glob order depends on
# the filesystem and only the first few files are processed.
files_90s = sorted(glob.glob(os.path.join(cwd, "90s", "*.txt")))
for i, file in enumerate(files_90s):
    try:
        # Context manager closes the handle promptly. No encoding is passed, so
        # the platform default encoding applies.
        with open(file, "r") as handle:
            text = handle.read()
    except UnicodeDecodeError:
        continue  # skip files that cannot be decoded
    tokens = nltk.word_tokenize(text)
    # Keep only the middle 80% of the tokens (drop the first and last 10%).
    # NOTE(review): presumably trims header/footer boilerplate; confirm.
    tokens = tokens[int(len(tokens)*.1):int(len(tokens)*.9)]
    # Drop literal 1990-1999 year tokens (presumably so the label cannot be read
    # straight off the text; confirm). A new list is built because calling
    # tokens.remove() while iterating over tokens skips the element after each
    # removal, which left adjacent year tokens in place.
    tokens = [token for token in tokens if not _is_90s_year(token)]
    texts.append((tokens, "90s"))
    # Cap on how many files are processed: stop once the file at index 4 or
    # later has been loaded.
    if i > 3:
        break
# NOTE(review): this cell was left unfinished (`all_words = [x for ]` is a SyntaxError).
# Completed as the flat list of every token across `texts`; confirm this is the
# intent (e.g. whether tokens should be lower-cased or wrapped in nltk.FreqDist).
all_words = [x for tokens, _label in texts for x in tokens]