In [1]:
import nltk
import random
import numpy as np
from datetime import datetime as dt
import os
import glob
cwd = os.getcwd()

### Example Process

##### Setup

In [2]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories()\
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents) # Setting up the documents list that contains: All documents from the movie_reviews
# List will contain [document1, document2, document3...] where document1 contains [review text, pos or neg review]

In [3]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) # all words in order similar to document list

##### Feature Extractor

In [4]:
word_features = list(all_words)[:2000] # 2000 most common words in all movie_review corpus

def document_features(document): # define feature extractor
    document_words = set(document) # builds an iterable from document (likely just a list/set of words from the movie_review)
                                    # sets are very quick to check (word in document_words) compared to checking lists
    features = {} # dictionary
    for word in word_features: # Iterate through all 2000 most common words in the corpus
        features['contains({})'.format(word)] = (word in document_words) # create dictionary index with value TRUE or FALSE
                                                                        # Depending on if word is in document_words
    return features

In [5]:
print(list(document_features(movie_reviews.words('pos/cv957_8737.txt')).items())[:10]) # test

[('contains(,)', True), ('contains(the)', True), ('contains(.)', True), ('contains(a)', True), ('contains(and)', True), ('contains(of)', True), ('contains(to)', True), ("contains(')", True), ('contains(is)', True), ('contains(in)', True)]


##### Training

In [6]:
featuresets = [(document_features(d),c) for (d,c) in documents] # contains 2000 word existences in movie review and whether it was
                                                    # negative or positive
# structured as [document1, document2, document3, ...] where document1 is structured as [contains words, neg or pos]

In [7]:
featuresets[0]

({'contains(,)': True,
  'contains(the)': True,
  'contains(.)': True,
  'contains(a)': True,
  'contains(and)': True,
  'contains(of)': True,
  'contains(to)': True,
  "contains(')": True,
  'contains(is)': True,
  'contains(in)': True,
  'contains(s)': True,
  'contains(")': True,
  'contains(it)': True,
  'contains(that)': True,
  'contains(-)': True,
  'contains())': True,
  'contains(()': True,
  'contains(as)': True,
  'contains(with)': True,
  'contains(for)': True,
  'contains(his)': True,
  'contains(this)': True,
  'contains(film)': True,
  'contains(i)': True,
  'contains(he)': True,
  'contains(but)': True,
  'contains(on)': True,
  'contains(are)': True,
  'contains(t)': False,
  'contains(by)': True,
  'contains(be)': True,
  'contains(one)': True,
  'contains(movie)': False,
  'contains(an)': True,
  'contains(who)': True,
  'contains(not)': True,
  'contains(you)': False,
  'contains(from)': True,
  'contains(at)': True,
  'contains(was)': True,
  'contains(have)': True

In [8]:
train_set,test_set = featuresets[:int(len(featuresets)*0.8)],featuresets[int(len(featuresets)*0.8):] 
# 80 20 split for test and train

In [9]:
classifier = nltk.NaiveBayesClassifier.train(train_set) # train using built in

In [10]:
print(nltk.classify.accuracy(classifier, test_set)) # see the accuracy of the program

0.8075


In [11]:
classifier.show_most_informative_features(5) # 5 most influential features

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
        contains(seagal) = True              neg : pos    =      7.1 : 1.0
         contains(damon) = True              pos : neg    =      6.9 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.3 : 1.0


### Real Data

In [12]:
%%time
texts = []
files_90s = glob.glob(cwd+"\\90s\\*.txt")
for i,file in enumerate(files_90s):
    try:
        file_sub = open(file, "r")
        text = file_sub.read()
        file_sub.close()
        
    except UnicodeDecodeError:
        continue
    tokens = nltk.word_tokenize(text)
    tokens = tokens[int(len(tokens)*.1):int(len(tokens)*.9)]
    j = 0
    while j < len(tokens):
        if(not(tokens[j].lower().isalpha())):
            del(tokens[j])
            j-=1
        j+=1
    texts.append((tokens,"90s"))
files_10s = glob.glob(cwd+"\\10s\\*.txt")
for i,file in enumerate(files_10s):
    try:
        file_sub = open(file, "r")
        text = file_sub.read()
        file_sub.close()
    except UnicodeDecodeError:
        continue
    tokens = nltk.word_tokenize(text)
    tokens = tokens[int(len(tokens)*.1):int(len(tokens)*.9)]
    j = 0
    while j < len(tokens):
        if(not(tokens[j].lower().isalpha())):
            del(tokens[j])
            j-=1
        j+=1
    texts.append((tokens,"10s"))

    # if(i > 3):
    #     break
all_words = [x for xs in texts for x in xs[0]]
all_words = list(nltk.FreqDist(all_words))

CPU times: total: 1min 24s
Wall time: 3min 16s


In [13]:
len(all_words)

97310

In [14]:
word_features = list(all_words)[:2000]

def text_features(text): # define feature extractor
    document_words = set(text) # builds an iterable from document (likely just a list/set of words from the movie_review)
                                    # sets are very quick to check (word in document_words) compared to checking lists
    features = {} # dictionary
    for word in word_features: # Iterate through all 2000 most common words in the corpus
        features['contains({})'.format(word)] = (word in document_words) # create dictionary index with value TRUE or FALSE
                                                                        # Depending on if word is in document_words
    return features

In [15]:
print(list(text_features(texts[0][0]).items())[:10]) # test

[('contains(the)', True), ('contains(of)', True), ('contains(or)', True), ('contains(to)', True), ('contains(and)', True), ('contains(in)', True), ('contains(Lessee)', False), ('contains(a)', True), ('contains(shall)', False), ('contains(by)', True)]


### Training

In [16]:
featuresets = [(text_features(t),c) for (t,c) in texts]

In [17]:
train_set,test_set = featuresets[:int(len(featuresets)*0.8)],featuresets[int(len(featuresets)*0.8):] 
# 80 20 split for test and train

In [18]:
classifier = nltk.NaiveBayesClassifier.train(train_set) # train using built in

In [19]:
print(nltk.classify.accuracy(classifier, test_set)) # see the accuracy of the program

0.74235807860262


In [20]:
classifier.show_most_informative_features(-1) # 5 most influential features

Most Informative Features
       contains(Montana) = True              90s : 10s    =     78.9 : 1.0
          contains(YEAR) = True              90s : 10s    =     78.9 : 1.0
    contains(successors) = True              10s : 90s    =     73.9 : 1.0
     contains(hereunder) = True              10s : 90s    =     73.3 : 1.0
        contains(hereof) = True              10s : 90s    =     72.8 : 1.0
     contains(reworking) = True              10s : 90s    =     72.4 : 1.0
          contains(says) = True              90s : 10s    =     70.3 : 1.0
         contains(heirs) = True              10s : 90s    =     69.7 : 1.0
    contains(prosecuted) = True              10s : 90s    =     67.2 : 1.0
     contains(furnished) = True              10s : 90s    =     66.4 : 1.0
      contains(relieved) = True              10s : 90s    =     66.2 : 1.0
   contains(declaration) = True              10s : 90s    =     64.8 : 1.0
    contains(subsurface) = True              10s : 90s    =     64.7 : 1.0

In [47]:
file90 = cwd+"\\beholdapalehorsebymiltonwilliamcooper1991_202003\\Behold_a_Pale_Horse_by_Milton_William_Cooper_1991_djvu.txt"
file90_sub = open(file90, "r",encoding="utf8")
text90 = file90_sub.read()
file90_sub.close()
file10 = cwd+"\\logo_modernism\\Logo Modernism by Jens Müller_djvu.txt"
file10_sub = open(file10, "r",encoding="utf8")
text10 = file10_sub.read()
file10_sub.close()

In [48]:
text90 = nltk.word_tokenize(text90)
feature90 = text_features(text90)
text10 = nltk.word_tokenize(text10)
feature10 = text_features(text10)

In [50]:
classifier.classify_many([feature90,feature10])

['10s', '90s']

### Decision Tree

In [51]:
classifier2 = nltk.DecisionTreeClassifier.train(train_set)

In [52]:
print(nltk.classify.accuracy(classifier2, test_set)) # see the accuracy of the program

0.9170305676855895


In [56]:
print(classifier2)

contains(event)=False? ................................ 10s
  contains(Lessor)=False? ............................. 10s
    contains(realized)=False? ......................... 10s
      contains(Trades)=False? ......................... 10s
      contains(Trades)=True? .......................... 10s
    contains(realized)=True? .......................... 10s
      contains(West)=False? ........................... 10s
      contains(West)=True? ............................ 90s
  contains(Lessor)=True? .............................. 10s
contains(event)=True? ................................. 10s
  contains(status)=False? ............................. 10s
  contains(status)=True? .............................. 10s
    contains(ABS)=False? .............................. 10s
      contains(Service)=False? ........................ 10s
      contains(Service)=True? ......................... 90s
    contains(ABS)=True? ............................... 10s



In [57]:
classifier2.classify_many([feature90,feature10])

['90s', '90s']