In [0]:
from nltk.corpus import movie_reviews

In [0]:
movie_reviews.categories()

['neg', 'pos']

In [0]:
movie_reviews.fileids();     #1st 1000 are Neg, Last 1000 are Pos

In [0]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [0]:
documents = []
for category in movie_reviews.categories():
    for fileId in movie_reviews.fileids(category):
        documents.append( (movie_reviews.words(fileId),category) )

In [0]:
#For Easy Train Test Split
import random
random.shuffle(documents)
documents[:5]

[(['"', 'a', 'man', 'is', 'not', 'a', 'man', 'without', ...], 'pos'),
 (['when', 'i', 'saw', 'the', 'trailer', 'for', 'this', ...], 'pos'),
 (['this', 'remake', 'of', '"', 'la', 'cage', 'aux', ...], 'pos'),
 (['this', 'film', 'is', 'extraordinarily', 'horrendous', ...], 'neg'),
 (['well', 'arnold', 'has', 'completed', 'the', ...], 'neg')]

In [0]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [0]:
from nltk.corpus import wordnet
from nltk import pos_tag

def get_simpler_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [0]:
from nltk.corpus import stopwords
stops = stopwords.words('English')

import string
punctuations = list(string.punctuation)         

stops+=punctuations

In [0]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w,get_simpler_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [0]:
#Clean for all 2000 reviews in the document
documents = [ (clean_review(document),category) for document,category in documents]
documents[0]           #See Tail

In [0]:
#len(documents)

In [0]:
training_documents = documents[:1500]
testing_documents = documents[1500:]

In [0]:
all_words = []
for i in training_documents:
    all_words.extend(i[0])

len(all_words)
#all_words

537222

In [0]:
import nltk
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)            #List of Tuples (word,freq). We only need the words
features = [i[0] for i in common]
features

In [0]:
def get_features_dict(words):
    new_features = {}
    for w in features:
        new_features[w] = w in words        #Boolean
    return new_features

In [0]:
get_features_dict(training_documents[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': False,
 'get': False,
 'see': True,
 'go': True,
 'time': True,
 'well': True,
 'scene': True,
 'even': True,
 'good': True,
 'story': True,
 'take': True,
 'would': True,
 'much': True,
 'bad': True,
 'give': True,
 'life': True,
 'also': True,
 'come': True,
 'two': True,
 'way': True,
 'seem': True,
 'know': True,
 'look': False,
 'end': True,
 'first': True,
 '--': True,
 'work': False,
 'thing': False,
 'year': True,
 'plot': True,
 'really': False,
 'play': False,
 'say': False,
 'little': False,
 'people': True,
 'show': False,
 'could': False,
 'love': True,
 'man': True,
 'star': True,
 'great': True,
 'try': False,
 'never': True,
 'best': False,
 'director': True,
 'new': True,
 'performance': False,
 'big': False,
 'actor': False,
 'u': False,
 'many': False,
 'action': True,
 'want': True,
 'watch': True,
 'find': True,
 'role': False,
 'act': False,
 'another': True,
 'think': False,


In [0]:
training_features = [(get_features_dict(doc),category) for doc,category in training_documents]
testing_features = [(get_features_dict(doc),category) for doc,category in testing_documents]

In [0]:
training_features

In [0]:
from nltk import NaiveBayesClassifier

In [0]:
classifier = NaiveBayesClassifier.train(training_features)

In [0]:
#nltk.classify.accuracy(classifier,training_features)
nltk.classify.accuracy(classifier,testing_features)

0.788

In [0]:
classifier.show_most_informative_features(15)

Most Informative Features
               stupidity = True              neg : pos    =     18.0 : 1.0
               ludicrous = True              neg : pos    =     11.6 : 1.0
             outstanding = True              pos : neg    =      9.0 : 1.0
                 idiotic = True              neg : pos    =      8.6 : 1.0
                  prinze = True              neg : pos    =      8.5 : 1.0
                 freddie = True              neg : pos    =      8.5 : 1.0
                 balance = True              pos : neg    =      7.2 : 1.0
            breathtaking = True              pos : neg    =      7.2 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                 garbage = True              neg : pos    =      7.1 : 1.0
             wonderfully = True              pos : neg    =      7.0 : 1.0
                  castle = True              pos : neg    =      6.9 : 1.0
                    jedi = True              pos : neg    =      6.5 : 1.0

SVM

In [0]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [0]:
svc = SVC()
sklearn_to_nltk_classifier = SklearnClassifier(svc)         #Now input in ( {},cat ) format

In [0]:
sklearn_to_nltk_classifier.train(training_features)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [0]:
nltk.classify.accuracy(sklearn_to_nltk_classifier,testing_features)

0.798

Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=5,n_estimators=100)

In [0]:
#from nltk.classify.scikitlearn import SklearnClassifier
sklearn_to_nltk_classifier1 = SklearnClassifier(rfc)   

In [0]:
sklearn_to_nltk_classifier1.train(training_features)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [0]:
#nltk.classify.accuracy(sklearn_to_nltk_classifier1,training_features)
nltk.classify.accuracy(sklearn_to_nltk_classifier1,testing_features)

0.81