# Movie Reviews

In [1]:
import nltk
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews
movie_reviews.categories()

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


['neg', 'pos']

In [2]:
print("Number of negative reviews:", len(movie_reviews.fileids('neg')))
print("Number of positive reviews:", len(movie_reviews.fileids('pos')))

Number of negative reviews: 1000
Number of positive reviews: 1000


The following code partitions the movie review corpus into a training and a test set.

In [3]:
import random
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.seed(1234)
random.shuffle(documents)
threshold1 = int(len(documents)*.6)
threshold2 = int(len(documents)*.8)
train = documents[:threshold1]
devtest = documents[threshold1:threshold2]
test = documents[threshold2:]

The following code finds the 2000 most frequent non-stop words.

In [4]:
import collections
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')
c = collections.Counter([w.lower() for (words,category) in train 
                                   for w in words if w.lower() not in stop])
top2000words = [w for (w,count) in c.most_common(2000)]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Using NLTK

The following code implements one-hot encoding with the 2000 most frequent words.

In [5]:
def document_features(words):
    "Return the document features for an NLTK classifier"
    words_lower = [w.lower() for w in words]
    result = dict()
    for w in top2000words:
        result['has(%s)' % w] = (w in words_lower)
    return result

And here we train an NLTK Naive Bayes classifier using the training set, and evaluate the system using the devtest set.

In [6]:
train_features = [(document_features(x), y) for (x, y) in train]
devtest_features = [(document_features(x), y) for (x, y) in devtest]
classifier = nltk.NaiveBayesClassifier.train(train_features)

In [7]:
nltk.classify.accuracy(classifier, devtest_features)

0.7775

In [8]:
nltk.classify.accuracy(classifier, train_features)

0.8816666666666667

We can see the difference in accuracy between the test set and the train set.

## Using Scikit-learn

The following code defines a second feature extractor that uses one-hot encoding on the same list of 2000 words, and which is suitable for sklearn.

In [9]:
def vector_features(words):
    "Return a vector of features for sklearn"
    words_lower = [w.lower() for w in words]
    result = []
    for w in top2000words:
        if w in words_lower:
            result.append(1)
        else:
            result.append(0)
    return result

Below is the code that generates the vectors, trains a Multinomial Naive Bayes classifier, and evaluates the result. 

In [10]:
train_vectors = [vector_features(x) for (x, y) in train]
train_labels = [y for (x, y) in train]
devtest_vectors = [vector_features(x) for (x, y) in devtest]
devtest_labels = [y for (x, y) in devtest]

In [11]:
from sklearn.naive_bayes import MultinomialNB
sklearn_classifier = MultinomialNB()
sklearn_classifier.fit(train_vectors, [y for (x, y) in train])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
predictions = sklearn_classifier.predict(devtest_vectors)
accuracy_score(devtest_labels, predictions)

0.83999999999999997

In [14]:
predictions = sklearn_classifier.predict(train_vectors)
accuracy_score(train_labels, predictions)

0.92000000000000004

And below is the code that uses Support Vector Machines (SVM) instead. You can see that the interface is the same. SVMs typically give very good results, especially when the amount of training data is large enough (in this case it wasn't).

In [15]:
from sklearn.svm import SVC
sklearn_classifier2 = SVC()
sklearn_classifier2.fit(train_vectors, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
predictions2 = sklearn_classifier2.predict(devtest_vectors)
accuracy_score(devtest_labels, predictions2)

0.77000000000000002

In [17]:
predictions2 = sklearn_classifier2.predict(train_vectors)
accuracy_score(train_labels, predictions2)

0.84333333333333338

## 10-fold Cross Validation using sklearn

In [18]:
from sklearn.model_selection import cross_val_score
crossval_classifier = SVC()
dev_vectors = train_vectors + devtest_vectors
dev_labels = train_labels + devtest_labels
scores = cross_val_score(crossval_classifier, dev_vectors, dev_labels, cv=10, scoring="accuracy")
scores

array([ 0.8136646 ,  0.75776398,  0.8447205 ,  0.83125   ,  0.825     ,
        0.825     ,  0.825     ,  0.81761006,  0.77987421,  0.79874214])

In [19]:
print("Mean of accuracy:", scores.mean())
print("Standard deviation of accuracy:", scores.std())

Mean of accuracy: 0.811862548342
Standard deviation of accuracy: 0.0247621315653


In [20]:
import numpy as np
from sklearn.model_selection import KFold
dev_array_vectors = np.array(dev_vectors)
dev_array_labels = np.array(dev_labels)
fold = 0
kf = KFold(n_splits=10, shuffle=True, random_state=1234)
for kv_train, kv_test in kf.split(dev_vectors):
    # kv_train and kv_test are indices of array dev_vectors
    print("Fold %i:" % fold)
    fold += 1
    cv_classifier = SVC()
    cv_classifier.fit(dev_array_vectors[kv_train], dev_array_labels[kv_train])
    test_predictions = cv_classifier.predict(dev_array_vectors[kv_test])
    test_accuracy = accuracy_score(dev_array_labels[kv_test], test_predictions)
    print("Accuracy: %.3f" % test_accuracy)

Fold 0:
Accuracy: 0.775
Fold 1:
Accuracy: 0.812
Fold 2:
Accuracy: 0.800
Fold 3:
Accuracy: 0.806
Fold 4:
Accuracy: 0.831
Fold 5:
Accuracy: 0.812
Fold 6:
Accuracy: 0.819
Fold 7:
Accuracy: 0.825
Fold 8:
Accuracy: 0.812
Fold 9:
Accuracy: 0.838


# Sentence Segmentation

The following code splits the Brown corpus into a training and test set. Note that now we cannot shuffle the sentences since we will need information from text from previous and following sentences.

In [21]:
nltk.download("brown")
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [22]:
sents = brown.sents(categories='news')
size = int(len(sents)*0.1)
train_sents, test_sents = sents[size:], sents[:size]

The following code extracts the boundary information of tokenised sentences. This can be used for our annotations.

In [23]:
def extract_boundaries(sents):
    """Return the tokens and the sentence boundary positions"""
    tokens = []
    boundaries = []
    offset = 0
    for sent in sents:
        tokens.extend(sent)
        offset += len(sent)
        boundaries.append(offset-1)
    return tokens, boundaries

In [24]:
train_tokens, train_boundaries = extract_boundaries(train_sents)
test_tokens, test_boundaries = extract_boundaries(test_sents)

In [25]:
train_tokens[:50]

['He',
 'assured',
 'Mr.',
 'Martinelli',
 'and',
 'the',
 'council',
 'that',
 'he',
 'would',
 'study',
 'the',
 'correct',
 'method',
 'and',
 'report',
 'back',
 'to',
 'the',
 'council',
 'as',
 'soon',
 'as',
 'possible',
 '.',
 'Mr.',
 'Martinelli',
 'said',
 'yesterday',
 'that',
 'the',
 'Citizens',
 'Group',
 'of',
 'Johnston',
 'will',
 'meet',
 'again',
 'July',
 '24',
 'to',
 'plan',
 'further',
 'strategy',
 'in',
 'the',
 'charter',
 'movement',
 '.',
 'He']

In [26]:
train_boundaries[:10]

[24, 48, 77, 96, 115, 149, 181, 202, 239, 252]

In [27]:
train_tokens[21:26]

['soon', 'as', 'possible', '.', 'Mr.']

We now define context-based features for all tokens that are candidates to sentence endings.

In [28]:
def segmenter_features(tokens, i):
    """Return the features of token[i]"""
    return {'next-word-capitalized': 
                  tokens[i+1][0].isupper(),
            'prev-word': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': 
                  len(tokens[i-1]) == 1}   

With the tokens, boundaries, and feature extractor, we can prepare the training and test sets.

In [29]:
candidates = '.?!'
train_features = [(segmenter_features(train_tokens, i), 
                   (i in train_boundaries))
                 for i in range(1, len(train_tokens)-1)
                 if train_tokens[i] in candidates]
test_features = [(segmenter_features(test_tokens, i), 
                  (i in test_boundaries))
                 for i in range(1, len(test_tokens)-1)
                 if test_tokens[i] in candidates]


In [30]:
train_features[:3]

[({'next-word-capitalized': True,
   'prev-word': 'possible',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True),
 ({'next-word-capitalized': True,
   'prev-word': 'movement',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True),
 ({'next-word-capitalized': False,
   'prev-word': 'comes',
   'prev-word-is-one-char': False,
   'punct': '.'},
  True)]

In [31]:
len(train_features), len(test_features)

(3749, 407)

Now we can train a classifier that can be used for sentence segmentation

In [32]:
segmenter = nltk.NaiveBayesClassifier.train(train_features)
nltk.classify.accuracy(segmenter, test_features)

1.0

Looks impressive! but let's check what would happen if we introduced a majority baseline classifier.

In [33]:
from collections import Counter
train_counter = Counter([f[1] for f in train_features])
train_counter

Counter({False: 62, True: 3687})

Since most training samples are labelled as `True,` the majority baseline is a classifier that always outputs `True`. In that case, accuracy in the test set is:

In [34]:
test_counter = Counter([f[1] for f in test_features])
test_counter

Counter({False: 2, True: 405})

In [35]:
405/407

0.995085995085995

So, the majority baseline was not that impressive after all. The finished segmenter that uses the trained classifier is:

In [36]:
def segment_sentences(tokens):
    """Segment a list of tokens"""
    start = 0
    sents = []
    for i, token in enumerate(tokens):
        if token in candidates and \
           segmenter.classify(segmenter_features(tokens, i)) == True:
               sents.append(tokens[start:i+1])
               start = i+1
    if start < len(tokens):
        sents.append(tokens[start:])
    return sents

In [37]:
segment_sentences(["This", "is", "a", "sentence", ".", "This", 
                    "is", "another", "one"])

[['This', 'is', 'a', 'sentence', '.'], ['This', 'is', 'another', 'one']]

# The Reuters-21578 Corpus

In [38]:
import nltk
nltk.download("reuters")
from nltk.corpus import reuters

[nltk_data] Downloading package reuters to /root/nltk_data...


In [39]:
reuters.categories()

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc']

In [40]:
reuters.fileids(categories='corn')

['test/14832',
 'test/14858',
 'test/15033',
 'test/15043',
 'test/15106',
 'test/15287',
 'test/15341',
 'test/15618',
 'test/15648',
 'test/15676',
 'test/15686',
 'test/15720',
 'test/15845',
 'test/15856',
 'test/15860',
 'test/15863',
 'test/15871',
 'test/15875',
 'test/15877',
 'test/15890',
 'test/15904',
 'test/15906',
 'test/15910',
 'test/15911',
 'test/15917',
 'test/15952',
 'test/15999',
 'test/16012',
 'test/16071',
 'test/16099',
 'test/16147',
 'test/16525',
 'test/16624',
 'test/16751',
 'test/16765',
 'test/17503',
 'test/17509',
 'test/17722',
 'test/18035',
 'test/18482',
 'test/18614',
 'test/18954',
 'test/18973',
 'test/19165',
 'test/19721',
 'test/19821',
 'test/20018',
 'test/20366',
 'test/20637',
 'test/20645',
 'test/20649',
 'test/20723',
 'test/20763',
 'test/21091',
 'test/21243',
 'test/21493',
 'training/10120',
 'training/10139',
 'training/10172',
 'training/10175',
 'training/10319',
 'training/10339',
 'training/10487',
 'training/10489',
 'traini

### Split data

In [41]:
corn_fileids = reuters.fileids(categories='corn')
gold_fileids = reuters.fileids(categories='gold')
grain_fileids = reuters.fileids(categories='grain')

training_fileids = [f for f in reuters.fileids() if f[0:8]=='training']
testing_fileids = [f for f in reuters.fileids() if f[0:4]=='test']

train_fileids_tagged = [(f,'corn') for f in corn_fileids if f[0:8]=='training']
train_fileids_tagged += [(f,'gold') for f in gold_fileids if f[0:8]=='training']
train_fileids_tagged += [(f,'grain') for f in grain_fileids if f[0:8]=='training']


test_fileids_tagged = [(f,'corn') for f in corn_fileids if f[0:4]=='test']
test_fileids_tagged += [(f,'gold') for f in gold_fileids if f[0:4]=='test']
test_fileids_tagged += [(f,'grain') for f in grain_fileids if f[0:4]=='test']


In [42]:
len(grain_fileids)

582

### Extract features

In [43]:
import collections
all_words = collections.Counter(w.lower() \
        for w in reuters.words(fileids=training_fileids))
word_features = [w for (w, c) in all_words.most_common(500)]
word_features[:3]

['.', ',', 'the']

In [44]:
def document_features(fileid):
    document_words = set(reuters.words(fileids=[fileid]))
    features = dict()
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

train_set = [(document_features(f),t) for (f,t) in train_fileids_tagged]
test_set = [(document_features(f),t) for (f,t) in test_fileids_tagged]


### Training

In [45]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(5)

Most Informative Features
          contains(gold) = True             gold : grain  =    278.7 : 1.0
             contains(>) = True             gold : grain  =    129.4 : 1.0
             contains(&) = True             gold : grain  =     83.1 : 1.0
            contains(lt) = True             gold : grain  =     83.1 : 1.0
             contains(;) = True             gold : grain  =     59.4 : 1.0


### Testing

In [46]:
nltk.classify.accuracy(classifier,test_set)

0.6723404255319149

### Macro-averaged Evaluation

In [47]:
def f1(y_true,y_pred,label):
    assert len(y_true) == len(y_pred)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(y_true)):
        if y_true[i] == label:
            if y_pred[i] == label:
                tp += 1
            else:
                fn += 1
        elif y_pred[i] == label:
            fp += 1
        else:
            tn += 1
    try:
        r = tp/(tp+fn)
    except:
        r = 0.0
    try:
        p = tp/(tp+fp)
    except:
        p = 0.0
    try:
        f1 = 2*r*p/(r+p)
    except:
        f1 = 0.0
    return f1

In [48]:
predictions = [classifier.classify(f) for f, l in test_set]
predictions[:10]

['corn',
 'grain',
 'grain',
 'corn',
 'grain',
 'corn',
 'grain',
 'grain',
 'grain',
 'grain']

In [49]:
y_true = [l for f, l in test_set]
y_true[:10]

['corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn']

In [50]:
totalf1 = 0
for label in ('corn','gold','grain'):
    thef1 = f1(y_true,predictions,label)
    print('%s f1: %1.4f' % (label,thef1))
    totalf1 += thef1
print("Macro-average f1: %1.4f" % (totalf1/3))

corn f1: 0.3894
gold f1: 0.8235
grain f1: 0.7516
Macro-average f1: 0.6548


### Micro-averaged Evaluation

In [51]:
def f1_micro(y_true,y_pred):
    assert len(y_true) == len(y_pred)
    labels = list(set(y_true))
    tp, tn, fp, fn = 0, 0, 0, 0
    for label in labels:
        for i in range(len(y_true)):
            if y_true[i] == label:
                if y_pred[i] == label:
                    tp += 1
                else:
                    fn += 1
            elif y_pred[i] == label:
                fp += 1
            else:
                tn += 1
    try:
        r = tp/(tp+fn)
    except:
        r = 0.0
    try:
        p = tp/(tp+fp)
    except:
        p = 0.0
    try:
        f1 = 2*r*p/(r+p)
    except:
        f1 = 0.0
    return f1

In [52]:
print("Micro-average f1: %1.4f" % f1_micro(y_true,predictions))

Micro-average f1: 0.6723


### Evaluation using sklearn

In [53]:
from sklearn.metrics import f1_score

In [54]:
f1_score(y_true, predictions, average='macro')

0.6548479765554206

In [55]:
f1_score(y_true, predictions, average='micro')

0.67234042553191486