# Probabilities in NLTK

In [1]:
import collections

In [2]:
class LaplaceSmoothing:
    def __init__(self, samples, smoothing=1):
        self.N = len(samples)
        self.counter = collections.Counter(samples)
        self.M = len(self.counter)
        self.smoothing = smoothing
        
    def prob(self, item):
        return (self.counter[item]+self.smoothing)/ \
                (self.N+self.smoothing*self.M)

### P(win)

In [3]:
pfd = LaplaceSmoothing(['Y','N','Y','Y','Y','N','Y','Y','N','Y'])

In [4]:
pfd.prob('Y')

0.6666666666666666

In [5]:
pfd.prob('N')

0.3333333333333333

### P(win|shine)

In [6]:
pfd = LaplaceSmoothing(['Y','Y','N','Y','Y','Y'])
pfd.prob('Y')

0.75

# The Reuters-21578 Corpus

In [7]:
import nltk
from nltk.corpus import reuters

In [8]:
reuters.categories()

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc']

In [9]:
reuters.fileids(categories='corn')

['test/14832',
 'test/14858',
 'test/15033',
 'test/15043',
 'test/15106',
 'test/15287',
 'test/15341',
 'test/15618',
 'test/15648',
 'test/15676',
 'test/15686',
 'test/15720',
 'test/15845',
 'test/15856',
 'test/15860',
 'test/15863',
 'test/15871',
 'test/15875',
 'test/15877',
 'test/15890',
 'test/15904',
 'test/15906',
 'test/15910',
 'test/15911',
 'test/15917',
 'test/15952',
 'test/15999',
 'test/16012',
 'test/16071',
 'test/16099',
 'test/16147',
 'test/16525',
 'test/16624',
 'test/16751',
 'test/16765',
 'test/17503',
 'test/17509',
 'test/17722',
 'test/18035',
 'test/18482',
 'test/18614',
 'test/18954',
 'test/18973',
 'test/19165',
 'test/19721',
 'test/19821',
 'test/20018',
 'test/20366',
 'test/20637',
 'test/20645',
 'test/20649',
 'test/20723',
 'test/20763',
 'test/21091',
 'test/21243',
 'test/21493',
 'training/10120',
 'training/10139',
 'training/10172',
 'training/10175',
 'training/10319',
 'training/10339',
 'training/10487',
 'training/10489',
 'traini

### Split data

In [10]:
corn_fileids = reuters.fileids(categories='corn')
gold_fileids = reuters.fileids(categories='gold')
grain_fileids = reuters.fileids(categories='grain')

training_fileids = [f for f in reuters.fileids() if f[0:8]=='training']
testing_fileids = [f for f in reuters.fileids() if f[0:4]=='test']

train_fileids_tagged = [(f,'corn') for f in corn_fileids if f[0:8]=='training']
train_fileids_tagged += [(f,'gold') for f in gold_fileids if f[0:8]=='training']
train_fileids_tagged += [(f,'grain') for f in grain_fileids if f[0:8]=='training']


test_fileids_tagged = [(f,'corn') for f in corn_fileids if f[0:4]=='test']
test_fileids_tagged += [(f,'gold') for f in gold_fileids if f[0:4]=='test']
test_fileids_tagged += [(f,'grain') for f in grain_fileids if f[0:4]=='test']


In [11]:
len(grain_fileids)

582

### Extract features

In [12]:
import collections
all_words = collections.Counter(w.lower() \
        for w in reuters.words(fileids=training_fileids))
word_features = [w for (w,c) in all_words.most_common(500)]
word_features[:3]

['.', ',', 'the']

In [13]:
def document_features(fileid):
    document_words = set(reuters.words(fileids=[fileid]))
    features = dict()
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

train_set = [(document_features(f),t) for (f,t) in train_fileids_tagged]
test_set = [(document_features(f),t) for (f,t) in test_fileids_tagged]


### Training

In [14]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(5)

Most Informative Features
          contains(gold) = True             gold : grain  =    278.7 : 1.0
             contains(>) = True             gold : grain  =    129.4 : 1.0
            contains(lt) = True             gold : grain  =     83.1 : 1.0
             contains(&) = True             gold : grain  =     83.1 : 1.0
             contains(;) = True             gold : grain  =     59.4 : 1.0


### Testing

In [15]:
nltk.classify.accuracy(classifier,test_set)

0.6723404255319149

### Macro-averaged Evaluation

In [16]:
def f1(y_true,y_pred,label):
    assert len(y_true) == len(y_pred)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(y_true)):
        if y_true[i] == label:
            if y_pred[i] == label:
                tp += 1
            else:
                fn += 1
        elif y_pred[i] == label:
            fp += 1
        else:
            tn += 1
    try:
        r = tp/(tp+fn)
    except:
        r = 0.0
    try:
        p = tp/(tp+fp)
    except:
        p = 0.0
    try:
        f1 = 2*r*p/(r+p)
    except:
        f1 = 0.0
    return f1

In [17]:
predictions = [classifier.classify(f) for f,l in test_set]
predictions[:10]

['corn',
 'grain',
 'grain',
 'corn',
 'grain',
 'corn',
 'grain',
 'grain',
 'grain',
 'grain']

In [18]:
y_true = [l for f,l in test_set]
y_true[:10]

['corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn',
 'corn']

In [19]:
totalf1 = 0
for label in ('corn','gold','grain'):
    thef1 = f1(y_true,predictions,label)
    print('%s f1: %1.4f' % (label,thef1))
    totalf1 += thef1
print("Macro-average f1: %1.4f" % (totalf1/3))

corn f1: 0.3894
gold f1: 0.8235
grain f1: 0.7516
Macro-average f1: 0.6548


### Micro-averaged Evaluation

In [20]:
def f1(y_true,y_pred):
    assert len(y_true) == len(y_pred)
    labels = list(set(y_true))
    tp, tn, fp, fn = 0, 0, 0, 0
    for label in labels:
        for i in range(len(y_true)):
            if y_true[i] == label:
                if y_pred[i] == label:
                    tp += 1
                else:
                    fn += 1
            elif y_pred[i] == label:
                fp += 1
            else:
                tn += 1
    try:
        r = tp/(tp+fn)
    except:
        r = 0.0
    try:
        p = tp/(tp+fp)
    except:
        p = 0.0
    try:
        f1 = 2*r*p/(r+p)
    except:
        f1 = 0.0
    return f1

In [21]:
print("Micro-average f1: %1.4f" % f1(y_true,predictions))

Micro-average f1: 0.6723


### Evaluation using sklearn

In [22]:
from sklearn.metrics import f1_score

In [23]:
f1_score(y_true, predictions, average='macro')

0.6548479765554206

In [24]:
f1_score(y_true, predictions, average='micro')

0.67234042553191486