<pre>
<b>Contents</b>
1. Supervised Classification
    1.1   Gender Identification
    1.2   Choosing The Right Features
    1.3   Document Classification
    1.4   Part-of-Speech Tagging
    1.5   Exploiting Context
    1.6   Sequence Classification
    1.7   Other Methods for Sequence Classification
2. Further Examples of Supervised Classification  
    2.1   Sentence Segmentation
    2.2   Identifying Dialogue Act Types
    2.3   Recognizing Textual Entailment
    2.4   Scaling Up to Large Datasets
3. Evaluation
    3.1   The Test Set
    3.2   Accuracy
    3.3   Precision and Recall
    3.4   Confusion Matrices
    3.5   Cross-Validation
4. Decision Trees    
    4.1   Entropy and Information Gain
5. Naive Bayes Classifiers
    5.1   Underlying Probabilistic Model
    5.2   Zero Counts and Smoothing
    5.3   Non-Binary Features
    5.4   The Naivete of Independence
    5.5   The Cause of Double-Counting
6. Maximum Entropy Classifiers
    6.1   The Maximum Entropy Model
    6.2   Maximizing Entropy
    6.3   Generative vs Conditional Classifiers
7. Modeling Linguistic Patterns
    7.1   What do models tell us?
</pre>

In [29]:
import nltk

## 1.1   Gender Identification

In [30]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [31]:
gender_features('Shrek')

{'last_letter': 'k'}

In [32]:
from nltk.corpus import names

In [33]:
names.words('female.txt')[:5]

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi']

In [34]:
names.words('male.txt')[:5]

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot']

In [35]:
labeled_names = [(w, 'male') for w in names.words('male.txt')] + [(w, 'female') for w in names.words('female.txt')]

In [36]:
import random

In [37]:
random.shuffle(labeled_names)

In [38]:
labeled_names[:7]

[('Jens', 'male'),
 ('Janot', 'female'),
 ('Sebastien', 'male'),
 ('Talbot', 'male'),
 ('Clinten', 'male'),
 ('Shana', 'female'),
 ('Jeanelle', 'female')]

In [39]:
feutures = [(gender_features(w), g) for (w,g) in labeled_names]

In [40]:
feutures

[({'last_letter': 's'}, 'male'),
 ({'last_letter': 't'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'v'}, 'male'),
 ({'last_letter': 'y'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'd'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'b'}, 'male'),
 ({'last_letter': 'l'}, 'male'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'male'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 's'}, 'male'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'r'}, 'male'),
 ({'last_letter': 't'}, '

In [41]:
train_set, test_set = feutures[500:], feutures[:500]

In [42]:
clf = nltk.NaiveBayesClassifier.train(train_set)

In [44]:
clf.classify(gender_features('Neo'))

'male'

In [45]:
clf.classify(gender_features('Trinity'))

'female'

In [47]:
clf.classify(gender_features('Qnarik'))

'male'

In [48]:
nltk.classify.accuracy(clf, test_set)

0.786

In [49]:
clf.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     32.9 : 1.0
             last_letter = 'k'              male : female =     31.2 : 1.0
             last_letter = 'f'              male : female =     17.6 : 1.0
             last_letter = 'p'              male : female =     10.0 : 1.0
             last_letter = 'v'              male : female =     10.0 : 1.0


## 1.2   Choosing The Right Features

In [53]:
def gender_f2(word):
    features={}
    features['first_letter'] = word[0]
    features['last_letter'] = word[-1]
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = word.lower().count(letter)
        features["has({})".format(letter)] = (letter in word.lower())
    return features    

In [54]:
gender_f2('Neo')

{'first_letter': 'N',
 'last_letter': 'o',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 0,
 'has(h)': False,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [55]:
featuresets2 = [(gender_f2(n), gender) for (n, gender) in labeled_names]

In [56]:
train_set2, test_set2 = featuresets2[500:], featuresets2[:500]

In [58]:
clf2 = nltk.NaiveBayesClassifier.train(train_set2)

In [59]:
nltk.classify.accuracy(clf2, test_set2)

0.76

In [60]:
errors = []


In [78]:
for w in labeled_names:
    if clf2.classify(x(test_set2[0][1])) != test_set2[0][1]:
        errors.append((clf2.classify(gender_f2(test_set2[0][1])), test_set2[0][1], test_set2[0]))

SyntaxError: invalid syntax (<ipython-input-78-f7a340d1fa68>, line 3)

In [79]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [80]:
errors = []

In [81]:
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [82]:
errors

[('female', 'male', 'Darryl'),
 ('female', 'male', 'Lynnell'),
 ('female', 'male', 'Caryl'),
 ('male', 'female', 'Normie'),
 ('male', 'female', 'Jule'),
 ('male', 'female', 'Davie'),
 ('male', 'female', 'Tracy'),
 ('male', 'female', 'Lindsey'),
 ('male', 'female', 'Giovanni'),
 ('female', 'male', 'Aryn'),
 ('male', 'female', 'Hilary'),
 ('male', 'female', 'Teddie'),
 ('male', 'female', 'Tye'),
 ('female', 'male', 'Rosabel'),
 ('male', 'female', 'Judith'),
 ('male', 'female', 'Andrey'),
 ('male', 'female', 'Joseph'),
 ('male', 'female', 'Quiggly'),
 ('male', 'female', 'Chance'),
 ('female', 'male', 'Caryn'),
 ('female', 'male', 'Janean'),
 ('female', 'male', 'Kirstyn'),
 ('female', 'male', 'Floris'),
 ('male', 'female', 'Jody'),
 ('male', 'female', 'Henri'),
 ('male', 'female', 'Demetre'),
 ('male', 'female', 'Steve'),
 ('male', 'female', 'Sydney'),
 ('male', 'female', 'Stinky'),
 ('female', 'male', 'Madelin'),
 ('female', 'male', 'Rayshell'),
 ('female', 'male', 'Pegeen'),
 ('male', 'f