In [89]:
# library needed 
from nltk.corpus import names,movie_reviews,brown
import random
import nltk

In [2]:
def gender_features(word):
    return {'last_letter':word[-1]}

In [3]:
gender_features('Sherk')

{'last_letter': 'k'}

In [4]:
names = ([(name,'male') for name in names.words('male.txt')] + 
         [(name,'female') for name in names.words('female.txt')])

In [5]:
random.shuffle(names)

In [6]:
names[:10]

[('Derk', 'male'),
 ('Marthe', 'female'),
 ('Ada', 'female'),
 ('Bessy', 'female'),
 ('Gabriello', 'male'),
 ('Maryellen', 'female'),
 ('Adriana', 'female'),
 ('Bobine', 'female'),
 ('Amabel', 'female'),
 ('Emlyn', 'female')]

In [7]:
featuresets = [(gender_features(n),g) for (n,g) in names]

In [8]:
featuresets

[({'last_letter': 'k'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 'o'}, 'male'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 's'}, 'female'),
 ({'last_letter': 'h'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'n'}, 'male'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'd'}, 'male'),
 ({'last_letter': 'm'}, 'male'),
 ({'last_letter': 'a'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'y'}, 'female'),
 ({'last_letter': 's'}, 'female'),
 ({'last_letter': 'n'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'e'}, 'female'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'i'}, 'female'),
 ({'last_letter': 'e'}, 'male'

In [9]:
len(featuresets)

7944

In [10]:
train_set,test_set = featuresets[500:],featuresets[:500]

In [11]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
classifier.classify(gender_features('Neo'))

'male'

In [13]:
classifier.classify(gender_features('Ketul'))

'male'

In [14]:
classifier.classify(gender_features('palak'))

'male'

In [15]:
documents = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [16]:
random.shuffle(documents)

In [17]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [18]:
word_features = list(all_words.keys())[:2000]

In [19]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)'%word ] = (word in document_words)
    return features

In [31]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [32]:
featuresets = [(document_features(d),c) for (d,c) in documents]
train_set,test_set = featuresets[100:],featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [26]:
nltk.classify.accuracy(classifier,test_set)

0.87

In [27]:
classifier.show_most_informative_features(5)


Most Informative Features
     contains(stretched) = True              neg : pos    =     10.1 : 1.0
           contains(ugh) = True              neg : pos    =      9.5 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.2 : 1.0
        contains(welles) = True              neg : pos    =      7.5 : 1.0
         contains(groan) = True              neg : pos    =      6.9 : 1.0


In [28]:
from nltk.classify import apply_features

In [29]:
trainset = apply_features(gender_features, names[500:])

In [39]:
testset = apply_features(gender_features,names[:500])
testset[:20]

[({'last_letter': 'k'}, 'male'), ({'last_letter': 'e'}, 'female'), ...]

In [33]:
def gender_features2(name):
    features = {}
    features['first_name'] = name[0].lower()
    features['last_name'] = name[1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)'%letter] = name.lower().count(letter)
        features['has(%s)'%letter] = (letter in name.lower())
    return features

In [38]:
gender_features2('lol hi')

{'first_name': 'l',
 'last_name': 'o',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 1,
 'has(i)': True,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 2,
 'has(l)': True,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [40]:
featuresets = [(gender_features2(n),g) for (n,g) in names]
train_set,test_set = featuresets[500:],featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [42]:
nltk.classify.accuracy(classifier,test_set)

0.7

In [43]:
train_names = names[1500:]
devtest_names = names[500:1500]
test_names = names[:500]

In [44]:
train_set = [(gender_features(n),g) for (n,g) in train_names]
devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
test_set  = [(gender_features(n),g) for (n,g) in test_names]

In [45]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [46]:
classifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x7f4c0e84f160>

In [47]:
nltk.classify.accuracy(classifier,devtest_set)

0.782

In [48]:
errors = []
for(name,tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag,guess,name))

In [54]:
for tag,guess,name in sorted(errors):
    print(f'name : {name:12} guess ={guess:8} real = {name:8}')

name : Aimil        guess =male     real = Aimil   
name : Anett        guess =male     real = Anett   
name : Angil        guess =male     real = Angil   
name : Ardys        guess =male     real = Ardys   
name : Ashlen       guess =male     real = Ashlen  
name : Austin       guess =male     real = Austin  
name : Ayn          guess =male     real = Ayn     
name : Beatriz      guess =male     real = Beatriz 
name : Bess         guess =male     real = Bess    
name : Bridgett     guess =male     real = Bridgett
name : Brit         guess =male     real = Brit    
name : Britt        guess =male     real = Britt   
name : Caitlin      guess =male     real = Caitlin 
name : Cameo        guess =male     real = Cameo   
name : Charlott     guess =male     real = Charlott
name : Charmain     guess =male     real = Charmain
name : Christean    guess =male     real = Christean
name : Ciel         guess =male     real = Ciel    
name : Coreen       guess =male     real = Coreen  
name : Cybi

In [52]:
len(devtest_set),len(errors)

(1000, 218)

# Document Classification

In [55]:
from nltk.corpus import movie_reviews

In [56]:
documents = [(list(movie_reviews.words(fileid)),category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)        
            ]

In [57]:
random.shuffle(documents)

In [65]:
documents[3][:1]

(['all',
  'through',
  'its',
  'production',
  'and',
  'into',
  'the',
  'early',
  'days',
  'of',
  'its',
  'initial',
  ',',
  'aborted',
  'pre',
  '-',
  'release',
  'publicity',
  ',',
  'hard',
  'rain',
  'bore',
  'the',
  'appropriate',
  'moniker',
  'of',
  'the',
  'flood',
  '.',
  'ultimately',
  ',',
  'however',
  ',',
  'paramount',
  'pictures',
  ',',
  'nervous',
  'that',
  'this',
  'movie',
  'would',
  'be',
  'confused',
  'with',
  '1996',
  "'",
  's',
  'other',
  ',',
  'underperforming',
  'disaster',
  'films',
  '(',
  'dante',
  "'",
  's',
  'peak',
  ',',
  'volcano',
  ')',
  ',',
  'changed',
  'the',
  'title',
  'and',
  'shifted',
  'the',
  'release',
  'date',
  'by',
  'nearly',
  'a',
  'year',
  '.',
  'but',
  ',',
  'to',
  'paraphrase',
  'the',
  'bard',
  ',',
  'swill',
  ',',
  'by',
  'any',
  'other',
  'name',
  ',',
  'would',
  'smell',
  'as',
  'rank',
  '.',
  'no',
  'number',
  'of',
  'name',
  'changes',
  'can',
  

In [71]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())


TypeError: unhashable type: 'slice'

In [72]:
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [70]:
word_features = list(all_words.keys())[:2000]
word_features[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [73]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

In [76]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [86]:
featuresets = [(document_features(d),c) for (d,c) in documents]


train_set,test_set = featuresets[100:],featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [87]:
nltk.classify.accuracy(classifier,test_set)

0.77

In [88]:
classifier.show_most_informative_features(5)

Most Informative Features
     contains(atrocious) = True              neg : pos    =     11.8 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.4 : 1.0
    contains(schumacher) = True              neg : pos    =      7.5 : 1.0
        contains(turkey) = True              neg : pos    =      6.9 : 1.0
       contains(singers) = True              pos : neg    =      6.3 : 1.0


# Part-of-Speech Tagging

In [96]:
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1 

In [100]:
common_suffixes = list(suffix_fdist.keys())[:100]

In [101]:
common_suffixes

['e',
 'he',
 'the',
 'n',
 'on',
 'ton',
 'y',
 'ty',
 'nty',
 'd',
 'nd',
 'and',
 'ry',
 'ury',
 'id',
 'aid',
 'ay',
 'day',
 'an',
 'ion',
 'f',
 'of',
 's',
 "'s",
 "a's",
 't',
 'nt',
 'ent',
 'ary',
 'ed',
 'ced',
 '`',
 '``',
 'o',
 'no',
 'ce',
 'nce',
 "'",
 "''",
 'at',
 'hat',
 'ny',
 'any',
 'es',
 'ies',
 'k',
 'ok',
 'ook',
 'ace',
 '.',
 'r',
 'er',
 'her',
 'in',
 'end',
 'ts',
 'nts',
 'ity',
 've',
 'ive',
 'ee',
 'tee',
 ',',
 'h',
 'ch',
 'ich',
 'ad',
 'had',
 'l',
 'll',
 'all',
 'ge',
 'rge',
 'ves',
 'se',
 'ise',
 'ks',
 'nks',
 'a',
 'ta',
 'nta',
 'or',
 'for',
 'ner',
 'as',
 'was',
 'ted',
 'ber',
 'm',
 'rm',
 'erm',
 'en',
 'een',
 'ged',
 'by',
 'ior',
 'rt',
 'urt',
 'dge',
 'od']

In [102]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)'%suffix] = word.lower().endswith(suffix)
    return features

In [106]:
pos_features('adf')

{'endswith(e)': False,
 'endswith(he)': False,
 'endswith(the)': False,
 'endswith(n)': False,
 'endswith(on)': False,
 'endswith(ton)': False,
 'endswith(y)': False,
 'endswith(ty)': False,
 'endswith(nty)': False,
 'endswith(d)': False,
 'endswith(nd)': False,
 'endswith(and)': False,
 'endswith(ry)': False,
 'endswith(ury)': False,
 'endswith(id)': False,
 'endswith(aid)': False,
 'endswith(ay)': False,
 'endswith(day)': False,
 'endswith(an)': False,
 'endswith(ion)': False,
 'endswith(f)': True,
 'endswith(of)': False,
 'endswith(s)': False,
 "endswith('s)": False,
 "endswith(a's)": False,
 'endswith(t)': False,
 'endswith(nt)': False,
 'endswith(ent)': False,
 'endswith(ary)': False,
 'endswith(ed)': False,
 'endswith(ced)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(o)': False,
 'endswith(no)': False,
 'endswith(ce)': False,
 'endswith(nce)': False,
 "endswith(')": False,
 "endswith('')": False,
 'endswith(at)': False,
 'endswith(hat)': False,
 'endswith(ny

In [104]:
tagged_words = brown.tagged_words(categories='news')

In [113]:
featuresets = [(pos_features(n),g) for(n,g) in tagged_words]
featuresets[:2]

[({'endswith(e)': True,
   'endswith(he)': True,
   'endswith(the)': True,
   'endswith(n)': False,
   'endswith(on)': False,
   'endswith(ton)': False,
   'endswith(y)': False,
   'endswith(ty)': False,
   'endswith(nty)': False,
   'endswith(d)': False,
   'endswith(nd)': False,
   'endswith(and)': False,
   'endswith(ry)': False,
   'endswith(ury)': False,
   'endswith(id)': False,
   'endswith(aid)': False,
   'endswith(ay)': False,
   'endswith(day)': False,
   'endswith(an)': False,
   'endswith(ion)': False,
   'endswith(f)': False,
   'endswith(of)': False,
   'endswith(s)': False,
   "endswith('s)": False,
   "endswith(a's)": False,
   'endswith(t)': False,
   'endswith(nt)': False,
   'endswith(ent)': False,
   'endswith(ary)': False,
   'endswith(ed)': False,
   'endswith(ced)': False,
   'endswith(`)': False,
   'endswith(``)': False,
   'endswith(o)': False,
   'endswith(no)': False,
   'endswith(ce)': False,
   'endswith(nce)': False,
   "endswith(')": False,
   "endswith

In [129]:
size = int(len(featuresets) * 0.1)
train_set , test_set = featuresets[size:],featuresets[:size]

In [130]:
classifier = nltk.DecisionTreeClassifier.train(train_set)

KeyboardInterrupt: 

In [None]:
nltk.classify.accuracy(classifier,test_set)

In [None]:
print(classifier.pseudocode(depth=4))

In [None]:
def pos_features(sentence,i):
    features = {'suffix(1)':sentence[i][-1:],
                'suffix(2)':sentence[i][-2:],
                'suffix(3)':sentence[i][-3:]}
    
    if i == 0:
        features['prev-word'] = '<START>'
    
    else:
        features['prev-word'] = sentence[i-1]
    
    return features

In [None]:
pos_features(brown.sents()[0],8)

In [None]:
tagged_sents = brown.tagged_sents(categories='news')

In [127]:
featuresets = []

In [128]:
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i,(word,tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent,i),tag))

In [131]:
size = int(len(featuresets) * 0.1)
train_set,test_set = featuresets[:size],featuresets[size:]

In [132]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [133]:
nltk.classify.accuracy(classifier,test_set)

0.699477342291075

In [134]:
def pos_features(sentence,i,history):
    features = {'suffix(1)':sentence[i][-1:],
                'suffix(2)':sentence[i][-2:],
                'suffix(3)':sentence[i][-3:]}
    
    if i == 0:
        features['prev-word'] = '<START>'
        features['prev-tag'] = '<START>'
    else:
        features['prev-word'] = sentence[i-1]
        features['prev-tag'] = history[i-1]
    
    return features

In [137]:
class ConsecutivePosTagger(nltk.TaggerI):
    
    def __init__(self,train_sents):
        train_set = []
        
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i,(word,tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent,i,history)
                train_set.append((featureset,tag))
                history.append(tag)
        
        self.classifier  = nltk.NaiveBayesClassifier.train(train_set)
        
    def tag(self,sentence):
        history = []
        for i,word in enumerate(sentence):
            featureset = pos_features(sentence,i,history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence,history)

In [138]:
help(nltk.tag.untag)

Help on function untag in module nltk.tag.util:

untag(tagged_sentence)
    Given a tagged sentence, return an untagged version of that
    sentence.  I.e., return a list containing the first element
    of each tuple in *tagged_sentence*.
    
        >>> from nltk.tag.util import untag
        >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')])
        ['John', 'saw', 'Mary']



In [139]:
tagged_sents = brown.tagged_sents(categories= 'news')
size = int(len(tagged_sents) * 0.1)

In [140]:
train_sents,test_sents = tagged_sents[size:],tagged_sents[:size]

In [141]:
tagger = ConsecutivePosTagger(train_sents)

In [142]:
tagger.evaluate(test_sents)

0.7980528511821975

# Other Methods for Sequence Classification

In [178]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundries = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundries.add(offset-1)

In [179]:
list(boundries)[1:7]

[90116, 16389, 40968, 81929, 24587, 16396]

In [180]:
tokens

['.',
 'START',
 'Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov',
 '.',
 '29',
 '.',
 'Mr',
 '.',
 'Vinken',
 'is',
 'chairman',
 'of',
 'Elsevier',
 'N',
 '.',
 'V',
 '.,',
 'the',
 'Dutch',
 'publishing',
 'group',
 '.',
 '.',
 'START',
 'Rudolph',
 'Agnew',
 ',',
 '55',
 'years',
 'old',
 'and',
 'former',
 'chairman',
 'of',
 'Consolidated',
 'Gold',
 'Fields',
 'PLC',
 ',',
 'was',
 'named',
 'a',
 'nonexecutive',
 'director',
 'of',
 'this',
 'British',
 'industrial',
 'conglomerate',
 '.',
 '.',
 'START',
 'A',
 'form',
 'of',
 'asbestos',
 'once',
 'used',
 'to',
 'make',
 'Kent',
 'cigarette',
 'filters',
 'has',
 'caused',
 'a',
 'high',
 'percentage',
 'of',
 'cancer',
 'deaths',
 'among',
 'a',
 'group',
 'of',
 'workers',
 'exposed',
 'to',
 'it',
 'more',
 'than',
 '30',
 'years',
 'ago',
 ',',
 'researchers',
 'reported',
 '.',
 'The',
 'asbestos',
 'fiber',
 ',',
 'crocidolit

In [181]:
sents

[['.', 'START'], ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.'], ...]

In [182]:
a = [1324,24]


In [183]:
a.extend([23])
a.append(555)
a

[1324, 24, 23, 555]

In [184]:
def punct_features(tokens,i):
    return {
        'next-word-capitalized':tokens[i+1][0].isupper(),
        'prevword':tokens[i-1].lower(),
        'punct':tokens[i],
        'prev-word-is-one-char':len(tokens[i-1]) == 1        
    }

In [186]:
featuresets = [(punct_features(tokens,i),(i in boundries)) 
               for i in range(1,len(tokens)-1) 
               if tokens[i] in '.?!']

In [209]:
def train_test(featuresets):
    size = int(len(featuresets) * 0.1)
    return featuresets[size:],featuresets[:size]
train_set,test_set = train_test(featuresets)
classifier  = nltk.NaiveBayesClassifier.train(train_set)

In [207]:
nltk.classify.accuracy(classifier,test_set)

0.668

In [208]:
def segment_sentences(words):
    start = 0
    sents = []
    for i,word in words:
        if word in '.?!' and classifier.classify(words,i) == True:
            sents.append(words[start:i+1])
            start = i+1
    
    if start <len(words) :
        sents.append(words[start:])

# Idetentifying Dialogue Act Types

In [204]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
posts

[<Element 'Post' at 0x7f4c1ede7cc8>, <Element 'Post' at 0x7f4c1133e098>, ...]

In [203]:
posts[78].text

"and i don't complain about things being hard very often."

In [198]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)'%word.lower()] =  True
    return features

In [199]:
featuresets = [(dialogue_act_features(post.text),post.get('class'))
               for post in posts]

In [210]:
train_set,test_set = train_test(featuresets)

In [211]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [212]:
nltk.classify.accuracy(classifier,test_set)

0.668

In [234]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hype_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [216]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

In [219]:
rtepair.text

'Parviz Davudi was representing Iran at a meeting of the Shanghai Co-operation Organisation (SCO), the fledgling association that binds Russia, China and four former Soviet republics of central Asia together to fight terrorism.'

In [220]:
rtepair

<RTEPair: gid=3-34>

In [221]:
rtepair.hyp

'China is a member of SCO.'

In [230]:
extractor = nltk.RTEFeatureExtractor(rtepair)

In [231]:
extractor.text_words

{'Asia',
 'China',
 'Co',
 'Davudi',
 'Iran',
 'Organisation',
 'Parviz',
 'Russia',
 'SCO',
 'Shanghai',
 'Soviet',
 'association',
 'at',
 'binds',
 'central',
 'fight',
 'fledgling',
 'former',
 'four',
 'meeting',
 'operation',
 'representing',
 'republics',
 'terrorism.',
 'that',
 'together',
 'was'}

In [233]:
extractor.hyp_words

{'China', 'SCO.', 'member'}

In [223]:
nltk.RTEFeatureExtractor(rtepair).overlap('word')

set()

In [227]:
nltk.RTEFeatureExtractor(rtepair).hyp_extra('word')

{'member'}

In [236]:
nltk.RTEFeatureExtractor(rtepair).overlap('ne')

{'China'}

In [229]:
nltk.RTEFeatureExtractor(rtepair).hyp_extra('ne')

{'SCO.'}

In [237]:
help(extractor.overlap)

Help on method overlap in module nltk.classify.rte_classify:

overlap(toktype, debug=False) method of nltk.classify.rte_classify.RTEFeatureExtractor instance
    Compute the overlap between text and hypothesis.
    
    :param toktype: distinguish Named Entities from ordinary words
    :type toktype: 'ne' or 'word'



In [238]:
help(extractor.hyp_extra)

Help on method hyp_extra in module nltk.classify.rte_classify:

hyp_extra(toktype, debug=True) method of nltk.classify.rte_classify.RTEFeatureExtractor instance
    Compute the extraneous material in the hypothesis.
    
    :param toktype: distinguish Named Entities from ordinary words
    :type toktype: 'ne' or 'word'



# 6.3 Evaluation

In [240]:
tagged_sents = list(brown.tagged_sents(categories = 'news'))

In [241]:
random.shuffle(tagged_sents)

In [255]:
train_set,test_set = train_test(tagged_sents)


In [244]:
file_ids = brown.fileids(categories='news')

In [245]:
train_set,test_set = train_test(file_ids)

In [257]:
train_set = brown.tagged_sents(categories='news')
test_set = brown.tagged_sents(categories='fiction')

 # Confusion Matrix

In [260]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word,tag) in sent]

def apply_tagger(tagger,corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [266]:
gold = tag_list(brown.tagged_sents(categories = 'editorial'))
# test = tag_list(apply_tagger(t2,brown.tagged_sents(categories='editorial')))
# cm = nltk.confusionmatrix(gold,test)