In [1]:
import pandas as pd
from tqdm import tqdm
import time
from collections import Counter as ctr

In [2]:
for i in tqdm(range(100)):
    time.sleep(0.2)

100%|██████████████████████████████████████████████████████████████████| 100/100 [00:20<00:00,  4.89it/s]


In [22]:
data = pd.read_csv('train.txt', delimiter=' ', names=['word', 'tag', 'tag2'])
test = data.sample(frac=0.2,random_state=100)
train = data.drop(test.index)

In [23]:
train

Unnamed: 0,word,tag,tag2
0,Confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP
...,...,...,...
211721,them,PRP,B-NP
211722,to,TO,B-PP
211724,Francisco,NNP,I-NP
211725,instead,RB,B-ADVP


In [24]:
tags = list(set(train.tag))
total = len(train)
smoother = float(1e-5)

len(tags), tags

(44,
 ['#',
  '.',
  ':',
  'CC',
  'JJ',
  'VB',
  '$',
  'POS',
  '(',
  'UH',
  'EX',
  'NNP',
  'WP$',
  'VBZ',
  'PRP',
  'VBD',
  'MD',
  'VBN',
  'VBP',
  'JJR',
  'WRB',
  ',',
  ')',
  'RP',
  'NN',
  'RB',
  'JJS',
  'VBG',
  'NNPS',
  "''",
  'FW',
  'PDT',
  'RBS',
  'TO',
  'CD',
  'SYM',
  'WDT',
  'DT',
  'IN',
  '``',
  'WP',
  'PRP$',
  'NNS',
  'RBR'])

### Estimate Transition Probabilities $P(T_i|T_{i-1})$

### Estimate Emission Probabilities $P(T_i|W_{i-1})$

In [25]:
#P(W|T)
word_tag_counts = {}
word_tag_totals = {}
for tag in tags:
    sub_train = train[train.tag == tag]
    word_tag_counts[tag] = ctr(sub_train.word)
    word_tag_totals[tag] = sum(word_tag_counts[tag].values())

def Pwt(W='', T=''):
    if W not in word_tag_counts[T]:
        return smoother
    return word_tag_counts[T][W] / word_tag_totals[T]
    
    
#P(T)
tag_counts = ctr(train.tag)
def Ptag(T=''):
    return tag_counts[T] / total

# P(W)
word_counts = ctr(train.word)
def Pword(W=''):
    if W not in word_counts:
        return smoother
    return word_counts[W] / total

def Ptw(T='', W=''):
    return Pwt(W, T) * Ptag(T) / Pword(W)

In [26]:
Ptag(T='DT')

0.08630787214698138

In [27]:
Pword(W='thebigllama')

1e-05

In [28]:
Pwt("the", "DT")

0.5001710103290239

In [29]:
Ptw("DT", "the")

0.998497883381128

In [30]:
def Pall_tags(word):
    return {t:Ptw(T=t, W=word) for t in tags}

In [31]:
ethe = Pall_tags('the')

In [32]:
max(ethe, key=ethe.get)

'DT'

### Greedy Decoder

In [33]:
example_sentence = ['the', 'dog', 'plays']

In [34]:
for word in example_sentence:
    emission = Pall_tags(word)
    max_tag = max(emission, key=emission.get)
    print(word, max_tag)
    

the DT
dog NN
plays VBZ


In [35]:
guesses = []
for i,row in tqdm(train.iterrows()):
    word=row['word']
    tag = row['tag']
    emission = Pall_tags(word)
    max_tag = max(emission, key=emission.get)
    guesses.append(max_tag)

169382it [00:03, 48216.75it/s]


In [36]:
train['guess'] = guesses

In [37]:
sum(train.guess == train.tag) / total

0.9652560484585139

In [38]:
def Guess(word):
    emission = Pall_tags(word)
    max_tag = max(emission, key=emission.get)
    return max_tag

def Accuracy(df):
    return sum([Guess(row['word']) == row.tag for i, row in tqdm(df.iterrows(), total=len(df))]) / len(df)

Accuracy(train)

100%|█████████████████████████████████████████████████████████| 169382/169382 [00:03<00:00, 45291.61it/s]


0.9652560484585139

In [39]:
Accuracy(test)

100%|███████████████████████████████████████████████████████████| 42345/42345 [00:00<00:00, 45032.06it/s]


0.9204156334868343

In [40]:
train['ptag'] = train['tag'].shift(1)

In [94]:
train

Unnamed: 0,word,tag,tag2,guess,ptag
0,Confidence,NN,B-NP,NN,
1,in,IN,B-PP,IN,NN
2,the,DT,B-NP,DT,IN
3,pound,NN,I-NP,NN,DT
4,is,VBZ,B-VP,VBZ,NN
...,...,...,...,...,...
211722,to,TO,B-PP,TO,PRP
211723,San,NNP,B-NP,NNP,TO
211724,Francisco,NNP,I-NP,NNP,NNP
211725,instead,RB,B-ADVP,RB,NNP
