In [3]:
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords
# NOTE: this binding is overwritten by the seqlearn import of the same name a few
# lines below, so the hmmlearn class is not the one later cells instantiate.
from hmmlearn.hmm import MultinomialHMM
import nltk
import string
import re
import seaborn as sns
import gc
import numpy as np
from scipy.sparse import csr_matrix

from seqlearn.evaluation import bio_f_score
# Rebinds MultinomialHMM: from here on the name refers to the seqlearn class.
from seqlearn.hmm import MultinomialHMM
from seqlearn.evaluation import SequenceKFold

import epam_nlp as hw

# Global seaborn styling applied to the figures drawn in this notebook.
sns.set_context('talk', rc={'figure.figsize': (22, 18)})
sns.set_style('darkgrid')

# Notebook magics: inline figures, plus autoreload so edited modules are picked up live.
%matplotlib inline 
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Locations of the input data, relative to the notebook directory.
DATA = Path('..') / 'data'
PROCESSED = DATA.joinpath('processed_voa.tsv')
# Every ASCII punctuation character as its own list element.
PUNCTUATION = [*string.punctuation]
# Name of the label column (IOB-encoded named-entity tags).
TARGET = 'iob_ner'

In [5]:
def get_X_y_lengths(df: pd.DataFrame, cols_to_keep=None, sequence_column='seq', target=TARGET, one_hot=False):
    """Split a token-level frame into features, integer labels and sequence lengths.

    Parameters
    ----------
    df : pd.DataFrame
        One row per token. ``df[target]`` must have a categorical dtype because
        the labels are read from its ``.cat.codes``.
    cols_to_keep : iterable of str, optional
        Names of the feature columns. ``target`` is always excluded. The
        argument is copied and never modified. ``None`` selects no columns.
    sequence_column : str or list of str
        Column(s) identifying the sequence each row belongs to.
    target : str
        Name of the label column.
    one_hot : bool
        When true, features are one-hot encoded with ``pd.get_dummies``
        (sparse, boolean columns); otherwise the raw values are returned.

    Returns
    -------
    X : pd.DataFrame or np.ndarray
        One-hot frame when ``one_hot`` is true; otherwise a 2-D array of values,
        flattened to 1-D unless more than one feature column is kept.
    y : np.ndarray
        Category codes of ``df[target]``.
    lengths : np.ndarray
        Row count of each sequence, in order of first appearance in ``df``.
        Presumably rows of one sequence are expected to be contiguous so that
        the lengths line up with ``X``/``y`` -- confirm against callers.
    """
    if isinstance(sequence_column, str):
        sequence_column = [sequence_column]

    # Build a private set: the original `{}` default was a dict (set - dict raises
    # TypeError) and `.remove(target)` mutated the set owned by the caller.
    feature_cols = set() if cols_to_keep is None else set(cols_to_keep)
    feature_cols.discard(target)

    y = df[target].cat.codes.values.copy()

    # size() counts rows; count() on the first column silently skips NaN cells.
    # observed=True keeps unused categories of a categorical key from showing up
    # as zero-length sequences.
    lengths = df.groupby(sequence_column, sort=False, observed=True).size().values

    cols_to_drop = [col for col in df.columns if col not in feature_cols]
    X = df.drop(columns=cols_to_drop)
    if one_hot:
        # np.bool was a deprecated alias of the builtin and is gone in NumPy >= 1.24.
        X = pd.get_dummies(X, dtype=bool, sparse=True)
    else:
        X = X.values if X.shape[1] > 1 else np.ravel(X.values)

    return X, y, lengths

In [6]:
def get_cv(X: np.ndarray, y: np.ndarray, lengths=None, n_folds=5):
    """Yield train/test splits that keep whole sequences together.

    Parameters
    ----------
    X : array-like indexable by an integer index array
        Feature rows, one per token.
    y : np.ndarray
        Labels aligned with the rows of ``X``.
    lengths : array-like of int, optional
        Length of every sequence. When omitted, all rows are treated as one
        single sequence.
    n_folds : int
        Number of folds passed to ``SequenceKFold``.

    Yields
    ------
    tuple
        ``(X_train, train_lengths, y_train, X_test, test_lengths, y_test)``
        for each fold.
    """
    if lengths is None:
        # SequenceKFold needs a sequence of lengths; a bare int is not one.
        lengths = [X.shape[0]]
    kf = SequenceKFold(lengths=lengths, n_folds=n_folds)
    for (train_ind, train_lengths, test_ind, test_lengths) in kf:
        yield X[train_ind], train_lengths, y[train_ind], X[test_ind], test_lengths, y[test_ind]

# Basic processing

In [None]:
# Read the tab-separated corpus and display five random rows.
df = pd.read_csv(PROCESSED, delimiter='\t')
df.sample(n=5)

In [None]:
# Keep only the rows whose token is not a single punctuation character.
df_without_punctuation = df.loc[~df['token'].isin(PUNCTUATION)].copy()

In [None]:
# Despite its name, stopwords_index is True for rows that are NOT English stopwords.
stopwords_index = ~(
    df_without_punctuation['lemma']
    .str.lower()
    .isin(stopwords.words('english'))
)
df_without_stopwords = df_without_punctuation.loc[stopwords_index].copy()
df_without_stopwords.shape[0]

In [None]:
# Adds a 'shape' column by mapping word_shape over the tokens of the unfiltered df;
# the assignment aligns on the index, so only rows still present in
# df_without_stopwords receive a value.
# NOTE(review): word_shape is neither defined nor imported in the visible part of this
# notebook -- presumably it lives in the epam_nlp module imported as hw; confirm.
df_without_stopwords['shape'] = df.token.map(word_shape)

In [None]:

# Adds a 'stem' column computed from each lemma.
# NOTE(review): neither stem nor stemmer is defined or imported in the visible part of
# this notebook -- presumably they come from epam_nlp (hw) or nltk; confirm.
df_without_stopwords['stem'] = df_without_stopwords.lemma.map(lambda w: stem(w, stemmer))

In [None]:
# Persist the cleaned frame (header row, no index column) for the modelling section.
df_without_stopwords.to_csv(
    DATA.joinpath('basic_processing.csv'),
    index=False,
    header=True,
)

# EDA

In [None]:
# Count of tokens per IOB tag, including the dominant 'O' class.
g = sns.catplot(
    data=df_without_stopwords,
    x='iob_ner',
    kind='count',
    color='b',
    aspect=1.5,
)
g.set_xticklabels(rotation=45);

In [None]:
# Same count plot restricted to entity tags, i.e. every row whose tag is not 'O'.
g = sns.catplot(
    data=df_without_stopwords.loc[df_without_stopwords['iob_ner'].ne('O')],
    x='iob_ner',
    kind='count',
    color='b',
    aspect=1.5,
)
g.set_xticklabels(rotation=45);

# Baseline models

## HMM

In [7]:
# Reload the frame written by the basic-processing section and show one random row.
train = pd.read_csv(DATA.joinpath('basic_processing.csv'))
train.sample(n=1)

Unnamed: 0,token,pos,lemma,iob_ner,part,document,sentence,shape,stem
710617,Iran,NNP,iran,B-geo,p98,d0119,3,Xxxx,iran


In [7]:
# Give every (document, part) pair an integer sequence id. ngroup() is the public
# equivalent of the internal `.grouper.group_info[0]` codes, which newer pandas
# releases deprecate.
train['seq'] = train.groupby(['document', 'part']).ngroup().values
# Cast every column to categorical; get_X_y_lengths reads the target via `.cat.codes`.
train = train.astype('category')

In [8]:
# Number of distinct values per text representation: stem, raw token, lemma.
for column in ('stem', 'token', 'lemma'):
    print(len(train[column].unique()))

23879
34898
27073


In [20]:
# One-hot encode the raw token as the only feature; labels and sequence lengths come along.
X, y, lengths = get_X_y_lengths(
    df=train,
    cols_to_keep={'token'},
    one_hot=True,
)

In [21]:
# Convert the one-hot frame to a SciPy CSR matrix so it can be row-indexed per fold.
# Current pandas exposes the COO conversion through the `.sparse` accessor; the
# frame-level `to_coo()` only existed on the legacy SparseDataFrame, so it is kept
# purely as a fallback for old pandas versions.
X_sparse = csr_matrix(X.sparse.to_coo() if hasattr(X, 'sparse') else X.to_coo())

In [22]:
# 7-fold cross-validation of the HMM baseline; folds are built from whole sequences.
cv = SequenceKFold(lengths=lengths, n_folds=7)
# Label names used to turn integer codes back into IOB tag strings for bio_f_score.
label_names = train[TARGET].cat.categories
scores, fscores = [], []
print('Fold\tAccuracy\tF-score')
for fold, (train_ind, train_len, test_ind, test_len) in enumerate(cv, start=1):
    train_x, train_y = X_sparse[train_ind], y[train_ind]
    test_x, test_y = X_sparse[test_ind], y[test_ind]

    hmm = MultinomialHMM()
    hmm.fit(train_x, y=train_y, lengths=train_len)

    score = hmm.score(test_x, test_y, test_len)
    pred = hmm.predict(test_x, test_len)

    str_true = np.asarray(pd.Categorical.from_codes(test_y, label_names), dtype=str)
    str_pred = np.asarray(pd.Categorical.from_codes(pred, label_names), dtype=str)
    fscore = bio_f_score(str_true, str_pred)

    scores.append(score)
    fscores.append(fscore)
    print(f'{fold}\t{score}\t{fscore}')
print(f'CV accuracy: {np.mean(scores)}')

print(f'CV bio f-score: {np.mean(fscores)}')

Fold	Accuracy	F-score
1	0.854728880119165	0.5355923788245516
2	0.855773879745489	0.5421960351071012
3	0.8530248490478403	0.531870590794667
4	0.8518389570700476	0.5247295691865191
5	0.8571318001954316	0.5334540007489702
6	0.8575255040371319	0.5370654239990189
7	0.8574758671386842	0.5441435865697045
CV accuracy: 0.8553571053362558
CV bio f-score: 0.5355787978900761


In [5]:
# NOTE(review): y_test is never assigned in the visible notebook, and the recorded output
# of this cell is a NameError. Possibly the CV loop's test_y was meant -- confirm or
# delete this leftover cell.
y_test.shape

NameError: name 'y_test' is not defined