In [1]:
import pandas as pd
import nltk
from itertools import chain
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import sklearn


In [2]:
filename ="Hotels_train_phrase_below_reviews_data.xlsx"
df = pd.read_excel(filename,"phrases")

In [3]:
df.head()

Unnamed: 0,theme,index,sentiment,Phrases,Unnamed: 4
0,Game drive experience,10,positive,Game Drives are the best,5
1,Quality of food,1,positive,the food was delicious,4
2,Safari experience,17,Neutral,Africa in the past 35 years and never actually...,29
3,Safari experience,17,Neutral,Africa in the past 35 years and never actually...,29
4,Quality of food,1,positive,"peaceful lunch on the lawn,",5


In [4]:
from nltk import word_tokenize
def extract_pos(sent):
    try:
        tokens = word_tokenize(sent)
        return nltk.pos_tag(tokens)
    except:
        import pdb
        pdb.set_trace()

In [5]:
import pdb
def extract_BIO2_pos(sent):
    try:
        tokens = word_tokenize(sent['Phrases'])
        pos_tags = nltk.pos_tag(tokens)
        counter = 1
        BIO2 = []
        for token in tokens:
            pos = (nltk.pos_tag([token]))[0][1]
            if counter == 1:
                theme = "B-"+str(sent['index'])
                BIO2.append((token, pos, theme))
                counter = 0
            else:
                pos = (nltk.pos_tag([token]))[0][1]
                theme = "I-"+str(sent['index'])
                BIO2.append((token, pos, theme))
#                 BIO2token, pos, theme
    except:
        pdb.set_trace()
    return BIO2

In [6]:
pos_tags = df[['Phrases','index']].apply(extract_BIO2_pos,axis=1)

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]    

In [8]:
%%time
pos_tags = pos_tags.tolist()
train_sents = pos_tags[:4960]
test_sents = pos_tags[4961:]
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

CPU times: user 365 ms, sys: 36.6 ms, total: 401 ms
Wall time: 406 ms


In [9]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [10]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})


In [11]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [12]:
trainer.train('hotels.crfsuite')

In [13]:
tagger = pycrfsuite.Tagger()
tagger.open('hotels.crfsuite')

<contextlib.closing at 0x116171590>

In [14]:
example_sent = test_sents[0]
# print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

('Predicted:', 'B-3 I-3 I-3 I-3 I-3 I-3')
('Correct:  ', 'B-3 I-3 I-3 I-3 I-3 I-3')


In [18]:
from sklearn.preprocessing import LabelBinarizer
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [19]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [20]:
print(bio_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        B-0       0.57      0.77      0.66        53
        I-0       0.62      0.77      0.69       430
        B-1       0.46      1.00      0.63        31
        I-1       0.40      1.00      0.57       136
       B-10       0.61      0.69      0.65        36
       I-10       0.62      0.69      0.65       318
       B-11       0.52      0.44      0.48        55
       I-11       0.49      0.36      0.42       350
       B-12       0.82      0.74      0.78        50
       I-12       0.74      0.68      0.71       196
       B-13       0.81      0.71      0.75        41
       I-13       0.83      0.71      0.77       417
       B-14       0.59      0.59      0.59        22
       I-14       0.64      0.63      0.63       187
       B-15       0.93      0.62      0.74        42
       I-15       0.90      0.66      0.77       280
       B-16       0.88      0.25      0.39        28
       I-16       0.86      0.23      0.36   

  'precision', 'predicted', average, warn_for)


In [21]:
y_pred[0]

['B-3', 'I-3', 'I-3', 'I-3', 'I-3', 'I-3']

In [22]:
def removeBIO_and_merge(y_values):
    all_tags = []
    for values_ in y_values:
        set_values = set()
        for value in values_:
            value = int(value[2:])
            set_values.add(value)
        all_tags.append(list(set_values))
        
    return all_tags

In [23]:
y_pred = removeBIO_and_merge(y_pred)
y_test = removeBIO_and_merge(y_test)
mlb = MultiLabelBinarizer()
y_test_lb = mlb.fit_transform((y_test))
y_pred_lb = mlb.transform((y_pred))

In [24]:
len(y_pred_lb)
print classification_report(y_test_lb,y_pred_lb)

             precision    recall  f1-score   support

          0       0.57      0.77      0.66        53
          1       0.46      1.00      0.63        31
          2       0.53      0.85      0.65        86
          3       0.40      0.71      0.51       116
          4       0.81      0.74      0.77        23
          5       0.52      0.63      0.57        97
          6       0.92      0.79      0.85        75
          7       0.85      0.63      0.72        54
          8       0.88      0.66      0.75        65
          9       0.87      0.87      0.87        75
         10       0.62      0.72      0.67        36
         11       0.51      0.44      0.47        55
         12       0.83      0.78      0.80        50
         13       0.81      0.71      0.75        41
         14       0.59      0.59      0.59        22
         15       0.90      0.64      0.75        42
         16       0.78      0.25      0.38        28
         17       0.83      0.70      0.76   

In [25]:
len(y_test_lb)

1171

In [None]:
list(chain.from_iterable([[1,2],[3]]))