In [173]:
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers,CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics
from sklearn.model_selection import cross_val_predict, cross_val_score

import pandas as pd
import numpy as np
import csv

In [174]:
def ez_pandas_reader(fname):
    df = pd.read_csv(fname)
    df = df.set_index("id")
    return df

In [175]:
def convert(path):
    vals = []
    with open(path, 'r',) as f:
        reader = csv.reader(f, delimiter='\t')
        temp = []
        next(reader)
        for line in enumerate(reader):
            if not line[1]:
                vals.append(temp)
                temp = []
            else:
                t = [ '{}'.format(x) for x in list(csv.reader([line[1][0]], delimiter=',', quotechar='"'))[0]]
                x = [t[1], "", t[2]]
                temp.append(tuple(x))
        vals.append(temp)
    return vals

In [176]:
def test_convert(path):
    vals = []
    with open(path, 'r',) as f:
        reader = csv.reader(f, delimiter='\t')
        temp = []
        next(reader)
        for line in enumerate(reader):
            if not line[1]:
                vals.append(temp)
                temp = []
            else:
                t = [ '{}'.format(x) for x in list(csv.reader([line[1][0]], delimiter=',', quotechar='"'))[0] ]
                temp.append(tuple([t[1],""]))
        vals.append(temp)
    return vals

In [177]:
train = convert("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_twitter_ner_data/train.csv")

In [178]:
validate = convert("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_twitter_ner_data/validation.csv")

In [179]:
test = test_convert("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_twitter_ner_data/test_noans.csv")

In [180]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.isspace=%s' % word.isspace(),
        'word.isupper=%s' % word.isupper(),
        'word.islower=%s' % word.islower(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    features.extend([
        'word.lower=' + word.lower(),
        'word.upper=' + word.upper(),
        'word.title=' + word.title(),
        'word[:2]=' + word[:2],
        'word[:3]=' + word[:3],
        'word[:4]=' + word[:4],
        'word[-4:]=' + word[-4:],
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
    ])
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    if i > 1:
        word2 = sent[i-2][0]
        postag2 = sent[i-2][1]
        
        # Features for the previous two words
        features.extend([
            '-2:word.lower=' + word2.lower(),
            '-2:word.istitle=%s' % word2.istitle(),
            '-2:word.isupper=%s' % word2.isupper(),
            '-2:postag=' + postag2,
            '-2:postag[:2]=' + postag2[:2],
        ])
    if i < len(sent) - 2:
        word3 = sent[i+2][0]
        postag3 = sent[i+2][1]
        
        # Features for the word two positions ahead
        features.extend([
            '+2:word.lower=' + word3.lower(),
            '+2:word.istitle=%s' % word3.istitle(),
            '+2:word.isupper=%s' % word3.isupper(),
            '+2:postag=' + postag3,
            '+2:postag[:2]=' + postag3[:2],
        ])
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [181]:
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in validate]
y_test = [sent2labels(s) for s in validate]

X_ans = [sent2features(s) for s in test]

In [182]:
labelToType = {
    'B-facility': 1,
    'I-facility': 11,
    'B-other': 5,
    'I-other': 15,
    'B-company': 0,
    'B-person': 6,
    'B-tvshow': 9,
    'B-sportsteam': 8,
    'I-person': 16,
    'B-geo-loc': 2,
    'B-movie': 3,
    'I-movie': 13,
    'I-tvshow': 19,
    'B-product': 7,
    'I-company': 10,
    'B-musicartist': 4,
    'I-musicartist': 14,
    'I-geo-loc': 12,
    'I-product': 17,
    'I-sportsteam': 18,
    'O' : 20
}

In [184]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=200,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.1),
    'c2': scipy.stats.expon(scale=0.01),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels, zero_division=1)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=10,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=100,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


KeyboardInterrupt: 

In [142]:
crf_new = rs.best_estimator_
y_pred = crf_new.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9456281672252516

In [143]:
y_pred_final = crf_new.predict(X_ans)

In [144]:
test_noans = ez_pandas_reader("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_twitter_ner_data/test_noans.csv")

In [145]:
y_pred_to_label = []
count = 0
for i in y_pred_final:
    for j in i:
        y_pred_to_label.append(labelToType[j])
        count = count + 1

In [146]:
test_ans = test_noans.copy()
for i in range(0, len(y_pred_to_label)):
    test_ans.loc[i, "label"] = str(y_pred_to_label[i])
test_ans["label"].to_csv("/Users/johnsonchan/Documents/Coding/Class/CMPSC 190I W/HW 4/HW4_twitter_ner_data/test_answers.csv", header=True)