In [2]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [22]:
def get_sentence_dict():
    sentence_dict = {}
    with open('sentence_dict.json', 'r') as fp:
        sentence_dict = json.load(fp)
    return sentence_dict

In [23]:
df = pd.read_csv("ner_first_preprocessing.csv")

In [24]:
df = df[['sentence_idx', 'word', 'lemma','pos', 'tag']]
df.head()

Unnamed: 0,sentence_idx,word,lemma,pos,tag
0,1.0,Thousands,thousand,NNS,O
1,1.0,of,of,IN,O
2,1.0,demonstrators,demonstr,NNS,O
3,1.0,have,have,VBP,O
4,1.0,marched,march,VBN,O


### Keep only the word (or lemma) and the tag from sentence_dict:

In [25]:
def get_input_dict(lemma=False):
    sentence_dict = get_sentence_dict()
    input_dict = {}
    if lemma:
        idx = 2
    else:
        idx = 0
    for key, sentence in sentence_dict.items():
        new_sentence = []
        for word in sentence:
            new_sentence.append((word[idx], word[1], word[3]))
        input_dict[int(float(key))] = new_sentence
    return input_dict

In [26]:
def get_input_df(df, lemma=False):
    if lemma:
        df = df.drop(columns=['word'])
        df['lemma'] = df['lemma'].apply(lambda word: word.lower())
    else:
        df = df.drop(columns=['lemma'])
        df['word'] = df['word'].apply(lambda word: word.lower())
    df = df.drop(columns=['tag'])
    return df

In [27]:
df.head()

Unnamed: 0,sentence_idx,word,lemma,pos,tag
0,1.0,Thousands,thousand,NNS,O
1,1.0,of,of,IN,O
2,1.0,demonstrators,demonstr,NNS,O
3,1.0,have,have,VBP,O
4,1.0,marched,march,VBN,O


In [28]:
df_final = get_input_df(df)

In [29]:
df_final.head()

Unnamed: 0,sentence_idx,word,pos
0,1.0,thousands,NNS
1,1.0,of,IN
2,1.0,demonstrators,NNS
3,1.0,have,VBP
4,1.0,marched,VBN


In [32]:
output_final = df["tag"].values

In [36]:
tags = list(set(df["tag"].values))
tags.remove("O")
print(tags)

['B-gpe', 'I-org', 'I-art', 'I-geo', 'B-tim', 'B-art', 'B-geo', 'B-org', 'I-gpe', 'I-tim', 'I-nat', 'B-per', 'I-per', 'B-eve', 'B-nat', 'I-eve']


In [37]:
input_dict = get_input_dict()

In [38]:
input_dict[1]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [39]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [40]:
input_final = []
output_final = []
for idx, sentence in input_dict.items():
    input_final.append(sent2features(sentence))
    output_final.append(sent2labels(sentence))
    

In [41]:
X_train, X_test, y_train, y_test = train_test_split(input_final, output_final, test_size = 0.33, random_state=0)

In [45]:
def call_crf(X_train, X_test, y_train, y_test, tags,
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True):
    print(f"c1:{c1} | c2:{c2}")
    crf = sklearn_crfsuite.CRF(algorithm=algorithm, c1 = c1, c2=c2, max_iterations=max_iterations,
                               all_possible_transitions=all_possible_transitions)
    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)
    metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=tags)
    with open("test.log", "a") as file:
        file.write("############################################################################\n")
        file.write(f"Algorithm: {algorithm}, c1: {c1}, c2: {c2}, max_iterations: {max_iterations}, all_possible_transitions: {all_possible_transitions}\n")
        file.write(metrics.flat_classification_report(y_test, y_pred, labels = tags))

In [46]:
c1 = np.arange(0.01, 0.5, 0.02)
c2 = np.arange(0.01, 0.5, 0.02)

In [None]:
for c11 in c1:
    for c22 in c2:
        call_crf(X_train, X_test, y_train, y_test, tags,
    algorithm='lbfgs',
    c1=c11,
    c2=c22,
    max_iterations=100,
    all_possible_transitions=True)