In [1]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en')
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection, metrics
from feature import get_features

# Stage-1 test set. The mention columns 'A' and 'B' are renamed to
# 'A_Noun' / 'B_Noun' right after loading.
test = pd.read_csv('gendered-pronoun-resolution/test_stage_1.tsv', sep='\t')
test = test.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
test.shape

(2000, 9)

In [2]:
# The three GAP splits are read separately and then stacked, in the order
# test -> validation -> development, into one frame with a fresh 0..n-1 index.
gh_test = pd.read_csv("gendered-pronoun-resolution/gap-test.tsv", sep='\t')
gh_valid = pd.read_csv("gendered-pronoun-resolution/gap-validation.tsv", sep='\t')
gh_devop = pd.read_csv("gendered-pronoun-resolution/gap-development-revised.tsv", sep='\t')

data = pd.concat(
    [gh_test, gh_valid, gh_devop],
    ignore_index=True,
).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
data.shape

(4454, 11)

In [3]:
# `para` is forwarded unchanged as the second argument of get_features (imported
# from the local `feature` module). What it controls is not visible in this
# notebook -- TODO(review): document the meaning of -0.18 where it is consumed.
para = -0.18
# Rebinds `data` to whatever get_features returns; the feature columns listed in
# `col` further down are read from this frame.
data = get_features(data, para)

In [4]:
def get_label(df):
    """Add integer coreference targets and a single multiclass label.

    A renamed copy of ``df`` is built, so the caller's frame is left untouched.

    Columns written on the returned frame:
      * ``'A'`` / ``'B'``: ``'A-coref'`` / ``'B-coref'`` renamed and cast to int.
      * ``'NEITHER'``: ``1.0 - (A + B)`` (float).
      * ``'class'``: int64 label, 0 when ``A == 1``, else 1 when ``B == 1``,
        else 2 when ``NEITHER == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'A-coref'`` and ``'B-coref'`` columns castable to int.

    Returns
    -------
    pandas.DataFrame
        The renamed copy with the columns described above.

    Raises
    ------
    ValueError
        If some row matches none of the three classes. Such rows used to be
        printed and silently labelled ``None``.
    """
    df = df.rename(columns={'A-coref': 'A', 'B-coref': 'B'})
    df['A'] = df['A'].astype(int)
    df['B'] = df['B'].astype(int)
    df['NEITHER'] = 1.0 - (df['A'] + df['B'])

    # Vectorised replacement for the row-wise apply. np.select takes the first
    # matching condition, so A keeps precedence over B when both are set.
    labels = np.select(
        [
            (df['A'] == 1).to_numpy(),
            (df['B'] == 1).to_numpy(),
            (df['NEITHER'] == 1).to_numpy(),
        ],
        [0, 1, 2],
        default=-1,
    )

    # -1 marks rows that fit no class; fail loudly instead of emitting NaN labels.
    unlabelled = df.index[labels == -1]
    if len(unlabelled) > 0:
        raise ValueError(
            'get_label: {} row(s) match none of A / B / NEITHER; first offending '
            'index values: {}'.format(len(unlabelled), list(unlabelled[:5]))
        )

    df['class'] = labels.astype(np.int64)
    return df

# Rebind `data` to the labelled frame returned by get_label; the 'class' column
# it adds is the target passed to train_test_split below.
data = get_label(data)

In [5]:
def get_f1(model, x_true, y_true):
    """Weighted F1 of ``model`` on ``(x_true, y_true)``.

    The predicted label of each sample is the column index of its highest
    ``predict_proba`` value, so the score is only meaningful when the class
    labels in ``y_true`` coincide with those column positions.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    x_true : features to score.
    y_true : reference labels for ``x_true``.

    Returns
    -------
    The value returned by ``metrics.f1_score(..., average="weighted")``.
    """
    class_probabilities = model.predict_proba(x_true)
    predicted_labels = np.argmax(class_probabilities, axis=1)
    return metrics.f1_score(y_true, predicted_labels, average="weighted")

# Feature columns fed to both classifiers. Their order matters: the positions
# stored in cate_features_index refer to this list.
col = [
    'pronoun_type',
    'pred_A', 'after_A',
    'pred_B', 'after_B',
    'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
    'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
    'pos_sent_A', 'pos_sent_B',
    'A-dist', 'B-dist',
]

# Positions (within col) of the columns having fewer than 5 distinct values;
# these are handed to CatBoost as categorical features.
cate_features_index = np.flatnonzero((data[col].nunique() < 5).to_numpy())

# 80/20 hold-out split; missing feature values are replaced by 0 beforehand.
x1, x2, y1, y2 = model_selection.train_test_split(
    data[col].fillna(0),
    data['class'],
    test_size=0.2,
    random_state=3,
)

model_catboost = CatBoostClassifier(
    loss_function='MultiClass',
    learning_rate=0.01,
    num_boost_round=3500,
    max_depth=8,
    random_seed=42,
    verbose=False,
)

model_gbc = GradientBoostingClassifier(
    subsample=0.8,
    max_features='sqrt',
    max_depth=6,
    min_samples_leaf=4,
    min_samples_split=9,
    learning_rate=0.01,
    n_estimators=800,
    random_state=20,
)

# Fit each model on the training part and report weighted F1 on the hold-out part.
model_catboost.fit(x1, y1, cat_features=cate_features_index)
print('CatBoost Classifier F1 score: {}'.format(get_f1(model_catboost, x2, y2)))

model_gbc.fit(x1, y1)
print('Gradient Boosting Classifier F1 score: {}'.format(get_f1(model_gbc, x2, y2)))

CatBoost Classifier F1 score: 0.7473292611802561
Gradient Boosting Classifier F1 score: 0.7405657201314051
