In [1]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn import model_selection, multiclass, metrics, ensemble
from feature import get_features

# Stage-1 test split. The candidate-mention columns 'A' and 'B' are renamed to
# 'A_Noun' and 'B_Noun'.
test = pd.read_csv('gendered-pronoun-resolution/test_stage_1.tsv', delimiter='\t')
test = test.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
test.shape

(2000, 9)

In [2]:
# Read the three GAP splits; each one stays available under its own name.
gh_test, gh_valid, gh_devop = (
    pd.read_csv(path, delimiter='\t')
    for path in (
        "gendered-pronoun-resolution/gap-test.tsv",
        "gendered-pronoun-resolution/gap-validation.tsv",
        "gendered-pronoun-resolution/gap-development-revised.tsv",
    )
)

# Stack all three splits (gh_devop included) into one training frame, rename
# the candidate-mention columns and rebuild a 0..n-1 index.
train = pd.concat((gh_test, gh_valid, gh_devop))
train = train.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
train = train.reset_index(drop=True)
train.shape

(4454, 11)

In [3]:
# Second argument forwarded to get_features; the same value is used for both
# frames. NOTE(review): its meaning is defined in the `feature` module, which
# is not visible here.
para = -0.18

# Replace each frame with the output of get_features (train first, then test).
train, test = (get_features(frame, para) for frame in (train, test))

In [4]:
def get_label(df):
    """Add one-hot coreference targets and an integer class column.

    Works on a renamed copy of ``df``: 'A-coref' and 'B-coref' become 'A'
    and 'B' and are cast to int, 'NEITHER' is set to ``1.0 - (A + B)``, and
    'class' is 0 when A is 1, otherwise 1 when B is 1, otherwise 2 when
    NEITHER is 1.

    A row that matches none of these is printed and gets no class value
    (the per-row helper falls through without returning).

    Args:
        df: frame holding the 'A-coref' / 'B-coref' columns (or 'A' / 'B'
            columns that can be cast to int).

    Returns:
        The renamed frame with columns 'A', 'B', 'NEITHER' and 'class'.
    """
    labelled = df.rename(columns={'A-coref': 'A', 'B-coref': 'B'})
    for mention in ('A', 'B'):
        labelled[mention] = labelled[mention].astype(int)
    labelled['NEITHER'] = 1.0 - (labelled['A'] + labelled['B'])

    def label_class(row):
        # The first target column equal to 1 decides the class, checked in
        # the order A, B, NEITHER.
        for code, column in enumerate(('A', 'B', 'NEITHER')):
            if row[column] == 1:
                return code
        print(row)

    labelled['class'] = labelled.apply(label_class, axis=1)
    return labelled

# Give the development split the same mention-column names as `train`, so
# that get_label's rename to 'A' / 'B' does not collide with the original
# 'A' / 'B' columns.
gh_devop = gh_devop.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})

# Label the training frame first, then the development frame.
train, submition = get_label(train), get_label(gh_devop)

In [None]:
from sklearn.model_selection import GridSearchCV

# Feature columns fed to the classifiers.
col = ['pronoun_type', 'pred_A', 'after_A', 'pred_B', 'after_B',
       'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
       'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
       'pos_sent_A', 'pos_sent_B', 'A-dist', 'B-dist']

# One-vs-rest random forest over the A / B / NEITHER indicator targets.
# random_state matches the value used by the hand-tuned models so the search
# is repeatable.
model = multiclass.OneVsRestClassifier(
    ensemble.RandomForestClassifier(n_jobs=-1, random_state=22))

# Written as a literal so the cell still runs if the name `dict` has been
# rebound elsewhere in the kernel session.
# oob_score must be real booleans: the strings 'True' and 'False' are both
# truthy, so the grid would never actually switch OOB scoring off.
param_grid = {
    'estimator__criterion': ['entropy', 'gini'],
    'estimator__max_depth': [10, 11, 12],
    'estimator__n_estimators': [1000, 1500],
    'estimator__min_samples_split': [6, 7, 8],
    'estimator__oob_score': [True, False],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_log_loss', cv=3, n_jobs=-1)
grid_result = grid.fit(train[col].fillna(-1), train[['A', 'B', 'NEITHER']])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_estimator_))

In [6]:
# Feature columns fed to the classifiers.
col = ['pronoun_type', 'pred_A', 'after_A', 'pred_B', 'after_B',
       'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
       'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
       'pos_sent_A', 'pos_sent_B', 'A-dist', 'B-dist']

et1 = ensemble.ExtraTreesClassifier(min_samples_split=7, criterion= 'entropy', max_depth=10, n_estimators=1500, random_state=22)
et2 = ensemble.ExtraTreesClassifier(min_samples_split=7, max_depth=10, n_estimators=1500, random_state=22)

rmf1 = ensemble.RandomForestClassifier(n_estimators=1000, max_depth=12, min_samples_split=8,
                                       criterion='entropy', random_state=22)
rmf2 = ensemble.RandomForestClassifier(n_estimators=1500, max_depth=12, min_samples_split=8,
                                       criterion='entropy', random_state=22)
rmf3 = ensemble.RandomForestClassifier(n_estimators=1000, max_depth=11, min_samples_split=7,
                                       oob_score=True, criterion='entropy', random_state=22)

# Per-model metrics from the repeated hold-out splits, keyed by position in
# the model list. Deliberately not called `dict`: rebinding the builtin breaks
# any later `dict(...)` call in the same kernel session.
scores = {}

# Missing feature values are imputed with -1 for training AND for the test
# frame, so the model is never asked to predict on raw NaNs it was not fit on.
train_X = train[col].fillna(-1)
train_y = train[['A', 'B', 'NEITHER']]
test_X = test[col].fillna(-1)
test_y = submition[['A', 'B', 'NEITHER']]

for ind, modeli in enumerate([rmf1, rmf2, rmf3, et1, et2]):
    scores[ind] = {"log_loss": [], "accuracy": []}
    print("n_estimators {}, max_depth {}, min_samples_split {}, criterion {}.".format(
        modeli.n_estimators, modeli.max_depth, modeli.min_samples_split, modeli.criterion))
    model = multiclass.OneVsRestClassifier(modeli)

    # Repeated 80/20 hold-out: one split per seed.
    for rstate in [1, 2, 3, 4, 5]:
        print("random state {}".format(rstate))
        x1, x2, y1, y2 = model_selection.train_test_split(train_X, train_y, test_size=0.2, random_state=rstate)
        model.fit(x1, y1)
        logloss = metrics.log_loss(y2, model.predict_proba(x2))
        acc = model.score(x2, y2)
        print('log_loss', logloss)
        print('accuracy', acc)
        scores[ind]["log_loss"].append(logloss)
        scores[ind]["accuracy"].append(acc)

    # Refit on every training row and score against the development labels.
    # NOTE(review): `train` was concatenated from gh_test, gh_valid and
    # gh_devop, and `submition` is derived from gh_devop. If the rows of `test`
    # correspond to gh_devop (TODO confirm), these rows were seen during
    # fitting and the two numbers below are optimistic, not held-out scores.
    model.fit(train_X, train_y)
    results = model.predict_proba(test_X)
    print('log_loss', metrics.log_loss(test_y, results))
    print('accuracy', model.score(test_X, test_y))

    # Average over however many splits were run instead of a hard-coded 5.
    print("Average {}".format(sum(scores[ind]["log_loss"]) / float(len(scores[ind]["log_loss"]))))
    print("Average {}\n".format(sum(scores[ind]["accuracy"]) / float(len(scores[ind]["accuracy"]))))

n_estimators 1000, max_depth 12, min_samples_split 8, criterion entropy.
random state 1
('log_loss', 0.6218139593636542)
('accuracy', 0.6666666666666666)
random state 2
('log_loss', 0.6556505964226502)
('accuracy', 0.6610549943883277)
random state 3
('log_loss', 0.6617122700016389)
('accuracy', 0.6498316498316499)
random state 4
('log_loss', 0.659407664907144)
('accuracy', 0.6610549943883277)
random state 5
('log_loss', 0.6199876857331559)
('accuracy', 0.6734006734006734)
('log_loss', 0.411771069308881)
('accuracy', 0.8175)
Average 0.643714435286
Average 0.662401795735

n_estimators 1500, max_depth 12, min_samples_split 8, criterion entropy.
random state 1
('log_loss', 0.6216759434721244)
('accuracy', 0.6689113355780022)
random state 2
('log_loss', 0.6545199661111505)
('accuracy', 0.6621773288439955)
random state 3
('log_loss', 0.6616853175470045)
('accuracy', 0.6520763187429854)
random state 4
('log_loss', 0.6591533774784148)
('accuracy', 0.6576879910213244)
random state 5
('log_loss'