In [1]:
import re
import numpy as np
import pandas as pd
from feature import get_features
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection, multiclass, metrics

In [None]:
# Read the three tab-separated GAP files from the competition data folder.
gh_test = pd.read_csv("gendered-pronoun-resolution/gap-test.tsv", sep='\t')
gh_valid = pd.read_csv("gendered-pronoun-resolution/gap-validation.tsv", sep='\t')
gh_devop = pd.read_csv("gendered-pronoun-resolution/gap-development-revised.tsv", sep='\t')

# Stack all three files into one frame that is used as `train`.
train = pd.concat([gh_test, gh_valid, gh_devop])
# The mention columns 'A'/'B' become 'A_Noun'/'B_Noun'; `get_label` later
# reuses the names 'A' and 'B' for the coreference flags.
train = train.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
# Discard the per-file row labels so the combined index is unique.
train = train.reset_index(drop=True)
train.shape

(4454, 11)

In [3]:
# Scalar setting handed to the project-local `get_features` helper as its
# second argument.
# NOTE(review): what -0.18 controls is not visible in this notebook — confirm
# against the `feature` module and consider a more descriptive name.
para = -0.18
# Rebinds `train` to whatever `get_features` returns; presumably the same rows
# with engineered feature columns added (TODO confirm). Because the input name
# is overwritten, re-running this cell feeds the already-processed frame back in.
train = get_features(train, para)

In [4]:
def get_label(df):
    """Attach integer coreference labels to a GAP-style frame.

    The columns 'A-coref' and 'B-coref' are renamed to 'A' and 'B' and cast to
    int, a 'NEITHER' column is computed as ``1.0 - (A + B)``, and a 'class'
    column is added:

        0 -- 'A' equals 1 (takes precedence when both 'A' and 'B' are 1)
        1 -- 'B' equals 1
        2 -- 'NEITHER' equals 1

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'A-coref' and 'B-coref' (or already-renamed 'A' and 'B')
        columns whose values can be cast to int.

    Returns
    -------
    pandas.DataFrame
        The renamed frame with the 'A', 'B', 'NEITHER' and 'class' columns
        set. An empty input yields an empty output that still has 'class'.

    Raises
    ------
    ValueError
        If any row matches none of the three classes (for example a flag
        value other than 0/1). Such rows would otherwise end up with a
        missing label.
    """
    df = df.rename(columns={'A-coref': 'A', 'B-coref': 'B'})
    df['A'] = df['A'].astype(int)
    df['B'] = df['B'].astype(int)
    df['NEITHER'] = 1.0 - (df['A'] + df['B'])

    # Column-wise masks instead of a Python-level row loop; this also keeps
    # the function well-defined for a frame with zero rows.
    is_a = (df['A'] == 1).values
    is_b = (df['B'] == 1).values
    is_neither = (df['NEITHER'] == 1).values

    matched = is_a | is_b | is_neither
    if not matched.all():
        bad_rows = list(df.index[~matched][:5])
        raise ValueError(
            "rows match none of A / B / NEITHER; first offending index "
            "labels: {}".format(bad_rows))

    # np.select takes the first condition that holds, so 'A' wins over 'B'
    # when both flags are set.
    df['class'] = np.select([is_a, is_b, is_neither], [0, 1, 2])
    return df

# Rebind `train` to the frame returned by `get_label`, which carries the
# 'A', 'B', 'NEITHER' and 'class' columns used by the modelling cells.
train = get_label(train)

In [28]:
from sklearn.model_selection import GridSearchCV

# Feature columns fed to the classifier. Assigned here, before first use, so
# the search runs top-to-bottom on a fresh kernel instead of depending on
# `col` having been created by a cell that appears later in the notebook.
col = ['pronoun_type', 'pred_A', 'after_A', 'pred_B', 'after_B',
       'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
       'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
       'pos_sent_A', 'pos_sent_B', 'A-dist', 'B-dist']

# Base estimator; the settings below stay fixed while the grid varies the rest.
model = GradientBoostingClassifier(max_features='sqrt', learning_rate=0.01,
                                   n_estimators=800, random_state=37)

# 2 * 2 * 2 * 3 = 24 candidate settings, each fitted on 5 folds.
param_grid = dict(subsample=[0.77, 0.8], max_depth=[5, 6],
                  min_samples_leaf=[4, 5],
                  min_samples_split=[8, 9, 10])

# Scored with negated log loss, so the best score is the one closest to zero.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_log_loss', cv=5, n_jobs=-1)
# Missing feature values are replaced by 0 before fitting.
grid_result = grid.fit(train[col].fillna(0), train['class'])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_estimator_))

Best: -0.652473 using GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_split=8,
              min_weight_fraction_leaf=0.0, n_estimators=800,
              presort='auto', random_state=37, subsample=0.77, verbose=0,
              warm_start=False)


In [41]:
from sklearn.model_selection import GridSearchCV

# Narrower search: tree depth is pinned at 5 on the estimator itself and only
# subsample / leaf / split sizes are varied (3 * 2 * 3 = 18 candidates).
model = GradientBoostingClassifier(
    n_estimators=800,
    max_depth=5,
    learning_rate=0.01,
    max_features='sqrt',
    random_state=51,
)

param_grid = {
    'subsample': [0.77, 0.78, 0.8],
    'min_samples_leaf': [4, 5],
    'min_samples_split': [8, 9, 10],
}

# Five-fold search scored with negated log loss (closer to zero is better).
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
)
# NOTE(review): `col` has to exist already when this cell executes; it is not
# assigned in this cell.
grid_result = grid.fit(train[col].fillna(0), train['class'])
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_estimator_))

Best: -0.653122 using GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=8,
              min_weight_fraction_leaf=0.0, n_estimators=800,
              presort='auto', random_state=51, subsample=0.77, verbose=0,
              warm_start=False)


In [20]:
# Feature columns used for every fit in this cell.
col = ['pronoun_type', 'pred_A', 'after_A', 'pred_B', 'after_B',
       'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
       'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
       'pos_sent_A', 'pos_sent_B', 'A-dist', 'B-dist']

# Three candidate configurations; they differ only in subsample, max_depth,
# min_samples_leaf and min_samples_split.
clf1 = GradientBoostingClassifier(subsample=0.77, max_features='sqrt', max_depth=5,
                                  min_samples_leaf=5, min_samples_split=8,
                                  learning_rate=0.01, n_estimators=800, random_state=20)

clf2 = GradientBoostingClassifier(subsample=0.8, max_features='sqrt', max_depth=6,
                                  min_samples_leaf=4, min_samples_split=10,
                                  learning_rate=0.01, n_estimators=800, random_state=20)

clf3 = GradientBoostingClassifier(subsample=0.8, max_features='sqrt', max_depth=6,
                                  min_samples_leaf=4, min_samples_split=9,
                                  learning_rate=0.01, n_estimators=800, random_state=20)

# Seeds for the repeated 80/20 hold-out splits. The averages printed below are
# taken over however many seeds are listed here.
split_seeds = [1, 2, 3, 4, 5]

# Imputation depends on neither the classifier nor the seed, so do it once.
X_all = train[col].fillna(0)
y_all = train['class']

# Per-classifier score history, keyed by the estimator object itself:
# dictc[clf]["log_loss"] and dictc[clf]["accuracy"] hold one value per seed.
dictc = {}
for clf in [clf1, clf2, clf3]:
    dictc[clf] = {}
    dictc[clf]["log_loss"] = []
    dictc[clf]["accuracy"] = []
    for rstate in split_seeds:
        x1, x2, y1, y2 = model_selection.train_test_split(X_all, y_all, test_size=0.2, random_state=rstate)
        clf.fit(x1, y1)
        y_pred = clf.predict_proba(x2)

        # Pass the fitted class order explicitly so the probability columns
        # are matched to labels even if a test split happens to miss a class.
        logloss = metrics.log_loss(y2, y_pred, labels=clf.classes_)
        acc = clf.score(x2, y2)
        # NOTE(review): the recorded output shows these two lines printed as
        # tuples, i.e. a Python 2 kernel; Python 3 prints them space-separated.
        print('log_loss', logloss)
        print('accuracy', acc)
        dictc[clf]["log_loss"].append(logloss)
        dictc[clf]["accuracy"].append(acc)

    # First average is log loss, second is accuracy. The divisor follows the
    # number of recorded runs rather than a hard-coded count.
    print("Average {}".format(sum(dictc[clf]["log_loss"]) / float(len(dictc[clf]["log_loss"]))))
    print("Average {}".format(sum(dictc[clf]["accuracy"]) / float(len(dictc[clf]["accuracy"]))))
    print('---\n')


('log_loss', 0.6034104869463657)
('accuracy', 0.7418630751964085)
('log_loss', 0.6395317328603699)
('accuracy', 0.7295173961840629)
('log_loss', 0.6418225837766699)
('accuracy', 0.7295173961840629)
('log_loss', 0.6497989331254129)
('accuracy', 0.7227833894500562)
('log_loss', 0.6114020682185027)
('accuracy', 0.7306397306397306)
Average 0.629193160985
Average 0.730864197531
---

('log_loss', 0.5988410109888181)
('accuracy', 0.7418630751964085)
('log_loss', 0.6412555030547529)
('accuracy', 0.7239057239057239)
('log_loss', 0.6417969534145618)
('accuracy', 0.7261503928170595)
('log_loss', 0.6431063332482446)
('accuracy', 0.7317620650953984)
('log_loss', 0.6084233848159613)
('accuracy', 0.734006734006734)
Average 0.626684637104
Average 0.731537598204
---

('log_loss', 0.5985554346009012)
('accuracy', 0.7396184062850729)
('log_loss', 0.6418090975510642)
('accuracy', 0.7250280583613917)
('log_loss', 0.6422166232117205)
('accuracy', 0.7317620650953984)
('log_loss', 0.6443636210515464)
('accura