In [2]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import sent_tokenize, word_tokenize
from catboost import Pool, CatBoostClassifier, cv
from sklearn import model_selection, metrics
from feature import get_features

# Load the stage-1 test split (tab-separated). The candidate-name columns
# 'A'/'B' are renamed to 'A_Noun'/'B_Noun', which keeps the names 'A'/'B'
# free; get_label in this notebook reuses them for the coreference flags.
test = pd.read_csv('gendered-pronoun-resolution/test_stage_1.tsv', delimiter='\t').rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
# Last expression of the cell: displays (n_rows, n_columns).
test.shape

(2000, 9)

In [3]:
# Read the three GAP files (test, validation, revised development) as TSV.
gh_test = pd.read_csv("gendered-pronoun-resolution/gap-test.tsv", delimiter='\t')
gh_valid = pd.read_csv("gendered-pronoun-resolution/gap-validation.tsv", delimiter='\t')
gh_devop = pd.read_csv("gendered-pronoun-resolution/gap-development-revised.tsv", delimiter='\t')
# Stack all three into one training frame, rename the candidate-name columns
# to 'A_Noun'/'B_Noun', and renumber the index from 0 so it is unique.
train = pd.concat((gh_test, gh_valid, gh_devop)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True)
# Last expression of the cell: displays (n_rows, n_columns).
train.shape

(4454, 11)

In [4]:
# `para` is forwarded unchanged to get_features (imported from the local
# `feature` module) for both frames; its meaning is not visible in this notebook.
# NOTE(review): -0.18 is a magic number - document what it controls.
para = -0.18
# Both names are rebound to whatever get_features returns.
# NOTE(review): re-running this cell feeds already-processed frames back into
# get_features - confirm that is safe, or bind the results to new names.
train = get_features(train, para)
test = get_features(test, para)

In [5]:
def get_label(df):
    """Add coreference indicator columns and a 3-way class label to a frame.

    The input is not modified; a renamed copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'A-coref' and 'B-coref', holding values that
        ``astype(int)`` can convert (e.g. booleans).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'A-coref'/'B-coref' renamed to 'A'/'B' (cast to
        int) plus two new columns:

        * 'NEITHER' - ``1.0 - (A + B)`` as float.
        * 'class'   - 0 if A == 1, else 1 if B == 1, else 2 if NEITHER == 1.
          A takes precedence over B when both are set.

        Rows matching none of the three cases are printed and receive a
        missing (NaN) label, which makes the 'class' column float.
    """
    df = df.rename(columns={'A-coref': 'A', 'B-coref': 'B'})
    df['A'] = df['A'].astype(int)
    df['B'] = df['B'].astype(int)
    df['NEITHER'] = 1.0 - (df['A'] + df['B'])

    # Plain boolean arrays so the selection below is purely positional and
    # independent of the frame's index (works for empty frames as well).
    is_a = (df['A'] == 1).values
    is_b = (df['B'] == 1).values
    is_neither = (df['NEITHER'] == 1).values

    # np.select takes the first matching condition, giving A > B > NEITHER
    # precedence; -1 is only a temporary placeholder for unmatched rows.
    labels = np.select([is_a, is_b, is_neither], [0, 1, 2], default=-1)

    unmatched = ~(is_a | is_b | is_neither)
    if unmatched.any():
        # Best-effort reporting: show the offending rows and leave their
        # label missing instead of inventing a class.
        print(df[unmatched])
        labels = np.where(unmatched, np.nan, labels)

    df['class'] = labels
    return df

# Label the combined training frame (adds 'A', 'B', 'NEITHER' and 'class').
train = get_label(train)
# gh_devop still has the raw candidate-name columns 'A'/'B'; rename them first
# so get_label's 'A-coref' -> 'A' / 'B-coref' -> 'B' rename does not collide.
gh_devop = gh_devop.rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})
# NOTE(review): gh_devop is also one of the frames concatenated into `train`,
# so `submition` overlaps the training rows, and it has not been passed
# through get_features - confirm how it is meant to be used.
submition = get_label(gh_devop)

In [6]:
# Feature columns fed to the classifier.
col = ['pronoun_type', 'pred_A', 'after_A', 'pred_B', 'after_B',
       'head_A', 'nsubj_A', 'dobj_A', 'pobj_A', 'poss_A', 'paral_A', 'ad_A', 'nonad_A',
       'head_B', 'nsubj_B', 'dobj_B', 'pobj_B', 'poss_B', 'paral_B', 'ad_B', 'nonad_B',
       'pos_sent_A', 'pos_sent_B', 'A-dist', 'B-dist']

# Positions (within `col`) of columns with fewer than 5 distinct values;
# these are passed to CatBoost as categorical features.
cate_features_index = np.where(train[col].nunique() < 5)[0]

# Missing-value filling does not depend on the split, so do it once.
train_features = train[col].fillna(0)

# Per-split metrics. The name `dict_lgb` is kept for any cell that reads it,
# although the model evaluated here is CatBoost.
dict_lgb = {"log_loss": [], "accuracy": []}

# Repeated 80/20 hold-out evaluation, one split per seed.
for rstate in [1, 2, 3, 4, 5]:
    x1, x2, y1, y2 = model_selection.train_test_split(
        train_features, train['class'], test_size=0.2, random_state=rstate)

    model = CatBoostClassifier(loss_function='MultiClass',
                               learning_rate=0.01, num_boost_round=3500,
                               max_depth=8, random_seed=42, verbose=0)
    model.fit(x1, y1, cat_features=cate_features_index)
    y_pred = model.predict_proba(x2)

    # Passing the label set explicitly keeps log_loss from failing when a
    # validation split happens to miss a class.
    # NOTE(review): assumes predict_proba columns follow the sorted training
    # labels (true for labels 0/1/2, which the original argmax relied on).
    class_labels = np.unique(y1)
    logloss = metrics.log_loss(y2, y_pred, labels=class_labels)
    acc = metrics.accuracy_score(y2, class_labels[np.argmax(y_pred, axis=1)])
    print('log_loss', logloss)
    print('accuracy', acc)
    dict_lgb["log_loss"].append(logloss)
    dict_lgb["accuracy"].append(acc)

# Average over however many splits were actually run, instead of a
# hard-coded 5 that must be kept in sync with the seed list.
print("Average {}".format(sum(dict_lgb["log_loss"]) / float(len(dict_lgb["log_loss"]))))
print("Average {}".format(sum(dict_lgb["accuracy"]) / float(len(dict_lgb["accuracy"]))))

('log_loss', 0.6009626808451145)
('accuracy', 0.7441077441077442)
('log_loss', 0.6518246162781429)
('accuracy', 0.7227833894500562)
('log_loss', 0.6336828848932201)
('accuracy', 0.7474747474747475)
('log_loss', 0.6354177202041342)
('accuracy', 0.7261503928170595)
('log_loss', 0.6084023756897176)
('accuracy', 0.7328843995510662)
Average 0.626058055582
Average 0.73468013468


In [None]:
# NOTE(review): import sits mid-notebook; consider moving it to the import cell.
from sklearn.model_selection import GridSearchCV

# Search space: only the tree depth is tuned.
gridParams = {
    'max_depth':[7, 8, 9, 10], 
}

# Base estimator for the search. Unlike the evaluation loop in this notebook,
# it sets neither learning_rate nor num_boost_round, so the library defaults
# apply and the depth is tuned under a different configuration.
# NOTE(review): confirm that this difference is intended.
mdl = CatBoostClassifier(loss_function='MultiClass', 
                         cat_features=cate_features_index, 
                         random_seed=42, verbose=0)

# 5-fold cross-validated grid search scored by negated log loss (higher is better).
# NOTE(review): n_jobs=-1 runs fits in parallel while each CatBoost fit may
# itself be multi-threaded - check for CPU oversubscription.
grid = GridSearchCV(mdl, gridParams,
                    verbose=0,
                    scoring='neg_log_loss',
                    cv=5,
                    n_jobs=-1)
# Run the search on the full labelled training frame, filling missing feature values with 0.
grid.fit(train[col].fillna(0), train['class'])

In [None]:
# best_score_ comes from the 'neg_log_loss' scorer configured on `grid`, so the
# printed value is the negated log loss of the best candidate.
print("Best: %f using" % (grid.best_score_))
# Prints every parameter of the best estimator, not only the tuned max_depth.
print(grid.best_estimator_.get_params())