In [1]:
%load_ext autoreload
    
%autoreload 2

In [2]:
%cd '../../'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
#%load_ext autoreload
#%autoreload 2

scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'

/Users/cnst/Projects/coref/code_sources/rucoref
/Users/cnst/Projects/coref/code_sources





# Utility functions for splitting dataset onto train-test pair

In [3]:
import csv
import numpy as np
import pandas as pd

from numpy import linalg as la, random as rnd

DEBUG_SEED = 42
DEFAULT_CSV_PARAMS = {
    "sep": '\t',
    "quoting": csv.QUOTE_NONE
}
FNAMES = ["Corpus-2015/Tokens.txt", "Corpus-2015/Groups.txt"]
TRAIN_RATE = 0.7
SPLIT_COLUMN = "doc_id"

In [28]:
def read_pandas_csv(fname, **csv_params):
    return pd.read_csv(fname, **csv_params)


def write_pandas_csv(df, fname, **csv_params):
    df.to_csv(fname, index=False, **csv_params)


def train_test_split(data, train_rate=TRAIN_RATE, split_column=SPLIT_COLUMN, seed=None):
    multiple_df = True
    if train_rate <= 0.0 or train_rate >= 1.0:
        raise ValueError("train_rate must lie in (0, 1), but got {}".format(train_rate))
    if isinstance(data, pd.DataFrame):
        multiple_df = False
        data = (data,)
    for df in data:
        if split_column not in df.columns:
            raise ValueError("split_column {} must be valid column in dataframe, but got {} only"\
                             .format(split_column, list(df.columns)))
    unique_df_indices = set(data[0][split_column].unique())
    for df in data[1:]:
        if unique_df_indices != set(df[split_column].unique()):
            raise ValueError("all dataframes must have equal unique indices.")
    
    unique_df_indices = data[0][split_column].unique()
    if seed is not None:
        prng = rnd.RandomState(seed)
    else:
        prng = rnd.RandomState()
    
    train_mask = np.array(prng.binomial(1.0, train_rate, size=len(unique_df_indices)), dtype=np.bool)
    train_indices = unique_df_indices[train_mask]
    test_indices = unique_df_indices[~train_mask]
    train_data, test_data = (), ()
    for df in data:
        train_data += (df.loc[df[split_column].isin(train_indices)], )
        test_data += (df.loc[df[split_column].isin(test_indices)], )
    return (train_data, test_data) if multiple_df else (train_data[0], test_data[0])

def save_train_test(train_data, test_data, fnames, **csv_params):
    multiple_df = True
    if isinstance(train_data, pd.DataFrame):
        multiple_df = False
        train_data, test_data, fnames = (train_data,), (test_data,), (fnames,)
    names, exts = tuple(".".join(fname.split(".")[:-1]) for fname in fnames),\
                    tuple(fname.split(".")[-1] for fname in fnames)
    for train_df, test_df, name, ext in zip(train_data, test_data, names, exts):
        write_pandas_csv(train_df, fname=".".join([name, "train", ext]), **csv_params)
        write_pandas_csv(test_df, fname=".".join([name, "test", ext]), **csv_params)

def train_test_file_partition(fnames, train_rate=TRAIN_RATE, split_column=SPLIT_COLUMN, seed=DEBUG_SEED, **csv_params):
    if not csv_params:
        global DEFAULT_CSV_PARAMS
        csv_params = DEFAULT_CSV_PARAMS
    data = tuple(read_pandas_csv(fname, **DEFAULT_CSV_PARAMS) for fname in fnames)
    train_data, test_data = train_test_split(data, train_rate=train_rate, split_column=split_column, seed=seed)
    save_train_test(train_data, test_data, fnames, **csv_params)

In [27]:
train_test_file_partition(FNAMES, train_rate=TRAIN_RATE, split_column=SPLIT_COLUMN, seed=DEBUG_SEED)

181


# Evaluation of github code
 load train and test corpuses
-------------------------------------

In [6]:
rucoref_train = rueval.RuCorefCorpus(multeast, rueval)
rucoref_test = rueval.RuCorefCorpus(multeast, rueval)

In [7]:
exp_utils.load_corpus(rucoref_train, 'Corpus-2015/Tokens.train.txt', 'Corpus-2015/Groups.train.txt')

D|	filename: Corpus-2015/Tokens.train.txt
D|	fields: ['doc_id', 'shift', 'length', 'token', 'lemma', 'gram']
D|	next word: {'gram': 'Sp-a', 'doc_id': '1', 'lemma': 'во', 'token': 'Во', 'shift': '0', 'length': '2'}
D|	next word: {'gram': 'Vmis-sma-e', 'doc_id': '16', 'lemma': 'мочь', 'token': 'мог', 'shift': '2114', 'length': '3'}
D|	next word: {'gram': '-', 'doc_id': '28', 'lemma': '-', 'token': '-', 'shift': '5300', 'length': '1'}
D|	next word: {'gram': 'Vmps-sfpsp', 'doc_id': '81', 'lemma': 'замешать', 'token': 'замешана', 'shift': '2891', 'length': '8'}
D|	next word: {'gram': 'P--nsnn', 'doc_id': '98', 'lemma': 'это', 'token': 'Это', 'shift': '5199', 'length': '3'}
D|	next word: {'gram': '-', 'doc_id': '134', 'lemma': ')', 'token': ')', 'shift': '10371', 'length': '1'}
D|	next word: {'gram': 'Afpmpaf', 'doc_id': '251', 'lemma': 'широкий', 'token': 'широкие', 'shift': '7341', 'length': '7'}
D|	next word: {'gram': 'Mc---d', 'doc_id': '268', 'lemma': '@card@', 'token': '15', 'shift': '

In [8]:
exp_utils.load_corpus(rucoref_test, 'Corpus-2015/Tokens.test.txt', 'Corpus-2015/Groups.test.txt')

D|	filename: Corpus-2015/Tokens.test.txt
D|	fields: ['doc_id', 'shift', 'length', 'token', 'lemma', 'gram']
D|	next word: {'gram': '-', 'doc_id': '2', 'lemma': '-', 'token': '-', 'shift': '0', 'length': '1'}
D|	next word: {'gram': 'Sp-a', 'doc_id': '69', 'lemma': 'на', 'token': 'на', 'shift': '1167', 'length': '2'}
D|	next word: {'gram': '-', 'doc_id': '117', 'lemma': '"', 'token': '"', 'shift': '916', 'length': '1'}
D|	next word: {'gram': 'Sp-d', 'doc_id': '277', 'lemma': 'по', 'token': 'по', 'shift': '1348', 'length': '2'}
D|	next word: {'gram': 'Ncfsnn', 'doc_id': '323', 'lemma': 'речь', 'token': 'речь', 'shift': '10478', 'length': '4'}
D|	load_syntax: False
D|	self.parses: None
Number of texts: 51
Number of GS texts: 51
Number of chains in a corpus: 1053
Number of words in all chains: 4511


Defining the classifiers
--------------------

In [9]:
class BaselineAllSingletonsClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        return False

In [10]:
class BaselineAllInOneClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        return True

In [11]:
class BaselineStrMatchClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        is_pronoun = rucoref_test.tagset.pos_filters['pronoun'](pair[0])
        is_personal_pronoun = rucoref_test.tagset.extract_feature('person', pair[0]) in ('1', '2')
        
        return (not is_pronoun or is_personal_pronoun) and ' '.join(pair[0].lemma) == ' '.join(pair[1].lemma)

In [12]:
class BaselineHeadMatchClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        is_pronoun = rucoref_test.tagset.pos_filters['pronoun'](pair[0])
        is_personal_pronoun = rucoref_test.tagset.extract_feature('person', pair[0]) in ('1', '2')
        
        return (not is_pronoun or is_personal_pronoun) and pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head]

In [13]:
class BaselineHeadMatchProClassifier(mentionpair.MentionPairClassifier):
    def __init__(self, scorer_path):
        super(BaselineHeadMatchProClassifier, self).__init__(scorer_path)
        self.groups_match = lambda pair: pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head]
    def pair_coreferent(self, pair, groups, words):
        tagset = rucoref_test.tagset
        
        is_pronoun = lambda w: tagset.pos_filters['pronoun'](w)
        is_deictic_pronoun = lambda w: tagset.extract_feature('person', w) in ('1', '2')
        
        number_agrees = lambda p: same_grammemmes('number', p, tagset)
        gender_agrees = lambda p: same_grammemmes('gender', p, tagset)
        
        if is_pronoun(pair[1]):
            heads = [np.words[np.head] if np.type != 'word' else np for np in pair]
            heads_indices = [words.index(head) for head in heads]

            nouns_agr_between = [word for word in words[heads_indices[0]+1:heads_indices[1]]
                                     if tagset.pos_filters['noun'](word)
                                     and number_agrees((word, pair[1]))
                                     and gender_agrees((word, pair[1]))
                                ]
        
        return (
                (is_deictic_pronoun(pair[0]) and self.groups_match(pair))
               or
                (not is_pronoun(pair[0]) and pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head])
               or
               (
                not is_pronoun(pair[0]) and is_pronoun(pair[1])
                and number_agrees(pair)
                and gender_agrees(pair)
                and len(nouns_agr_between) == 0
               )
        )

In [14]:
class BaselineStrMatchProClassifier(BaselineHeadMatchProClassifier):
    def __init__(self, scorer_path):
        super(BaselineStrMatchProClassifier, self).__init__(scorer_path)
        self.groups_match = lambda pair: ' '.join(pair[0].lemma) == ' '.join(pair[1].lemma)

In [15]:
good_pronouns = {'я', 'мы', 
                 'ты', 'вы', 
                 'он', 'она', 'оно', 'они', 
                 'мой', 'наш', 
                 'твой', 'ваш', 
                 'его', 'ее', 'их',
                 'себя', 'свой',
                 'который'
                }
group_ok = lambda g: g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in good_pronouns)

In [16]:
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
gs_groups = gs_mentions

pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups

pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
pred_groups_gold_bound = rucoref_test.groups

In [17]:
print(len(gs_mentions[1]))
print(len(pred_mentions[1]))
print(len(pred_mentions_gold_bound[1]))

106
237
237


In [None]:
gg = pred_mentions_gold_bound[0][0]

In [None]:
ww = gg.words[0]

In [None]:
ww.wordform[0]

In [18]:
pred_mentions_gold_bound[0][:150]

[Елена Сергеевна(Npfsny, 16),
 Старая учительница(Ncfsny, 38),
 глаза(Ncmpan, 78),
 нею(P-3fsin, 91),
 невысокий молодой человек(Afpmsnf, 101),
 Он(P-3msnn, 128),
 нее(P-3fsan, 142),
 она(P-3fsnn, 167),
 смешное мальчишеское выражение глаз(Ncnsnn, 183),
 его(P-----a, 233),
 Дементьев(Npmsny, 243),
 она(P-3fsnn, 264),
 я(P-1-snn, 300),
 человек(Ncmsny, 312),
 Она(P-3fsnn, 339),
 он(P-3msnn, 354),
 нею(P-3fsin, 372),
 Дементьев(Npmsny, 403),
 он(P-3msnn, 444),
 театре(Ncmsln, 452),
 Я(P-1-snn, 460),
 актер(Ncmsny, 462),
 Актер(Ncmsny, 469),
 бытовые роли(Ncfpan, 478),
 Я(P-1-snn, 579),
 она(P-3fsnn, 610),
 четвертый класс(Ncmsan, 633),
 удивительные ребята(Ncmpnn, 662),
 Она(P-3fsnn, 738),
 упавшим голосом(Ncmsin, 768),
 комнату новую(Ncfsan, 794),
 двухкомнатной квартире(Ncfsln, 818),
 рай(Ncmsnn, 851),
 ее(P-----a, 870),
 голосе(Ncmsln, 873),
 Дементьева(Npmsay, 892),
 Елена Сергеевна(Npfsny, 940),
 он(P-3msnn, 966),
 комната(Ncfsnn, 1010),
 лифта(Ncmsgn, 1046),
 я(P-1-snn, 1071),
 Дир

In [19]:
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
gs_groups_train = gs_mentions_train

pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups

pred_mentions_gold_bound_train, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
pred_groups_gold_bound_train = rucoref_train.groups

Testing the baseline classifiers:

In [20]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)

\textsc{BaselineAllInOneClassifier} &  $99.95$  & $77.45$ & $99.94$ & $87.26$  & $11.66$ & $99.92$ & $20.88$  & $24.31$ \\
\textsc{BaselineAllSingletonsClassifier} &  $100.00$  & $0.00$ & $0.00$ & $0.00$  & $100.00$ & $23.39$ & $37.91$  & $23.39$ \\
\textsc{BaselineStrMatchClassifier} &  $100.00$  & $92.10$ & $36.21$ & $51.98$  & $96.16$ & $37.71$ & $54.18$  & $44.96$ \\
\textsc{BaselineStrMatchProClassifier} &  $99.95$  & $85.68$ & $52.72$ & $65.27$  & $89.87$ & $45.45$ & $60.37$  & $51.42$ \\
\textsc{BaselineHeadMatchClassifier} &  $99.95$  & $87.00$ & $46.07$ & $60.24$  & $91.85$ & $43.28$ & $58.84$  & $50.52$ \\
\textsc{BaselineHeadMatchProClassifier} &  $99.95$  & $85.68$ & $52.75$ & $65.30$  & $89.87$ & $45.46$ & $60.38$  & $51.44$ \\


In [21]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)

\textsc{BaselineAllInOneClassifier} &  $48.81$  & $25.09$ & $85.39$ & $38.79$  & $1.48$ & $82.03$ & $2.91$  & $11.75$ \\
\textsc{BaselineAllSingletonsClassifier} &  $48.83$  & $0.00$ & $0.00$ & $0.00$  & $33.74$ & $21.19$ & $26.03$  & $12.65$ \\
\textsc{BaselineStrMatchClassifier} &  $48.83$  & $41.84$ & $27.95$ & $33.52$  & $31.84$ & $31.78$ & $31.81$  & $22.26$ \\
\textsc{BaselineStrMatchProClassifier} &  $48.83$  & $33.73$ & $42.94$ & $37.78$  & $27.05$ & $39.21$ & $32.02$  & $25.64$ \\
\textsc{BaselineHeadMatchClassifier} &  $48.83$  & $32.86$ & $36.85$ & $34.74$  & $29.22$ & $36.61$ & $32.50$  & $25.01$ \\
\textsc{BaselineHeadMatchProClassifier} &  $48.83$  & $33.73$ & $42.94$ & $37.78$  & $27.05$ & $39.21$ & $32.02$  & $25.64$ \\


In [22]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)

\textsc{BaselineAllInOneClassifier} &  $34.00$  & $15.86$ & $54.01$ & $24.52$  & $0.91$ & $46.34$ & $1.78$  & $8.31$ \\
\textsc{BaselineAllSingletonsClassifier} &  $34.00$  & $0.00$ & $0.00$ & $0.00$  & $23.49$ & $14.51$ & $17.94$  & $11.14$ \\
\textsc{BaselineStrMatchClassifier} &  $34.00$  & $28.92$ & $17.97$ & $22.17$  & $21.73$ & $20.43$ & $21.06$  & $17.41$ \\
\textsc{BaselineStrMatchProClassifier} &  $34.00$  & $20.75$ & $25.40$ & $22.84$  & $17.99$ & $23.85$ & $20.51$  & $18.82$ \\
\textsc{BaselineHeadMatchClassifier} &  $34.00$  & $19.81$ & $21.72$ & $20.72$  & $20.11$ & $21.88$ & $20.96$  & $18.55$ \\
\textsc{BaselineHeadMatchProClassifier} &  $34.00$  & $20.75$ & $25.40$ & $22.84$  & $17.99$ & $23.85$ & $20.51$  & $18.82$ \\


In [23]:
scores, groups, chains_base = BaselineHeadMatchProClassifier(scorer_path).score(rucoref_test, 
                                                                                pred_mentions_gold_bound, 
                                                                                pred_groups_gold_bound, 
                                                                                metrics=('muc',), heads_only=False)

In [24]:
coref_utils.print_chains_in_text(rucoref_test, 1, chains_base, pred_mentions_gold_bound)

-- SYS --
Неприятности:неприятность(Ncfpnn, 5)
разные неприятности:разный неприятность(Ncfpnn, 1572)

старом деревенском доме:старый деревенский дом(Ncmsln, 52)
дом:дом(Ncmsan, 2780)
домом:дом(Ncmsin, 2976)
дома:дом(Ncmsgn, 3969)
старый дом:старый дом(Ncmsnn, 4080)
дом:дом(Ncmsan, 4867)

Фунтик:фунтик(Ncmsnn, 104)
Фунтик:фунтик(Npmsny, 2051)
Фунтик:фунтик(Npmsny, 2702)
Фунтик:фунтик(Npmsny, 2954)

Фунтика:фунтика(Ncfsnn, 112)
Фунтика:фунтика(Npmsay, 697)
Фунтика:фунтика(Ncmsgn, 1089)
Фунтика:фунтика(Npmsay, 2209)
Фунтика:фунтика(Npmsay, 2632)
его:он(P-3msan, 2652)

черный кот Степан:черный кот степан(Ncmsny, 154)
всех котов:весь кот(Ncmpgy, 1121)

несколько часов:несколько часы(Nc--a, 2982)
нескольку часов:несколько часы(Nc--d, 3459)

вечеру:вечер(Ncmsdn, 3011)
вечерам:вечер(Ncmpdn, 4838)

забор:забор(Ncmsnn, 1058)
высокому забору:высокий забор(Ncmsdn, 1373)

обслюненной лапой:обслюненной лапа(Ncfsin, 300)
себя:себя(P----gn, 320)

ухом:ухо(Ncnsin, 328)
ухо:ухо(Ncnsan, 438)
Одно ухо у н

In [None]:
rucoref_train.export_conll('kek.txt')

In [None]:
rm kek

In [25]:
rucoref_train.export_close_conll("kek_closed.txt")