In [1]:
import os
import csv
import json
import numpy as np
import pandas as pd
import intervaltree

from collections import defaultdict
from numpy import linalg as la, random as rnd




In [2]:
#!mkdir Corpus-2015/formatted_data

In [3]:
#!mkdir Corpus-2015/formatted_data/data_raw
#!mkdir Corpus-2015/formatted_data/gold

In [10]:
file_path = 'Corpus-2015/parsed_testset/fiction/102_beliajev_nad_bezdnoj.conll'
groups_path = 'Corpus-2015/Groups.txt'
tokens_path = 'Corpus-2015/Tokens.txt'

text_out_path = 'Corpus-2015/Texts.txt'

path_data_raw = 'Corpus-2015/formatted_data/data_raw'
path_gold = 'Corpus-2015/formatted_data/gold'
formatted_file_fname = "train"

DEBUG_SEED = 42
DEFAULT_CSV_PARAMS = {
    "sep": '\t',
    "quoting": csv.QUOTE_NONE
}
FNAMES = ["Corpus-2015/Tokens.txt", "Corpus-2015/Groups.txt"]
TRAIN_RATE = 0.8
DEV_RATE = 0.2
SPLIT_COLUMN = "doc_id"

In [23]:
def read_pandas_csv(fname, **csv_params):
    return pd.read_csv(fname, **csv_params)


def write_pandas_csv(df, fname, **csv_params):
    df.to_csv(fname, index=False, **csv_params)
    return None


def train_test_split(data, train_rate=TRAIN_RATE, split_column=SPLIT_COLUMN, seed=None):
    multiple_df = True
    if train_rate <= 0.0 or train_rate >= 1.0:
        raise ValueError("train_rate must lie in (0, 1), but got {}".format(train_rate))
    if isinstance(data, pd.DataFrame):
        multiple_df = False
        data = (data,)
    for df in data:
        if split_column not in df.columns:
            raise ValueError("split_column {} must be valid column in dataframe, but got {} only"\
                             .format(split_column, list(df.columns)))
    unique_df_indices = set(data[0][split_column].unique())
    for df in data[1:]:
        if unique_df_indices != set(df[split_column].unique()):
            raise ValueError("all dataframes must have equal unique indices.")
    
    unique_df_indices = data[0][split_column].unique()
    if seed is not None:
        prng = rnd.RandomState(seed)
    else:
        prng = rnd.RandomState()
    
    train_mask = np.array(prng.binomial(1.0, train_rate, size=len(unique_df_indices)), dtype=np.bool)
    train_indices = unique_df_indices[train_mask]
    test_indices = unique_df_indices[~train_mask]
    train_data, test_data = (), ()
    for df in data:
        train_data += (df.loc[df[split_column].isin(train_indices)], )
        test_data += (df.loc[df[split_column].isin(test_indices)], )
    return (train_data, test_data) if multiple_df else (train_data[0], test_data[0])

def save_train_test(train_data, test_data, fnames, **csv_params):
    multiple_df = True
    if isinstance(train_data, pd.DataFrame):
        multiple_df = False
        train_data, test_data, fnames = (train_data,), (test_data,), (fnames,)
    names, exts = tuple(".".join(fname.split(".")[:-1]) for fname in fnames),\
                    tuple(fname.split(".")[-1] for fname in fnames)
    for train_df, test_df, name, ext in zip(train_data, test_data, names, exts):
        write_pandas_csv(train_df, fname=".".join([name, "train", ext]), **csv_params)
        write_pandas_csv(test_df, fname=".".join([name, "test", ext]), **csv_params)
    return None

def save_train_dev_test(train_data, dev_data, test_data, fnames, **csv_params):
    multiple_df = True
    if isinstance(train_data, pd.DataFrame):
        multiple_df = False
        train_data, dev_data, test_data, fnames = (train_data,), (dev_data,) (test_data,), (fnames,)
    names, exts = tuple(".".join(fname.split(".")[:-1]) for fname in fnames),\
                    tuple(fname.split(".")[-1] for fname in fnames)
    for train_df, dev_df, test_df, name, ext in zip(train_data, dev_data, test_data, names, exts):
        write_pandas_csv(train_df, fname=".".join([name, "train", ext]), **csv_params)
        write_pandas_csv(dev_df, fname=".".join([name, "dev", ext]), **csv_params)  
        write_pandas_csv(test_df, fname=".".join([name, "test", ext]), **csv_params)
    return None

def train_test_file_partition(fnames,
                              train_rate=TRAIN_RATE,
                              dev_rate=DEV_RATE,
                              split_column=SPLIT_COLUMN,
                              seed=DEBUG_SEED,
                              verbose=False,
                              **csv_params):
    if not csv_params:
        global DEFAULT_CSV_PARAMS
        csv_params = DEFAULT_CSV_PARAMS
    data = tuple(read_pandas_csv(fname, **DEFAULT_CSV_PARAMS) for fname in fnames)
    train_data, test_data = train_test_split(data, train_rate=train_rate, split_column=split_column, seed=seed)
    if dev_rate and dev_rate < 1.0 and dev_rate > 0:
        train_data, dev_data = train_test_split(train_data, train_rate = 1.0 - dev_rate, split_column=split_column, seed=seed)
        save_train_dev_test(train_data, dev_data, test_data, fnames, **csv_params)
        if verbose:
            for train_df, dev_df, test_df, fname in zip(train_data, dev_data, test_data, fnames):
                print("Splitting for file {}:".format(fname))
                for df, dname in zip([train_df, dev_df, test_df], ["train", "dev", "test"]):
                    print("{} contains {} records.".format(dname, len(df[split_column].unique())))
    else:
        save_train_test(train_data, test_data, fnames, **csv_params)
        if verbose:
            for train_df, test_df, fname in zip(train_data, test_data, fnames):
                print("Splitting for file {}:".format(fname))
                for df, dname in zip([train_df, test_df], ["train", "test"]):
                    print("{} contains {} records.".format(dname, len(df[split_column].unique())))
    return None

In [25]:
train_test_file_partition(FNAMES,
                          train_rate=TRAIN_RATE,
                          dev_rate=DEV_RATE,
                          split_column=SPLIT_COLUMN,
                          seed=DEBUG_SEED,
                          verbose=True)

Splitting for file Corpus-2015/Tokens.txt:
train contains 115 records.
dev contains 31 records.
test contains 35 records.
Splitting for file Corpus-2015/Groups.txt:
train contains 115 records.
dev contains 31 records.
test contains 35 records.


In [14]:
!head {file_path}

1	Во	во	S	S	Sp-a	9	обст	_	_
2	время	время	N	N	Ncnsan	1	предл	_	_
3	своих	свой	P	P	P---pga	4	опред	_	_
4	прогулок	прогулка	N	N	Ncfpgn	2	1-компл	_	_
5	в	в	S	S	Sp-l	4	атриб	_	_
6	окрестностях	окрестность	N	N	Ncfpln	5	предл	_	_
7	Симеиза	симеиз	N	N	Ncmsgn	6	квазиагент	_	_
8	я	я	P	P	P-1-snn	9	предик	_	_
9	обратил	обратить	V	V	Vmis-sma-p	0	ROOT	_	_
10	внимание	внимание	N	N	Ncnsan	9	1-компл	_	_


In [15]:
!head -n 100 {groups_path}

doc_id	variant	group_id	chain_id	link	shift	length	content	tk_shifts	attributes	head	hd_shifts
1	1	407840	1070	0	9	5	своих	9	ref:def|str:refl|type:coref		
1	1	407839	1070	407840	47	1	я	47	ref:def|str:pron|type:coref		
1	1	407842	1069	0	69	13	одинокую дачу	69,78	ref:def|str:noun|type:coref	дачу	78
1	1	407841	1069	407842	118	9	этой даче	118,123	ref:def|str:noun|type:coref	даче	123
1	1	407843	1069	407841	166	3	она	166	ref:def|str:pron|type:coref		
1	1	407846	1067	0	184	15	высоким забором	184,192	ref:def|str:noun|type:coref	забором	192
1	1	407844	1068	0	203	28	единственной низкой калиткой	203,216,223	ref:def|str:noun|type:coref	калиткой	223
1	1	407845	1068	407844	233	7	которая	233	ref:def|str:rel|type:coref		
1	1	407847	1067	407846	316	7	забором	316	ref:def|str:noun|type:coref		
1	1	407848	1069	407843	332	4	дачи	332	ref:def|str:noun|type:coref		
1	1	407975	1054	0	337	28	голые уступы желтоватых скал	337,343,350,361	ref:def|str:noun|type:coref	уступы	343
1	1	407850	10

In [16]:
!head -n 30 {tokens_path}

doc_id	shift	length	token	lemma	gram
1	0	2	Во	во	Sp-a
1	3	5	время	время	Ncnsan
1	9	5	своих	свой	P---pga
1	15	8	прогулок	прогулка	Ncfpgn
1	24	1	в	в	Sp-l
1	26	12	окрестностях	окрестность	Ncfpln
1	39	7	Симеиза	симеиза	Ncmsgn
1	47	1	я	я	P-1-snn
1	49	7	обратил	обратить	Vmis-sma-p
1	57	8	внимание	внимание	Ncnsan
1	66	2	на	на	Sp-a
1	69	8	одинокую	одинокий	Afpfsaf
1	78	4	дачу	дача	Ncfsan
1	82	1	,	,	,
1	84	8	стоявшую	стоять	Vmps-sfa-ea
1	93	2	на	на	Sp-l
1	96	6	крутом	крутой	Afpmslf
1	103	6	склоне	склон	Ncmsln
1	110	4	горы	гора	Ncfpan
1	114	1	.	.	SENT
1	116	1	К	к	Sp-l
1	118	4	этой	этот	P--fsla
1	123	4	даче	дача	Ncfsln
1	128	2	не	не	Q
1	131	4	было	быть	Vmis-sna-e
1	136	9	проведено	провести	Vmps-snpsp
1	146	4	даже	даже	Q
1	151	6	дороги	дорогой	Afpmpns
1	157	1	.	.	SENT


In [17]:
#!grep "\.\.\." {tokens_path}

In [9]:
def get_all_texts_from_tokens_file(tokens_path, out_path):
    text_count = 0
    lengths = {}
    texts = {}
    # determine number of texts and their lengths
    with open(tokens_path, "r") as tokens_file:
        header = tokens_file.readline()[:-1]
        for line in tokens_file:
            doc_id, shift, length, token, lemma, gram = line[:-1].split('\t')
            doc_id, shift, length = map(int, (doc_id, shift, length))
            lengths[doc_id] = shift + length
    text_count = len(lengths)
    
    texts = {doc_id: [' ']*length for (doc_id, length) in lengths.items()}
    # read texts
    with open(tokens_path, "r") as tokens_file:
        header = tokens_file.readline()[:-1]
        for line in tokens_file:
            doc_id, shift, length, token, lemma, gram = line[:-1].split('\t')
            doc_id, shift, length = map(int, (doc_id, shift, length))
            texts[doc_id][shift:shift + length] = token
    for doc_id in texts:
        texts[doc_id] = "".join(texts[doc_id])
    
    with open(out_path, "w") as out_file:
        for doc_id in texts:
            out_file.write(texts[doc_id])
            out_file.write("\n")
    return None

In [34]:
get_all_texts_from_tokens_file(tokens_path, text_out_path)

In [25]:
for fname, new_fname in zip(["Texts.txt", "lenta_texts.txt"], ["tokenized_rucor.txt", "tokenized_lenta.txt"]):

    with open(fname, "r") as f:
        with open(new_fname, "w") as nf:
            for line in f:
                newline = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-])\s*", r"\1 ", line)
                nf.write(newline.rstrip())
                nf.write("\n")

'- Здравствуйте, Елена Сергеевна!..    Старая учительница вздрогнула и подняла глаза. Перед нею стоял невысокий молодой человек. Он смотрел на нее весело и тревожно, и она, увидев это смешное мальчишеское выражение глаз, сразу узнала его.    - Дементьев, - сказала она радостно. - Ты ли это?    - Это я, - сказал человек, - можно сесть?    Она кивнула, и он уселся рядом с нею.    - Как же ты поживаешь, Дементьев, милый?    - Работаю, - сказал он, - в театре. Я актер. Актер на бытовые роли, то, что называется "характерный". А работаю много! Ну, а вы? Как вы-то поживаете?    - Я по-прежнему, - бодро сказала она, - прекрасно! Веду четвертый класс, есть просто удивительные ребята. Интересные, талантливые... Так что все великолепно!    Она помолчала и вдруг сказала упавшим голосом:    - Мне комнату новую дали... В двухкомнатной квартире... Просто рай...    Что-то в ее голосе насторожило Дементьева.    - Как вы это странно произнесли, Елена Сергеевна, - сказал он, - невесело как-то... Что, мал

'abra'

In [4]:
class Mention():
    def __init__(self):
        self.raw_string = None
        self.doc_id = None
        self.variant = None
        self.group_id = None
        self.chain_id = None
        self.link = None
        self.shift = None
        self.length = None
        self.raw_content = None
        self.content = None
        self.tk_shifts = None
        self.attributes = None
        self.is_head = None
        self.raw_head = None
        self.head = None
        self.head_length = None
        self.head_shift = None
        self.head_shifts = None
        
        self.sentence = None
        self.sentence_shifts = None
        self.head_sentence = None
        self.head_sentence_shifts = None
        
    @staticmethod
    def pair_feature_names():
        return ["exact-string-match"]
        
    @staticmethod
    def parse(data):
        return Mention()._parse(data)
    
    def _parse(self, data):
        self.raw_string = data if data[-1] != "\n" else data[:-1]
        splitted_data = data.split('\t')
        always_numeric_data = splitted_data[:7]
        self.content, self.tk_shifts, self.attributes, self.head, self.head_shifts = splitted_data[7:]
        self.doc_id, self.variant, self.group_id, self.chain_id, self.link, self.shift, self.length = map(int, always_numeric_data)
        
        mention_len = len(self.tk_shifts.split(","))
        self.is_head = (self.head != "")
        multiple_mention = mention_len > 1
        multiple_head = self.is_head and len(self.head_shifts.split(",")) > 1

        self.raw_content = self.content
        self.raw_head = self.head
        length = len(self.raw_content)
        head_length = len(self.raw_head)
        if multiple_mention:
            self.tk_shifts = list(map(int, self.tk_shifts.split(',')))
            zero_shifts = list(map(lambda x: x - self.shift, self.tk_shifts)) + [length]
            self.content = [self.content[sft:sft_next].split(" ")[0] for (sft, sft_next) in zip(zero_shifts[:-1], zero_shifts[1:])]
        else:
            self.content = [self.content]
            self.tk_shifts = [int(self.tk_shifts)]

        if multiple_head:
            self.head_shifts = list(map(int, self.head_shifts.split(',')))
            zero_shifts = list(map(lambda x: x - self.head_shifts[0], self.head_shifts)) + [len(self.head)]
            self.head = [self.head[sft:sft_next].split(" ")[0] for (sft, sft_next) in zip(zero_shifts[:-1], zero_shifts[1:])]
        elif self.is_head:
            self.head_shifts = [int(self.head_shifts)]
            self.head_shift = self.head_shifts[0]
            self.head = [self.head]
        else:
            self.head_shifts = self.tk_shifts
            self.head = self.content
            self.raw_head = self.raw_content
            self.head_length = self.length
            self.head_shift = self.head_shifts[0]
        return self
    
    def get_pairwise_features(self, other):
        return [self.exact_string_match_ohe(other)]
    
    def exact_string_match_ohe(self, other):
        return 1 if self.raw_content == other.raw_content else 0
    
    def relaxed_repr(self):
        if not self.is_head:
            rel_repr = self.raw_content
        else:
            rel_repr = self.raw_content[:self.head_shifts[0] - self.tk_shifts[0] + self.head_length]
        return rel_repr
               
    def relaxed_head_match(self, other):
        return self.relaxed_repr() == other.relaxed_repr()
    
    def length_in_tokens(self):
        return len(self.tk_shifts)
    
    def head_length_in_tokens(self):
        return len(self.tk_shifts) if not self.is_head else len(self.head_shifts)
    
    def absorb_sentence_and_shifts(self, sentence, sentence_shifts):
        self.sentence = sentence
        self.sentence_shifts = sentence_shifts
    
    def absorb_head_sentence_and_shifts(self, sentence, sentence_shifts):
        self.head_sentence = sentence
        self.head_sentence_shifts = sentence_shifts
    
    def start_index(self):
        token_shift = self.tk_shifts[0]
        if token_shift > self.sentence_shifts[-1]:
            return -1
        else:
            for i, sentence_shift in enumerate(self.sentence_shifts):
                if sentence_shift == token_shift:
                    assert(self.sentence[i] == self.content[0])
                    return i
        raise ValueError("Inside Mention::start_index: Wrong shift {} for sentence with shifts {}".format(token_shift, self.sentence_shifts))
    
    def head_consistent(self):
        if not all([(ss == hss) for (ss, hss) in zip(self.sentence_shifts, self.head_sentence_shifts)]):
            return False
        else:
            return True
    
    def end_index(self):
        start_index = self.start_index()
        return start_index + len(self.content) if start_index != -1 else -1
    
    def head_index(self):
        token_shift = self.head_shifts[0]
        if token_shift > self.sentence_shifts[-1]:
            return -1
        else:
            for i, sentence_shift in enumerate(self.sentence_shifts):
                if sentence_shift == token_shift:
                    assert(self.sentence[i] == self.head[0])
                    return i
        raise ValueError("Inside Mention::start_index: Wrong shift {} for sentence with shifts {}"\
                         .format(token_shift, self.sentence_shifts))

    def __repr__(self):
        return self.raw_string

In [5]:
class DataExporter():
    def __init__(self, path, out_path, dataset_type="train"):
        self.data = defaultdict(lambda: {
                "shift_to_form": {},
                "shift_to_lemma": {},
                "sentences": [],
                "sentences_shifts": [],
                "document_features": {},
                "shift_to_sentence_index": {},
                "shift_to_token_index_in_sentence": {},
                "mentions": {}
            })
        self.temp_data = None
        self.doc_count = None
        self.chains = None
        self.path = path
        self.out_path = out_path
        self.gold_dir = "gold"
        self.data_raw_dir = "data_raw"
        self.tokens_ext = "txt"
        self.groups_ext = "txt"
        self.tokens_fname = "Tokens"
        self.groups_fname = "Groups"
        self.dataset_type = dataset_type
        self.scope_trees = defaultdict(lambda: intervaltree.IntervalTree())
        
        self.tokens_proceeded = False
        self.groups_proceeded = False
    
    def process_tokens(self):
        self.temp_data = defaultdict(lambda: {
            "is_prev_sent": True  #info for each line if there was "SENT" (sentence end) token before it
        })
        
        self.doc_count = 0
        
        tokens_path = os.path.join(self.path, ".".join([self.tokens_fname, self.dataset_type, self.tokens_ext]))
        with open(tokens_path, "r") as tokens_file:
            header = next(tokens_file)[:-1]
            for line in tokens_file:
                doc_id, shift, length, token, lemma, gram = line[:-1].split('\t')
                doc_id, shift, length = map(int, (doc_id, shift, length))
                
                doc_data = self.data[doc_id]
                temp_doc_data = self.temp_data[doc_id]
                
                
                if not "doc_id" in doc_data["document_features"]:
                    doc_data["document_features"]["doc_id"] = doc_id
                    # DO SOMETHING WITH THIS HARDCODE
                    doc_data["document_features"]["source"] = "bc"
                    doc_data["document_features"]["type"] = 0
                
                doc_data["shift_to_form"][shift] = token
                doc_data["shift_to_lemma"][shift] = lemma
                sentences = doc_data["sentences"]
                sentences_shifts = doc_data["sentences_shifts"]

                if temp_doc_data["is_prev_sent"] is True:
                    sentence = []
                    sentence_shifts = []
                    sentences.append(sentence)
                    sentences_shifts.append(sentence_shifts)
                else:
                    sentence = sentences[-1]
                    sentence_shifts = sentences_shifts[-1]
                doc_data["shift_to_sentence_index"][shift] = len(sentences) - 1
                doc_data["shift_to_token_index_in_sentence"][shift] = len(sentence)
                sentence.append(token)
                sentence_shifts.append(shift)

                temp_doc_data["is_prev_sent"] = True if gram == "SENT" else False
        del self.temp_data
        self.temp_data = None
        self.tokens_proceeded = True
        return self
        
    def get_sentence_index_by_token_shift(self, doc_id, shift):
        return self.data[doc_id]["shift_to_sentence_index"][shift]
    
    def get_sentence_by_token_shift(self, doc_id, shift):
        return self.data[doc_id]["sentences"][self.get_sentence_index_by_token_shift(doc_id, shift)]
    
    def get_sentence_shifts_by_token_shift(self, doc_id, shift):
        return self.data[doc_id]["sentences_shifts"][self.get_sentence_index_by_token_shift(doc_id, shift)]
    
    def get_mention_sentence(self, mention):
        return self.get_sentence_by_token_shift(mention.doc_id, mention.shift)

    def get_mention_sentence_shifts(self, mention):
        return self.get_sentence_shifts_by_token_shift(mention.doc_id, mention.shift)
    
    def get_mention_head_sentence(self, mention):
        return self.get_sentence_by_token_shift(mention.doc_id, mention.head_shifts[0])

    def get_mention_head_sentence_shifts(self, mention):
        return self.get_sentence_shifts_by_token_shift(mention.doc_id, mention.head_shifts[0])
    
    def get_mention_sentence_info(self, mention):
        return (self.get_mention_sentence(mention), self.get_mention_sentence_shifts(mention))
    
    def get_mention_head_sentence_info(self, mention):
        return (self.get_mention_head_sentence(mention), self.get_mention_head_sentence_shifts(mention))
    
    def alert_merge_two_sentences(self, inconsistent_mention):
        doc_dict = self.data[inconsistent_mention.doc_id]
        while not inconsistent_mention.head_consistent:
            sentences, sentences_shifts = doc_dict["sentences"], doc_dict["sentences_shifts"]
            sent_index, alert_sent_index = self.get_sentence_index_by_token_shift(inconsistent_mention.shift),\
                                            self.get_sentence_index_by_token_shift(inconsistent_mention.head_shift)
            assert(sent_index < alert_sent_index)
            sentences = sentences[:sent_index] \
                        + [sum(sentences[sent_index:alert_sent_index + 1], [])]\
                        + sentences[sent_index + 1:alert_sent_index]\
                        + sentences[alert_sent_index + 1:]
            sentences_shifts = sentences_shifts[:sent_index] \
                        + [sum(sentences_shifts[sent_index:alert_sent_index + 1], [])]\
                        + sentences_shifts[sent_index + 1:alert_sent_index]\
                        + sentences_shifts[alert_sent_index + 1:]
            print("merged from {} to {} sentences".format(sent_index, alert_sent_index))
            doc_dict["sentences"], doc_dict["sentences_shifts"] = sentences, sentences_shifts
            inconsistent_mention.absorb_sentence_and_shifts(self.get_mention_sentence_info(inconsistent_mention))
            inconsistent_mention.absorb_head_sentence_and_shifts(self.get_mention_head_sentence_info(inconsistent_mention))
        return
    
    def process_groups(self):
        self.temp_data = defaultdict(lambda: {
                "mention_counter": 0,
                "chain_id": {},
                "mention_by_num": {}
            })
        self.chains = defaultdict(lambda: defaultdict(lambda: []))

        groups_path = os.path.join(self.path, ".".join([self.groups_fname, self.dataset_type, self.groups_ext]))
        
        unique_mention_types = set()

        with open(groups_path, "r") as groups_file:
            header = groups_file.readline()[:-1]
            for line in groups_file:
                mention = Mention.parse(line)

                doc_data = self.data[mention.doc_id]
                temp_doc_data = self.temp_data[mention.doc_id]

                mention_counter = temp_doc_data["mention_counter"]
                temp_doc_data["mention_by_num"][mention_counter] = mention
                mention.absorb_sentence_and_shifts(*self.get_mention_sentence_info(mention))
                mention.absorb_head_sentence_and_shifts(*self.get_mention_head_sentence_info(mention))
                
                if not mention.head_consistent():
                    print("Warning: mention {}\n is head inconsistent".format(mention))
                    self.alert_merge_two_sentences(mention)
                head_index = mention.head_index()
                
                mention_type = mention.attributes.split("|")[1].split(":")[-1] if mention.attributes else "undef"
                mention_type = "undef" if mention_type == "" else mention_type
                
                self.scope_trees[mention.doc_id][mention.shift:mention.shift + mention.length] = mention
                
                doc_data["mentions"][str(mention_counter)] = {
                    "doc_id": mention.doc_id,
                    "mention_id": mention_counter,
                    "mention_num": mention_counter,
                    "start_index": mention.start_index(),
                    "end_index": mention.end_index(),
                    "head_index": mention.head_index(),
                    "mention_type": mention_type,
                    "sent_num": self.get_sentence_index_by_token_shift(mention.doc_id, mention.shift),
                    "sentence": mention.sentence
                }                
                
                self.chains[str(mention.doc_id)][mention.chain_id].append(mention_counter)
                unique_mention_types.add(mention_type)
                
                temp_doc_data["mention_counter"] += 1

        for doc_id, doc_data in self.data.items():
            temp_doc_data = self.temp_data[doc_id]
            mention_indices = sorted([val["mention_num"] for val in  doc_data["mentions"].values()])
            labels = {}
            pair_feature_names = Mention.pair_feature_names()
            pair_features = {}
            for mid in mention_indices:
                mention = temp_doc_data["mention_by_num"][mid]
                intervals = self.scope_trees[doc_id][mention.shift:mention.shift + mention.length]
                containment_flag = False
                for interval in intervals:
                    if (interval[0] <= mention.shift and interval[1] > mention.shift + mention.length) \
                            or (interval[0] < mention.shift and interval[1] >= mention.shift + mention.length):
                        containment_flag = True
                
                doc_data["mentions"][str(mid)]["contained-in-other-mention"] = 1 if containment_flag else 0                        
            
            for ind1 in range(len(mention_indices)):
                for ind2 in range(ind1 + 1, len(mention_indices)):
                    mid1, mid2 = mention_indices[ind1], mention_indices[ind2]
                    mention1, mention2 = temp_doc_data["mention_by_num"][mid1], temp_doc_data["mention_by_num"][mid2]
                    labels["{} {}".format(mid1, mid2)] = 1 if mention1.chain_id == mention2.chain_id else 0
                    pair_features["{} {}".format(mid1, mid2)] = mention1.get_pairwise_features(mention2)
            doc_data["labels"] = labels
            doc_data["pair_features"] = pair_features
            doc_data["pair_feature_names"] = pair_feature_names
            
        del self.temp_data
        
        allowed_keys = ["sentences", "mentions", "document_features", "labels", "pair_feature_names", "pair_features"]
        for doc_id in self.data:
            self.data[doc_id] = {key: self.data[doc_id][key] for key in allowed_keys}
        self.groups_proceeded = True
        print("Uniq mention types: {}".format(unique_mention_types))
        return self
    
    def dump(self):
        gold_dir_path = os.path.join(self.out_path, self.gold_dir)
        if not os.path.exists(gold_dir_path):
            os.makedirs(gold_dir_path)
        with open(os.path.join(gold_dir_path, self.dataset_type), 'w') as gold:
            for str_doc_id in sorted(self.chains, key=int):
                json.dump({str_doc_id: [self.chains[str_doc_id][group_id] for group_id in self.chains[str_doc_id]]}, gold)
                gold.write("\n")
        
        data_raw_dir_path = os.path.join(self.out_path, self.data_raw_dir)
        if not os.path.exists(data_raw_dir_path):
            os.makedirs(data_raw_dir_path)
        with open(os.path.join(data_raw_dir_path, self.dataset_type), 'w') as data_raw:
            for doc_id in sorted(self.data, key=int):
                json.dump(self.data[doc_id], data_raw)
                data_raw.write("\n")
        return None
    
    def __repr__(self):
        return "Type: {}\nTokens proceeded: {}\nGroups proceeded: {}".format(self.dataset_type, self.tokens_proceeded, self.groups_proceeded)

In [6]:
class ExportManager():
    def __init__(self, path, out_path, dataset_types, verbose=True):
        self.dataset_types = dataset_types
        self.path = path
        self.out_path=out_path
        self.exporters = []
        self.verbose = verbose

        self.initialize()
    
    def initialize(self):
        for dataset_type in self.dataset_types:
            self.exporters.append(DataExporter(self.path, self.out_path, dataset_type))

    def process(self):
        for exporter in self.exporters:
            exporter.process_tokens().process_groups()
            if self.verbose:
                print("next exporter is done.")
                print(exporter)
                print("-"*80)
            
    def export(self):
        for exporter in self.exporters:
            exporter.dump()

In [142]:
export_manager = ExportManager(path="Corpus-2015/", out_path="Corpus-2015-formatted/", dataset_types=["train", "dev", "test"])

In [143]:
export_manager.process()

 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
Uniq mention types: {'poss', 'pron', 'undef', 'rel', 'noun', 'refl', 'dem', 'appo'}
next exporter is done.
Type: train
Tokens proceeded: True
Groups proceeded: True
--------------------------------------------------------------------------------
Uniq mention types: {'poss', 'pron', 'undef', 'rel', 'noun', 'refl', 'dem', 'def', 'appo'}
next exporter is done.
Type: dev
Tokens proceeded: True
Groups proceeded: True
--------------------------------------------------------------------------------
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
Uniq mention types: {'poss', 'pron', 'rel

In [144]:
export_manager.export()

In [133]:
!head -n 1 Corpus-2015-formatted/data_raw/dev

{"mentions": {"12": {"sentence": ["\u041e\u043d", "\u043e\u0433\u043b\u044f\u043d\u0443\u043b\u0441\u044f", "\u0438", "\u0437\u0430\u043c\u0435\u0440", "\u0441", "\u043b\u0430\u043f\u043e\u0439", ",", "\u0437\u0430\u043b\u043e\u0436\u0435\u043d\u043d\u043e\u0439", "\u0437\u0430", "\u0443\u0445\u043e", "."], "start_index": 9, "mention_type": "noun", "doc_id": 3, "sent_num": 5, "mention_num": 12, "contained-in-other-mention": 0, "head_index": 9, "end_index": 10}, "60": {"sentence": ["\u041f\u0435\u0442\u0443\u0445", "\u043f\u043e\u0431\u0435\u0434\u043d\u043e", "\u0437\u0430\u0445\u043b\u043e\u043f\u0430\u043b", "\u043a\u0440\u044b\u043b\u044c\u044f\u043c\u0438", ",", "\u043f\u043e\u0434\u043d\u044f\u043b", "\u0433\u0443\u0441\u0442\u0443\u044e", "\u043f\u044b\u043b\u044c", ",", "\u043a\u043b\u044e\u043d\u0443\u043b", "\u0440\u0430\u0437\u043c\u043e\u043a\u0448\u0443\u044e", "\u043a\u043e\u0440\u043a\u0443", "\u0438", "\u0441", "\u043e\u0442\u0432\u0440\u0430\u0449\u0435\u043d\u0438\u043

In [20]:
exporter = DataExporter(tokens_path, groups_path)

In [21]:
exporter.process_tokens().process_groups()

 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent
 is head inconsistent


Tokens proceeded: True
Groups proceeded: True

In [22]:
exporter.dump(path_data_raw, path_gold, formatted_file_fname)

In [None]:
file_path = 'Corpus-2015/parsed_testset/fiction/102_beliajev_nad_bezdnoj.conll'
groups_path = 'Corpus-2015/Groups.txt'
tokens_path = 'Corpus-2015/Tokens.txt'

text_out_path = 'Corpus-2015/Texts.txt'

path_data_raw = 'Corpus-2015/formatted_data/data_raw'
path_gold = 'Corpus-2015/formatted_data/gold'
formatted_file_fname = "train"

DEBUG_SEED = 42

FNAMES = ["Corpus-2015/Tokens.txt", "Corpus-2015/Groups.txt"]
TRAIN_RATE = 0.8
DEV_RATE = 0.2
SPLIT_COLUMN = "doc_id"





class KFolder():
    def __init__(self, run_path, init_data_path, split_column="doc_id", train_rate=0.8, dev_rate=0.2, n_folds=10, seeds=None):
        # constants
        self.RUN_DIR = "run"
        self.DATA_DIR = "run_data"
        self.CSV_PARAMS = {
            "sep": '\t',
            "quoting": csv.QUOTE_NONE
        }
        self.TOKENS_FNAME = "Tokens.txt"
        self.GROUPS_FNAME = "Groups.txt"
        self.FNAMES = [self.TOKENS_FNAME, self.GROUPS_FNAME]
        
        self.FOLD_DIR_MASK = "fold"
        
        # args
        self.run_path = run_path
        self.init_data_path = init_data_path
        self.split_column = split_column
        self.train_rate = train_rate
        self.dev_rate = dev_rate
        self.n_folds = n_folds
        self.seeds = seeds if seeds is not None else [None]*n_folds
        
        self.data_save_mask = os.path.join(run_path, self.DATA_DIR)
        self.code_run_mask = os.path.join(run_path, self.RUN_DIR)
        for i in range(n_folds):
            os.mkdirs(os.path.join(self.data_save_mask, (self.FOLD_DIR_MASK + str(i))))
            os.mkdirs(os.path.join(self.code_run_mask, (self.FOLD_DIR_MASK + str(i))))

        
        # initialization
        self.data = None
        
        self.load_data()
        
    def data_fold_path(self, i):
        return os.path.join(self.data_save_mask, (self.FOLD_DIR_MASK + str(i)))
    
    def run_fold_path(self, i):
        return os.path.join(self.code_run_mask, (self.FOLD_DIR_MASK + str(i)))

        
    def read_pandas_csv(self, fname):
        return pd.read_csv(fname, **self.CSV_PARAMS)


    def write_pandas_csv(self, df, fname):
        df.to_csv(fname, index=False, **self.CSV_PARAMS)
        return None
    
    def train_test_split(self, data, train_rate, seed=None):
        multiple_df = True
        if train_rate <= 0.0 or train_rate >= 1.0:
            raise ValueError("train_rate must lie in (0, 1), but got {}".format(train_rate))
        if isinstance(data, pd.DataFrame):
            multiple_df = False
            data = (data,)
        for df in data:
            if self.split_column not in df.columns:
                raise ValueError("split_column {} must be valid column in dataframe, but got {} only"\
                                 .format(split_column, list(df.columns)))
        unique_df_indices = set(data[0][split_column].unique())
        for df in data[1:]:
            if unique_df_indices != set(df[split_column].unique()):
                raise ValueError("all dataframes must have equal unique indices.")

        unique_df_indices = data[0][split_column].unique()
        if seed is not None:
            prng = rnd.RandomState(seed)
        else:
            prng = rnd.RandomState()

        train_mask = np.array(prng.binomial(1.0, train_rate, size=len(unique_df_indices)), dtype=np.bool)
        train_indices = unique_df_indices[train_mask]
        test_indices = unique_df_indices[~train_mask]
        train_data, test_data = (), ()
        for df in data:
            train_data += (df.loc[df[split_column].isin(train_indices)], )
            test_data += (df.loc[df[split_column].isin(test_indices)], )
        return (train_data, test_data) if multiple_df else (train_data[0], test_data[0])

    def save_train_test(self, train_data, test_data, fold_num):
        fnames = self.FNAMES
        multiple_df = True
        if isinstance(train_data, pd.DataFrame):
            multiple_df = False
            train_data, test_data, fnames = (train_data,), (test_data,), (fnames,)
        names, exts = tuple(".".join(fname.split(".")[:-1]) for fname in fnames),\
                        tuple(fname.split(".")[-1] for fname in fnames)
        for train_df, test_df, name, ext in zip(train_data, test_data, names, exts):
            write_pandas_csv(train_df, fname=".".join([self.data_fold_path(fold_num), name, "train", ext]), **self.CSV_PARAMS)
            write_pandas_csv(test_df, fname=".".join([self.data_fold_path(fold_num), name, "test", ext]), **self.CSV_PARAMS)
        return None

    def save_train_dev_test(self, train_data, dev_data, test_data, fold_num):
        fnames = self.FNAMES
        multiple_df = True
        if isinstance(train_data, pd.DataFrame):
            multiple_df = False
            train_data, dev_data, test_data, fnames = (train_data,), (dev_data,) (test_data,), (fnames,)
        names, exts = tuple(".".join(fname.split(".")[:-1]) for fname in fnames),\
                        tuple(fname.split(".")[-1] for fname in fnames)
        for train_df, dev_df, test_df, name, ext in zip(train_data, dev_data, test_data, names, exts):
            write_pandas_csv(train_df, fname=".".join([self.data_fold_path(fold_num), name, "train", ext]), **self.CSV_PARAMS)
            write_pandas_csv(dev_df, fname=".".join([self.data_fold_path(fold_num), name, "dev", ext]), **self.CSV_PARAMS)  
            write_pandas_csv(test_df, fname=".".join([self.data_fold_path(fold_num), name, "test", ext]), **self.CSV_PARAMS)
        return None
    
    def train_test_file_partition(self, fold_num, verbose=False):
        fnames = self.FNAMES
        data = tuple(read_pandas_csv(os.path.join(init_data_path, fname)) for fname in fnames)
        train_data, test_data = self.train_test_split(data,
                                                      train_rate=self.train_rate,
                                                      split_column=self.split_column,
                                                      seed=self.seeds[fold_num])
        if dev_rate and dev_rate < 1.0 and dev_rate > 0:
            train_data, dev_data = self.train_test_split(train_data,
                                                         train_rate = 1.0 - self.dev_rate,
                                                         split_column=self.split_column,
                                                         seed=seeds[fold_num])
            self.save_train_dev_test(train_data, dev_data, test_data, fold_num)
            if verbose:
                for train_df, dev_df, test_df, fname in zip(train_data, dev_data, test_data, fnames):
                    print("Fold {}. Splitting for file {}:".format(fold_num, fname))
                    for df, dname in zip([train_df, dev_df, test_df], ["train", "dev", "test"]):
                        print("{} contains {} records.".format(dname, len(df[split_column].unique())))
        else:
            self.save_train_test(train_data, test_data, fold_num)
            if verbose:
                for train_df, test_df, fname in zip(train_data, test_data, fnames):
                    print("Fold {}. Splitting for file {}:".format(fold_num, fname))
                    for df, dname in zip([train_df, test_df], ["train", "test"]):
                        print("{} contains {} records.".format(dname, len(df[split_column].unique())))
        return None

    def code_setup(self, fold_num):
        current_code_path = os.path.join(self.code_run_mask, (self.FOLD_DIR_MASK + str(i)))
        !git clone https://github.com/Nehoroshiy/deep-coref.git {current_code_path}
        current_code_path = os.path.join(current_code_path, "deep-coref")
        !mkdir {os.path.join(current_code_path)}
        prepared_data_path
        
    
    def process_data(self):
        for fold_num in range(self.n_folds):
            self.train_test_file_partition(fold_num, verbose=True)

    def process_code(self):
        for fold_num in range(self.n_folds):
            