In [110]:
# coreference model import and inference

from pathlib import Path
import os
from tibert import BertForCoreferenceResolution, predict_coref
from tibert.utils import pprint_coreference_document
from transformers import BertTokenizerFast

import pickle

import pandas as pd



In [114]:
class ProcessFile():
    """Label catalogue for the dataset plus a holder for three result frames.

    Attributes created by ``__init__``:
        sentence_type: list of the three sentence-type labels.
        command_type: list of the four command-type labels.
        tokenx_types: token labels used in the first token slot of a file name.
        tokeny_types: token labels used in the second token slot of a file name.
        embed_types: embedding labels used in a file name.
        sent_combination: every ``RX-<embed>-<x>-<y>.txt`` file name; the
            embedding label varies slowest and the y-token label fastest.

    ``df1``/``df2``/``df3`` only exist after ``set_files`` has been called.
    """

    def __init__(self):
        self.sentence_type = ["subject-m-object", "subject-e-subject", "subject-e-object"]
        self.command_type = ["OC-OP", "OC-XP", "XC-OP", "XC-XP"]

        self.tokenx_types = ["a", "the", "john", "he", "every", "some"]
        self.tokeny_types = ["a", "the", "john", "he"]
        self.embed_types = ["RY", "EY"]

        # Cartesian product of embed x tokenx x tokeny, rendered as file names.
        self.sent_combination = [
            f"RX-{etype}-{xtype}-{ytype}.txt"
            for etype in self.embed_types
            for xtype in self.tokenx_types
            for ytype in self.tokeny_types
        ]

    def set_files(self, df1, df2, df3):
        """Store the three objects that ``three_at_once`` will operate on."""
        self.df1 = df1
        self.df2 = df2
        self.df3 = df3

    def three_at_once(self, func):
        """Apply ``func`` to the three stored objects and return the results as a 3-tuple.

        Raises ``AttributeError`` if ``set_files`` has not been called first.
        """
        return func(self.df1), func(self.df2), func(self.df3)


class Processor():
    """Measures how often the coreference model links an annotated token pair.

    Each dataset file is expected to hold one example per line with three
    tab-separated fields: the sentence and the two tokens that should end up
    in the same coreference chain.
    """

    # Prefix prepended to every combination file name.
    # NOTE(review): the prefix stays "OC-OP-" even when ``embed_type`` selects
    # another folder (e.g. "OC-XP") — confirm this matches the dataset's naming.
    FILE_PREFIX = "OC-OP-"
    # Batch size handed to predict_coref.
    BATCH_SIZE = 128

    def __init__(self, base,  file_types, model, tokenizer):
        """Remember the dataset root, the file names to score and the model pair.

        Args:
            base: root directory of the dataset.
            file_types: iterable of combination file names (``RX-..-..-...txt``).
            model: coreference model forwarded to ``predict_coref``.
            tokenizer: tokenizer forwarded to ``predict_coref``.
        """
        self.base = base
        self.file_types = file_types
        self.model = model
        self.tokenizer = tokenizer

    @staticmethod
    def _read_examples(file_url):
        """Parse one dataset file into parallel lists of sentences and token pairs."""
        sentences, pairs = [], []
        with open(file_url, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                fields = raw_line.strip().split("\t")
                if fields == [""]:
                    # Blank lines carry no example; skip them instead of crashing.
                    continue
                if len(fields) < 3:
                    raise ValueError(
                        f"{file_url}:{line_no}: expected 3 tab-separated fields, got {len(fields)}"
                    )
                sentences.append(fields[0])
                pairs.append((fields[1], fields[2]))
        return sentences, pairs

    @staticmethod
    def _chain_mentions(document):
        """Render every coreference chain of one prediction as a list of mention strings."""
        return [[" ".join(mention.tokens) for mention in chain] for chain in document.coref_chains]

    @staticmethod
    def _pair_is_linked(chains, pair):
        """Return True when a single chain contains both tokens of ``pair`` as mentions."""
        return any(all(token in chain for token in pair) for chain in chains)

    def process_each_sent_combination_in_a_type(self, sentence_type, embed_type):
        """Score every combination file under ``<base>/<sentence_type>/<embed_type>/``.

        Args:
            sentence_type: name of the sentence-type folder.
            embed_type: name of the sub-folder inside ``sentence_type``.

        Returns:
            A list with one ``(sentence_type, embed_type, combination, accuracy)``
            tuple per entry of ``self.file_types``. ``combination`` is the last
            three dash-separated parts of the file stem; ``accuracy`` is the
            share of examples whose token pair was placed in one chain, rounded
            to 4 decimals, or NaN when the file holds no examples.

        Raises:
            FileNotFoundError: if a combination file does not exist.
            ValueError: if a non-blank line has fewer than three fields.
        """
        lst = []
        folder_url = os.path.join(self.base, sentence_type, embed_type)
        for combination in self.file_types:
            combination_text = "-".join(combination.split(".")[0].split("-")[-3:])
            file_url = os.path.join(folder_url, self.FILE_PREFIX + combination)

            text, tuples = self._read_examples(file_url)

            really_corefed = []
            if text:
                # Only call the model when there is something to predict.
                coref_out = predict_coref(
                    text,
                    model=self.model,
                    tokenizer=self.tokenizer,
                    batch_size=self.BATCH_SIZE,
                )
                really_corefed = [
                    self._pair_is_linked(self._chain_mentions(document), the_tuple)
                    for document, the_tuple in zip(coref_out, tuples)
                ]

            if really_corefed:
                accuracy = round(sum(really_corefed) / len(really_corefed), 4)
            else:
                # No examples: the ratio is undefined, so report NaN, not a ZeroDivisionError.
                accuracy = float("nan")

            lst.append((sentence_type, embed_type, combination_text, accuracy))

        return lst


def main(base, model, tokenizer,
         cache_path="/home/hyohyeongjang/syntax_finalterm/data.pickle",
         csv_dir="."):
    """Build the per-sentence-type accuracy tables and slice them into four groups.

    Steps:
      1. Load the raw scores from ``cache_path``; when that file is missing,
         compute them with ``Processor`` for every sentence type / command type
         and write the cache.
      2. For each sentence type, average the scores per (command type,
         combination), export the table to ``<csv_dir>/<sentence_type>.csv``
         and read that same file back as a flat frame.
      3. Split the combination label into the ``embeeding``, ``subj_quant`` and
         ``obj_quant`` columns and filter the frames into four groups.

    Args:
        base: dataset root handed to ``Processor``.
        model: coreference model handed to ``Processor``.
        tokenizer: tokenizer handed to ``Processor``.
        cache_path: pickle file holding the raw scores.
        csv_dir: directory the per-sentence-type CSV files are written to and
            read back from (defaults to the working directory).

    Returns:
        ``(exp_exp, quant_exp, exp_quant, quant_quant)``; each element is a
        3-tuple of DataFrames ordered as ``ProcessFile().sentence_type``.
    """
    files = ProcessFile()
    processor = Processor(base, files.sent_combination, model, tokenizer)

    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code embedded in the
        # file — only point cache_path at caches produced by this notebook.
        with open(cache_path, "rb") as f:
            result = pickle.load(f)
    else:
        # Cache miss: run the model over every sentence type / command type.
        result = {
            sentence_type: {
                command_type: processor.process_each_sent_combination_in_a_type(sentence_type, command_type)
                for command_type in files.command_type
            }
            for sentence_type in files.sentence_type
        }
        with open(cache_path, "wb") as f:
            pickle.dump(result, f)

    def build_flat_table(sentence_type):
        """Export the averaged table for one sentence type and reload it flat."""
        per_command = [
            pd.DataFrame(result[sentence_type][command_type])[[1, 2, 3]]
            for command_type in files.command_type
        ]
        # Rows become combinations, columns become command types.
        table = pd.concat(per_command, axis=0).groupby([1, 2]).mean().unstack().T

        # Read back exactly the file that was just written, so the returned
        # frames always reflect the scores loaded above.
        csv_path = os.path.join(csv_dir, sentence_type + ".csv")
        table.to_csv(csv_path, index=True)
        flat = pd.read_csv(csv_path).drop(columns="Unnamed: 0")

        # Column "2" holds the "<embed>-<subject>-<object>" combination label.
        parts = pd.DataFrame(flat['2'].map(lambda label: label.split("-")).values.tolist())
        # 'embeeding' (sic) is kept: it is part of the returned frames' schema.
        parts.columns = ['embeeding', "subj_quant", "obj_quant"]
        return pd.concat([parts, flat], axis=1).drop(columns="2")

    smo, ses, seo = (build_flat_table(sentence_type) for sentence_type in files.sentence_type)

    def clip(df, subj_values, obj_values):
        """Keep rows whose subject/object labels are in the given sets, sorted."""
        keep = df['subj_quant'].isin(subj_values) & df['obj_quant'].isin(obj_values)
        return df.loc[keep].sort_values(by=["subj_quant", "obj_quant", 'embeeding'])

    exp_tokens = ['john', 'he']
    subj_quant_tokens = ['a', 'the', 'every', 'some']
    obj_quant_tokens = ['a', 'the']

    files.set_files(smo, ses, seo)
    return (files.three_at_once(lambda df: clip(df, exp_tokens, exp_tokens)),
            files.three_at_once(lambda df: clip(df, subj_quant_tokens, exp_tokens)),
            files.three_at_once(lambda df: clip(df, exp_tokens, obj_quant_tokens)),
            files.three_at_once(lambda df: clip(df, subj_quant_tokens, obj_quant_tokens)))
    
    

    
    

In [115]:
%%time 

# Notebook entry point: load the pretrained coreference model and tokenizer,
# then hand them to main() and unpack its four result groups.
if __name__ == "__main__":
    # Dataset root forwarded to main() as its `base` argument.
    base = "/home/hyohyeongjang/syntax_finalterm/syntax_dataset_revised/"
    # Checkpoint name passed to BertForCoreferenceResolution.from_pretrained.
    model_checkpoint = "compnet-renard/bert-base-cased-literary-coref"
    coref_model = BertForCoreferenceResolution.from_pretrained(model_checkpoint)
    # The tokenizer is loaded from a different checkpoint name than the model.
    tokenizer_checkpoint = "bert-base-cased"
    coref_tokenizer = BertTokenizerFast.from_pretrained(tokenizer_checkpoint)

    
    # main() returns four groups; each is a 3-tuple of DataFrames.
    exp_exp, quant_exp, exp_quant, quant_quant = main(base, coref_model, coref_tokenizer)
    




loading configuration file config.json from cache at /home/hyohyeongjang/.cache/huggingface/hub/models--compnet-renard--bert-base-cased-literary-coref/snapshots/5d70bdeda174f8e3dde073b2d5c0f528611ca135/config.json
Model config BertForCoreferenceResolutionConfig {
  "_name_or_path": "bert-base-cased",
  "antecedents_nb": 350,
  "architectures": [
    "BertForCoreferenceResolution"
  ],
  "attention_probs_dropout_prob": 0.3,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.3,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "max_span_size": 7,
  "mention_loss_coeff": 0.1,
  "mention_scorer_dropout": 0.1,
  "mention_scorer_hidden_size": 3000,
  "mentions_per_token": 0.4,
  "mentions_per_tokens": 0.4,
  "metadatas_features_size": 20,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0

CPU times: user 846 ms, sys: 422 ms, total: 1.27 s
Wall time: 1.77 s


In [131]:
# Display the second of the three frames in the quant/quant group.
quant_quant[1]

Unnamed: 0,embeeding,subj_quant,obj_quant,OC-OP,OC-XP,XC-OP,XC-XP
0,EY,a,a,0.3,0.1,0.16,0.3
24,RY,a,a,0.21,0.08,0.1,0.31
3,EY,a,the,0.47,0.18,0.44,0.73
27,RY,a,the,0.53,0.31,0.3,0.61
4,EY,every,a,0.0,0.0,0.0,0.14
28,RY,every,a,0.0,0.0,0.0,0.03
7,EY,every,the,0.0,0.04,0.01,0.42
31,RY,every,the,0.0,0.03,0.0,0.1
16,EY,some,a,0.07,0.0,0.0,0.27
40,RY,some,a,0.0,0.01,0.0,0.25


In [113]:
# Print the kernel's current working directory.
!pwd

/home/hyohyeongjang/dependency_bert


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
