# Third dataset to look at:
* ANCORE (coreference in French)
* Task description and data indications available at https://hal.archives-ouvertes.fr/hal-01075679/document

Note. Files are distributed into sets via a dictionary defined in ancor_split.py
**The files in CoNLL format are found in the following location.**
    
* Available at sftp://sfeirj@decore0.imag.fr/home/getalp/dinarelm/work/data/CoreferenceResolution/ANCORE-CoNLL-Fomat

***Used dataset: development, test and training sets for ANCORE under CoNLL format***

For dev : 39 documents and 39 parts.

For test : 108 documents and 108 parts.

For train : 258 documents and 258 parts.

For dev parts: mean length: 1572.8461538461538 , max length: 15306 , 61341 words.

For test parts: mean length: 1387.537037037037 , max length: 15306 , 149854 words.

For train parts: mean length: 1157.0813953488373 , max length: 17877 , 298527 words.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

**Every row doesn't have the same number of columns (see documentation).**

In [2]:
sets = ["dev", "test", "train"]

In [5]:
#test connexion with server
path = "/run/user/71447/gvfs/sftp:host=decore0.imag.fr,user=sfeirj/home/getalp/dinarelm/work/data/CoreferenceResolution/2012-CoNLL-SharedTask_fromLoic/dev/source/a2e_0000.v4_gold_conll"

with open(path, 'r') as f:
    line = f.readline()
    print(line)
    line = f.readline()
    print(line.split())

#begin document (wb/a2e/00/a2e_0000); part 000

['wb/a2e/00/a2e_0000', '0', '0', 'Celebration', 'NN', '(TOP(S(NP*', '-', '-', '-', '-', '*', '(ARG0*', '-']


## Build that dataframe

### Group files belonging to the same subset together

In [None]:
ancore_path = "/run/user/71447/gvfs/sftp:host=decore0.imag.fr,user=sfeirj/home/getalp/dinarelm/work/data/CoreferenceResolution/ANCOR-CoNLL-Format/data/"
destin_path = "/home/getalp/sfeirj/Work/data/structured_ANCORE/"

In [65]:
ancor_split = {
    "source": "https://sharedocs.huma-num.fr/wl/?id=94jxC3CooJs84JJxfMTZ3jEPxL7E8F65",
    "subcorpora": {
        "train": [
            "004_-1.tei","004_-3.tei","005_C-1.tei","005_C-2.tei","005_C-3.tei","006_C-3.tei","007_C-1.tei","007_C-2.tei","007_C-3.tei","008_C-2.tei","009_C-2.tei","009_C-3.tei","009_C-4.tei","010_C-2.tei","010_C-4.tei","010_C-5.tei","012_C-2.tei","013_C-1.tei","013_C-2.tei","013_C-3.tei","014_C-3.tei","014_C-4.tei","014_C-6.tei","015_C-1.tei","015_C-2.tei","018_C-2.tei","019_C-1.tei","019_C-2.tei","019_C-3.tei","020_C-1.tei","021_C-1.tei","021_C-2.tei","021_C-3.tei","021_C-4.tei","021_C-5.tei","021_C-6.tei","023_C.tei","024_C-3.tei","025_C-1.tei","025_C-2.tei","025_C-3.tei","026_C-2.tei","026_C-4.tei","026_C-5.tei","029_C-1.tei","029_C-2.tei","029_C-3.tei","029_C-4.tei","029_C-5.tei","030_C-3.tei","030_C-4.tei","078_C-2.tei","078_C-4.tei","079_C-1.tei","079_C-2.tei","096_C-2.tei","096_C-3.tei","107_C-1.tei","131_C-1.tei","131_C-2.tei","132_C-3.tei","133_C-1.tei","133_C-2.tei","133_C-3.tei","133_C-4.tei","1AG0368.tei","1AG0390.tei","1AG0391.tei","1AG0502.tei","1AG0508.tei","1AG0509.tei","1AG0511.tei","1AG0513.tei","1AG0521.tei","1AG0523.tei","1AG0527.tei","1AG0529.tei","1AG0531.tei","1AG0534.tei","1AG0543.tei","1AG0545.tei","1AG0549.tei","1AG0552.tei","1AG0553.tei","1AG0554.tei","1AG0555.tei","1AG0558.tei","1AG0563.tei","1AG0564.tei","1AG0565.tei","1AG0566.tei","1AG0570.tei","1AG0576.tei","1AG0577.tei","1AG0585.tei","1AG0618.tei","1AG0620.tei","1AP0099.tei","1AP0115.tei","1AP0119.tei","1AP0120.tei","1AP0129.tei","1AP0131.tei","1AP0134.tei","1AP0137.tei","1AP0142.tei","1AP0143.tei","1AP0149.tei","1AP0152.tei","1AP0168.tei","1AP0170.tei","1AP0171.tei","1AP0179.tei","1AP0182.tei","1AP0197.tei","1AP0204.tei","1AP0206.tei","1AP0217.tei","1AP0218.tei","1AP0219.tei","1AP0226.tei","1AP0228.tei","1AP0229.tei","1AP0230.tei","1AP0233.tei","1AP0234.tei","1AP0239.tei","1AP0242.tei","1AP0243.tei","1AP0245.tei","1AP0246.tei","1AP0248.tei","1AP0270.tei","1AP0273.tei","1AP0274.tei","1AP0275.tei","1AP0279.tei","1AP0286.tei","1AP0289.tei","1AP0294.tei","1AP0296.tei","1AP0308.tei","1AP0309.tei","1AP0310.tei","1AP0316.tei","1AP0317.tei","1AP0319.tei","1AP0322.tei","1AP0323.tei","1AP0324.tei","1AP0325.tei","1AP0329.tei","1AP0341.tei","1AP0366.tei","1AP0381.tei","1AP0384.tei","1AP0417.tei","1AP0496.tei","1AP0506.tei","1AP0507.tei","1AP0544.tei","1AP0596.tei","1AP0605.tei","1AP0606.tei","1AP0608.tei","1NR0591.tei","1PF0416.tei","1PF0419.tei","1PF0420.tei","1PF0421.tei","1PF0423.tei","1PF0424.tei","1PF0427.tei","1PF0429.tei","1PF0432.tei","1PF0439.tei","1PF0442.tei","1PF0444.tei","1PF0445.tei","1PF0447.tei","1PF0448.tei","1PF0449.tei","1PF0450.tei","1PF0454.tei","1PF0455.tei","1PF0457.tei","1PF0462.tei","1PF0463.tei","1PF0464.tei","1PF0468.tei","1PF0470.tei","1PF0472.tei","1PF0474.tei","1PF0479.tei","1PF0487.tei","1PF0490.tei","1PF0491.tei","1PF0494.tei","1PF0572.tei","1PF0638.tei","1PF0639.tei","1PF0641.tei","1PF0644.tei","1PF0646.tei","1PF0648.tei","1PF0650.tei","1PF0654.tei","1SB0087.tei","1SB0088.tei","1SB0111.tei","1SB0124.tei","1SB0126.tei","1SB0164.tei","1SB0167.tei","1SB0172.tei","1SB0175.tei","1SB0177.tei","1SB0185.tei","1SB0190.tei","1SB0208.tei","1SB0210.tei","1SB0211.tei","1SB0213.tei","1SB0214.tei","1SB0223.tei","1SB0231.tei","1SB0252.tei","1SB0253.tei","1SB0264.tei","1SB0265.tei","1SB0266.tei","1SB0284.tei","1SB0300.tei","1SB0302.tei","1SB0311.tei","1SB0320.tei","1SB0338.tei","1SB0339.tei","1SB0344.tei","1SB0355.tei","1SB0379.tei","1SB0396.tei","1SB0397.tei","1SB0403.tei","1SB0405.tei","1SB0408.tei","1SB0411.tei","1SO0623.tei","201_C.tei","215_C.tei","217_C.tei","223_C.tei","2AG0361.tei","2AG0363.tei","2AG0495.tei","2AG0519.tei","2AG0535.tei","2AG0546.tei","2AG0561.tei","2AG0578.tei","2AG0584.tei","2AG0619.tei","2AP0144.tei","2AP0180.tei","2AP0195.tei","2AP0225.tei","2AP0238.tei","2AP0247.tei","2AP0251.tei","2AP0259.tei","2AP0261.tei","2AP0280.tei","2AP0287.tei","2AP0292.tei","2AP0295.tei","2AP0356.tei","2AP0505.tei","2AP0594.tei","2AP0595.tei","2AP0598.tei","2NR0593.tei","2PF0415.tei","2PF0425.tei","2PF0437.tei","2SB0016.tei","2SB0025.tei","2SB0165.tei","2SB0212.tei","2SB0315.tei","3AP0272.tei","3SB0011.tei","4SB0209.tei","542_C-3.tei","5AP0200.tei","CO2_ESLO_003_C.tei"
        ],
        "dev": [
            "004_-4.tei","006_C-2.tei","008_C-3.tei","010_C-3.tei","012_C-3.tei","014_C-5.tei","020_C-3.tei","024_C-1.tei","026_C-3.tei","030_C-1.tei","132_C-1.tei","1AG0499.tei","1AG0536.tei","1AG0624.tei","1AP0096.tei","1AP0117.tei","1AP0133.tei","1AP0146.tei","1AP0196.tei","1AP0227.tei","1AP0240.tei","1AP0255.tei","1PF0418.tei","1PF0438.tei","1PF0460.tei","1PF0467.tei","1PF0649.tei","1SB0123.tei","1SB0186.tei","1SB0289.tei","1SB0633.tei","1SO0612.tei","2AG0526.tei","2AP0205.tei","2AP0307.tei","2PF0413.tei","2SB0166.tei","542_C-2.tei","CO2_ESLO_002_C.tei"
        ],
        "test": [
            "004_-2.tei","004_-4.tei","005_C-1.tei","006_C-1.tei","006_C-2.tei","008_C-1.tei","008_C-3.tei","010_C-1.tei","010_C-3.tei","012_C-1.tei","012_C-3.tei","014_C-1.tei","014_C-5.tei","018_C-1.tei","020_C-2.tei","020_C-3.tei","024_C-1.tei","024_C-4.tei","026_C-1.tei","026_C-3.tei","030_C-1.tei","030_C-2.tei","078_C-3.tei","096_C-1.tei","132_C-1.tei","132_C-2.tei","1AG0141.tei","1AG0154.tei","1AG0155.tei","1AG0157.tei","1AG0359.tei","1AG0362.tei","1AG0364.tei","1AG0365.tei","1AG0367.tei","1AG0499.tei","1AG0536.tei","1AG0624.tei","1AP0005.tei","1AP0007.tei","1AP0029.tei","1AP0033.tei","1AP0041.tei","1AP0054.tei","1AP0056.tei","1AP0060.tei","1AP0061.tei","1AP0069.tei","1AP0071.tei","1AP0073.tei","1AP0074.tei","1AP0078.tei","1AP0079.tei","1AP0080.tei","1AP0083.tei","1AP0084.tei","1AP0089.tei","1AP0096.tei","1AP0117.tei","1AP0133.tei","1AP0146.tei","1AP0196.tei","1AP0227.tei","1AP0240.tei","1AP0255.tei","1NR0592.tei","1PF0048.tei","1PF0050.tei","1PF0052.tei","1PF0159.tei","1PF0162.tei","1PF0305.tei","1PF0306.tei","1PF0358.tei","1PF0392.tei","1PF0393.tei","1PF0414.tei","1PF0418.tei","1PF0438.tei","1PF0460.tei","1PF0467.tei","1PF0649.tei","1SB0002.tei","1SB0009.tei","1SB0018.tei","1SB0022.tei","1SB0024.tei","1SB0027.tei","1SB0028.tei","1SB0030.tei","1SB0031.tei","1SB0042.tei","1SB0043.tei","1SB0123.tei","1SB0186.tei","1SB0289.tei","1SB0633.tei","1SO0488.tei","1SO0612.tei","216_C.tei","2AG0526.tei","2AG0610.tei","2AP0014.tei","2AP0034.tei","2AP0055.tei","2AP0058.tei","2AP0092.tei","2AP0205.tei","2AP0307.tei","2PF0049.tei","2PF0413.tei","2SB0001.tei","2SB0008.tei","2SB0166.tei","3AP0249.tei","3SB0122.tei","542_C-1.tei","542_C-2.tei","CO2_ESLO_001_C.tei","CO2_ESLO_002_C.tei"
        ]}}

In [None]:
for s in sets:
    for filename in ancor_split["subcorpora"][s]:
        correct_filename = filename[:-3]+"conll"
        copyfile(ancore_path+correct_filename, "{}{}/{}".format(destin_path,s,correct_filename))

### Get the data

In [3]:
from collections import defaultdict, Counter
import os

In [5]:
parent_path = "/run/user/71447/gvfs/sftp:host=decore0.imag.fr,user=sfeirj/home/getalp/sfeirj/data/ANCORE/"
dataframes_dict = {}
datalists_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
data_list = []

for i in range(3):
    intermediate_path = parent_path + sets[i] + "/"

    for filename in tqdm(os.listdir(intermediate_path)):

        with open(intermediate_path + filename, 'r') as f:
            iter_readlines = iter(f.readlines())
            for line in iter_readlines:
                if (line != "\n") and ("#end document" not in line) and ("#begin document" not in line): #valid line
                    splitted_line = line[:-1].split()
                    useful_line = splitted_line[:11] + [splitted_line[-1]]
                    datalists_dict[i][useful_line[0]][useful_line[1]].append(useful_line)
                    #datalists_dict[set][doc_id][part_id] is a list of words
                    data_list.append(useful_line)
        #print("***** Finished reading " + filename)
        
    #print(len(data_list))
    #print(data_list[:5])
    #data[i] = pd.DataFrame()
    print("Finished reading " + sets[i])
    dataframes_dict[i] = pd.DataFrame(data_list, columns=['col' + str(elem) for elem in range(12)])
    print("Finished building dataframe for " + sets[i])
    data_list = []

100%|██████████| 39/39 [00:00<00:00, 40.13it/s]
  2%|▎         | 3/120 [00:00<00:04, 27.55it/s]

Finished reading dev
Finished building dataframe for dev


100%|██████████| 120/120 [00:02<00:00, 48.82it/s]


Finished reading test
Finished building dataframe for test


100%|██████████| 295/295 [00:04<00:00, 60.11it/s] 


Finished reading train
Finished building dataframe for train


In [6]:
dataframes_dict[0][:40]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11
0,ANCOR-2AP0307,0,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-
1,ANCOR-2AP0307,0,1,monsieur,N,*))),-,-,-,-,-,(1335791777248)
2,ANCOR-2AP0307,0,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-
3,ANCOR-2AP0307,0,1,ce,N,*,-,-,-,-,-,-
4,ANCOR-2AP0307,0,2,que,N,*,-,-,-,-,-,-
5,ANCOR-2AP0307,0,3,je,N,*,-,-,-,-,-,(1335791804829)
6,ANCOR-2AP0307,0,4,voudrais,N,*,-,-,-,-,-,-
7,ANCOR-2AP0307,0,5,c',N,*,-,-,-,-,-,-
8,ANCOR-2AP0307,0,6,est,N,*,-,-,-,-,-,-
9,ANCOR-2AP0307,0,7,e,N,*,-,-,-,-,-,-


In [8]:
#documents and parts per set

for i in range(3):
    print("For", sets[i], ":", len(datalists_dict[i]), "documents and", sum([len(val) for val in datalists_dict[i].values()]), "parts.")

For dev : 39 documents and 39 parts.
For test : 108 documents and 108 parts.
For train : 258 documents and 258 parts.


In [9]:
#words per part

for i in range(3):
    length_lists = [len(part) for doc in datalists_dict[i].values() for part in doc.values()]
    print("For", sets[i], "parts: mean length:", np.mean(length_lists), ", max length:", max(length_lists), ",", sum(length_lists), "words.")

For dev parts: mean length: 1572.8461538461538 , max length: 15306 , 61341 words.
For test parts: mean length: 1387.537037037037 , max length: 15306 , 149854 words.
For train parts: mean length: 1157.0813953488373 , max length: 17877 , 298527 words.


**An example call of a document part in data: datalists_dict[0]["ANCOR-1AG0499"]["0"]**

(I guess)

**No document has multiple parts in this dataset:**

In [10]:
for d in datalists_dict[0].keys():
    if len(datalists_dict[0][d]) > 1:
        print(d)
        break

### Get document genre
(I don't know if talking about genre makes sense in ANCORE)

In [14]:
dataframes_dict[0][162:]

Unnamed: 0,col0,col0.1,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11
162,ANCOR-2AP0307,AN,000,1,accord,N,*))),-,-,-,-,-,-
163,ANCOR-2AP0307,AN,000,0,alors,N,(TOP(S(NP*,-,-,-,-,-,-
164,ANCOR-2AP0307,AN,000,1,regardez,N,*,-,-,-,-,-,-
165,ANCOR-2AP0307,AN,000,2,elle,N,*,-,-,-,-,-,(1337441396413)
166,ANCOR-2AP0307,AN,000,3,doit,N,*,-,-,-,-,-,-
167,ANCOR-2AP0307,AN,000,4,être,N,*,-,-,-,-,-,-
168,ANCOR-2AP0307,AN,000,5,indiquée,N,*,-,-,-,-,-,-
169,ANCOR-2AP0307,AN,000,6,je,N,*,-,-,-,-,-,(1337441411045)
170,ANCOR-2AP0307,AN,000,7,pense,N,*,-,-,-,-,-,-
171,ANCOR-2AP0307,AN,000,8,la,N,*,-,-,-,-,-,(1365667811651


In [13]:
for i in range(3):
    dataframes_dict[i].insert(1, "col0.1", pd.Series([doc_id[:2] for doc_id in dataframes_dict[i]["col0"]]))

### Save dataframes

In [15]:
data_parent_path = "../../data/structured_ANCORE/"

for i in range(3):
    data_path = data_parent_path + sets[i] + ".csv"
    dataframes_dict[i].to_csv(data_path)

### Text visualization

In [16]:
for i in range(3):
    for doc in datalists_dict[i].keys(): #doc is a document id
        print("**** doc", doc)
        for part in datalists_dict[i][doc].keys(): #part is a document part id
            print("****** part", part)
            print(" ".join([word[3] for word in datalists_dict[i][doc][part]]))
        print("\n")

**** doc ANCOR-2AP0307
****** part 000
bonjour monsieur bonjour ce que je voudrais c' est e les e brochures sur les gîtes de France les brochures sur les gîtes e ce que je souhaiterai c' est avoir un numéro de téléphone des gîtes de France alors alors soit dans le département ou si vous voulez nous on a e les le guide des gîtes ruraux du département d' Isère mais c' est un guide en vente hein qui coûte trente francs sinon après il faut vous adresser de toute façon c' est la chambre d' agriculture qui nous les fournit hein sinon après voir avec eux directement chambre d' agriculture ah d' accord pour savoir s' ils ont pas je sais pas je sais pas si à titre gracieux ils vous donneront quelque chose mais ce que je souhaite c' est avoir un numéro de téléphone pour pouvoir contacter les des gîtes parce que la Chambre d' Agriculture passez par la Chambre d' Agriculture un organisme quoi d' accord alors regardez elle doit être indiquée je pense la Chambre d' Agriculture voilà Chambre d' Agric

## Coreference resolution interpretation

### POS exploration

Part of this exploration uses `annotation_to_bio_simple()` function which is defined later on.

In [17]:
#a look at pronouns with modulable parameters
dataframes_dict[0][(dataframes_dict[0]['col4'] == "WP$") 
                   #& (dataframes_dict[0]['col3'] == "those")
                   #& (dataframes_dict[0]['col12'] == "B")
                   & (dataframes_dict[0]['col12'] != "O")
                   #& ~(dataframes_dict[0]['col11'].str.contains(')', regex=False))
                   #& (dataframes_dict[0]['col11'].str.contains('(', regex=False))
                   #& pd.Series(np.concatenate((pd.Series(list((dataframes_dict[0]["col3"] == "who")[1:])).values, [(dataframes_dict[0]["col3"] == "who")[0]]))) #check whether next word is "who"
                  ]

KeyError: 'col12'

In [20]:
#a closer look at one of the extracted pronoun occurrences from last cell
idx = 1500
dataframes_dict[0][idx-5 : idx+10][["col3", "col4", "col11"]]

Unnamed: 0,col3,col4,col11
1495,on,N,(1362146480746)
1496,reçoit,N,-
1497,il,N,(1362146488421)
1498,faut,N,-
1499,la,N,-
1500,la,N,(1362146495722)
1501,reconnaître,N,-
1502,il,N,(1362146500729)
1503,faut,N,-
1504,la,N,-


In [103]:
Counter(dataframes_dict[0]["col4"])

Counter({'NNP': 12611,
         'VBD': 5369,
         ',': 6878,
         'CD': 2807,
         'NNS': 8179,
         'JJ': 9237,
         'WP': 846,
         'VBN': 3340,
         'NN': 20021,
         'CC': 4891,
         'VBG': 2852,
         'IN': 17595,
         'DT': 14757,
         '.': 8149,
         'PRP': 7881,
         'VBZ': 4109,
         'PRP$': 1783,
         'POS': 1028,
         'RB': 7661,
         'WDT': 692,
         'JJR': 399,
         'HYPH': 934,
         'RBS': 105,
         '-LRB-': 193,
         '-RRB-': 192,
         'MD': 1832,
         'VB': 5587,
         'NNPS': 408,
         'VBP': 3812,
         'TO': 2364,
         'PDT': 156,
         '``': 860,
         "''": 903,
         ':': 534,
         'EX': 308,
         'WP$': 22,
         'WRB': 660,
         'LS': 19,
         'RP': 638,
         '$': 163,
         'UH': 1710,
         'FW': 78,
         'SYM': 19,
         'JJS': 221,
         'RBR': 199,
         'ADD': 19,
         'NFP': 34,
         'X

In [104]:
for POS in set(dataframes_dict[0]["col4"]):
    print(list(dataframes_dict[0][dataframes_dict[0]['col4'] == POS].iloc[0])[3:5])

['there', 'EX']
['would', 'MD']
['to', 'TO']
['-', 'HYPH']
['--', ':']
['39', 'CD']
['mitbbs.com', 'ADD']
['$', '$']
["''", "''"]
['years', 'NNS']
['He', 'PRP']
['How', 'WRB']
['Yes', 'UH']
['1', 'LS']
['all', 'PDT']
['up', 'RP']
['etc', 'FW']
['succeeds', 'VBZ']
['-RRB-', '-RRB-']
['been', 'VBN']
['``', '``']
['which', 'WDT']
["'s", 'POS']
['old', 'JJ']
['operating', 'VBG']
['the', 'DT']
['and', 'CC']
[',', ',']
['more', 'JJR']
['least', 'RBS']
['for', 'IN']
['Sens.', 'NNPS']
['Drug', 'NNP']
['whose', 'WP$']
['most', 'JJS']
['more', 'RBR']
['thed', 'XX']
['/', 'SYM']
['who', 'WP']
['said', 'VBD']
['his', 'PRP$']
['have', 'VB']
['again', 'RB']
['-LRB-', '-LRB-']
['--', 'NFP']
['.', '.']
['president', 'NN']
['remain', 'VBP']


In [106]:
#info from http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
#https://cs.nyu.edu/grishman/jet/guide/PennPOS.html
noun_tags = ["NN", "NNS", "NNP", "NNPS", "FW"]
pron_tags = ["PRP", "PRP$", "WP", "WP$", "DT"]

In [107]:
total_counter = Counter([w for i in range(3) for w in dataframes_dict[i]["col4"]]) #all POS tags for all sets

In [108]:
total_counter

Counter({'NNP': 123008,
         'VBD': 51877,
         ',': 68685,
         'CD': 27842,
         'NNS': 81530,
         'JJ': 92602,
         'WP': 9293,
         'VBN': 32511,
         'NN': 200640,
         'CC': 49874,
         'VBG': 28007,
         'IN': 175344,
         'DT': 144054,
         '.': 84534,
         'PRP': 80406,
         'VBZ': 41888,
         'PRP$': 18240,
         'POS': 9877,
         'RB': 74700,
         'WDT': 7400,
         'JJR': 4360,
         'HYPH': 9983,
         'RBS': 933,
         '-LRB-': 2060,
         '-RRB-': 2084,
         'MD': 20049,
         'VB': 58577,
         'NNPS': 4310,
         'VBP': 38172,
         'TO': 24207,
         'PDT': 1536,
         '``': 8700,
         "''": 9175,
         ':': 5296,
         'EX': 2981,
         'WP$': 246,
         'WRB': 6883,
         'LS': 252,
         'RP': 6510,
         '$': 1832,
         'UH': 14049,
         'FW': 780,
         'SYM': 295,
         'JJS': 2277,
         'RBR': 2133,
        

In [109]:
counters = [Counter(dataframes_dict[i]["col4"]) for i in range(3)] #POS tags for each set

In [110]:
for tags in [noun_tags, pron_tags]:
    for j in range(len(tags)):
        print(tags[:j+1], " ----> " * (4-j), [sum([counters[i][tag] for tag in tags[:j+1]]) for i in range(3)])

['NN']  ---->  ---->  ---->  ---->  [20021, 21216, 159403]
['NN', 'NNS']  ---->  ---->  ---->  [28200, 29574, 224396]
['NN', 'NNS', 'NNP']  ---->  ---->  [40811, 42264, 322103]
['NN', 'NNS', 'NNP', 'NNPS']  ---->  [41219, 42723, 325546]
['NN', 'NNS', 'NNP', 'NNPS', 'FW']  [41297, 42820, 326151]
['PRP']  ---->  ---->  ---->  ---->  [7881, 8067, 64458]
['PRP', 'PRP$']  ---->  ---->  ---->  [9664, 10002, 78980]
['PRP', 'PRP$', 'WP']  ---->  ---->  [10510, 10804, 86625]
['PRP', 'PRP$', 'WP', 'WP$']  ---->  [10532, 10842, 86811]


### Parse trees exploration

In [21]:
c = Counter([p.replace(")", "") for tree in list(dataframes_dict[0]["col5"]) for p in tree.split("(")])

In [22]:
for i,j in c.most_common():
    print(j, i)

56957 *
4384 
4384 TOP
4384 S
4384 NP*


In [23]:
dataframes_dict[0][1649:]

Unnamed: 0,col0,col0.1,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11
1649,ANCOR-542,AN,000,32,un,N,*,-,-,-,-,-,(1362146698444
1650,ANCOR-542,AN,000,33,costume,N,*,-,-,-,-,-,1362146698444)
1651,ANCOR-542,AN,000,34,ça,N,*,-,-,-,-,-,-
1652,ANCOR-542,AN,000,35,c',N,*,-,-,-,-,-,-
1653,ANCOR-542,AN,000,36,est,N,*,-,-,-,-,-,-
1654,ANCOR-542,AN,000,37,c',N,*,-,-,-,-,-,-
1655,ANCOR-542,AN,000,38,est,N,*,-,-,-,-,-,-
1656,ANCOR-542,AN,000,39,un,N,*,-,-,-,-,-,-
1657,ANCOR-542,AN,000,40,peu,N,*,-,-,-,-,-,-
1658,ANCOR-542,AN,000,41,normal,N,*,-,-,-,-,-,-


In [274]:
re.findall("h+", "hh h")

['hh', 'h']

In [282]:
dataframes_dict[0][(dataframes_dict[0]["col12"] == "B")
                   & 
                   (dataframes_dict[0]["col5"].str.contains("\(VP\*[)]*$", regex=True))
                   &
                   (dataframes_dict[0]["col4"].str.contains("VB", regex=False))
                  ]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13
813,nw/wsj/24/wsj_2412,0,3,encourage,VB,(VP*,encourage,02,1,-,*,(29),B,B-N
2481,nw/xinhua/03/chtb_0300,0,27,recalling,VBG,(NP(S(VP(VP*,recall,01,-,-,*,(7,B,B-N
2796,nw/wsj/24/wsj_2430,0,20,adding,VBG,(VP*,add,02,1,-,*,(4),B,B-N
3046,nw/xinhua/02/chtb_0280,0,28,met,VBD,(VP(VP*,meet,03,2,-,*,(6),B,B-N
4876,nw/wsj/00/wsj_0089,0,13,fallen,VBN,(VP*,fall,01,2,-,*,(27),B,B-N
4895,nw/wsj/00/wsj_0089,0,8,talk,VB,(VP*,talk,01,1,-,*,(4),B,B-N
5288,nw/wsj/00/wsj_0089,0,21,remains,VBZ,(VP(VP*,remain,01,1,-,*,(10),B,B-N
5578,wb/c2e/00/c2e_0030,0,8,passed,VBD,(VP*,pass,-,-,_conjee_,*,(14),B,B-N
6023,wb/c2e/00/c2e_0030,0,4,opened,VBD,(VP*,open,01,4,_conjee_,*,(2),B,B-N
8450,wb/eng/00/eng_0000,1,33,become,VBP,(VP*,become,01,1,_Alain_DeWitt_,*,(121),B,B-N


In [275]:
idx = 154881
dataframes_dict[0][idx - 5 : idx + 25][["col3", "col4", "col5", "col11"]]

Unnamed: 0,col3,col4,col5,col11
154876,in,IN,(PP*,-
154877,your,PRP$,(NP*,-
154878,community,NN,*))))),-
154879,talking,VBG,(VP*))))),-
154880,/-,.,*)),-
154881,Report,VB,(TOP(S(VP*,(148)
154882,every,DT,(NP(NP*,-
154883,suspicious,JJ,*,-
154884,behavior,NN,*),-
154885,there,EX,(SBAR(S(NP*),-


### Functions

In [38]:
#info from http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
#https://cs.nyu.edu/grishman/jet/guide/PennPOS.html
noun_tags = ["NN", "NNS", "NNP", "NNPS", "FW"]
pron_tags = ["PRP", "PRP$", "WP", "WP$", "DT"]
verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]

In [39]:
#COMBINED IN annotation_to_bio()
#-------------------------------
def annotation_to_bio_simple(l):
    #function which translates last column annotation for coreference resolution to BIO tagging
    #INPUT:  l: list of annotations for a single part of document
    #OUTPUT: l_bio: list of BIO tags of the same length than l
    #note that BIO tagging only considers maximal mentions
    
    l_bio = list(l)
    i = 0
    is_in_max_mention = False
    nb_stacked_mentions = 0
    
    while i < len(l):
        
        #look for beginning
        if l[i] == "-": #didn't find beginning
            l_bio[i] = "O"
            i += 1
        
        else: #found beginning
            is_in_max_mention = True
            nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
            l_bio[i] = "B"
            i += 1
            
            if nb_stacked_mentions == 0: #found one-word mention
                is_in_max_mention = False
            
            else: #at least one opened (not closed) mention
            
                while is_in_max_mention:

                    nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
                    l_bio[i] = "I"
                    if nb_stacked_mentions == 0:
                        is_in_max_mention = False
                    
                    i += 1
    
    return l_bio

In [40]:
#COMBINED IN annotation_to_bio()
#-------------------------------
def annotation_to_bio_pos(l, pos):
    #function which translates last column annotation for coreference resolution to BIO-POS tagging
    #INPUT:  l: list of annotations for a single part of document
    #INPUT:  pos: list of part-of-speech indications
    #OUTPUT: l_bio: list of BIO tags of the same length than l
    #note that BIO tagging only considers maximal mentions
    
    l_bio = list(l)
    i = 0
    is_in_max_mention = False
    nb_stacked_mentions = 0
    pos_type = ""
    
    while i < len(l):
        
        #look for beginning
        
        if l[i] == "-": #didn't find beginning
            l_bio[i] = "O"
            i += 1
        
        else: #found beginning
            is_in_max_mention = True
            nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
            
            #find POS type
            if pos[i] == "PRP":
                pos_type = "P"
            elif (pos[i] in ["PRP$", "DT"]) and (nb_stacked_mentions == 0): #found one-word mention starting with PRP$ or DT
                pos_type = "P"
            else:
                pos_type = "N"
            
            l_bio[i] = "B-" + pos_type
            i += 1
            
            if nb_stacked_mentions == 0: #found one-word mention
                is_in_max_mention = False
            
            else: #at least one opened (not closed) mention
            
                while is_in_max_mention:

                    nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
                    l_bio[i] = "I-" + pos_type
                    if nb_stacked_mentions == 0:
                        is_in_max_mention = False
                    
                    i += 1
    
    return l_bio

In [41]:
def annotation_to_bio(*args):
    #function which translates last column annotation for coreference resolution to BIO or BIO-POS tagging
    #INPUT:  l: list of annotations for a single part of document
    #INPUT?: pos: list of part-of-speech indications
    #OUTPUT: l_bio: list of BIO tags of the same length than l
    #note that BIO tagging strategy depends on number of arguments
    #note that BIO tagging only considers maximal mentions
    
    
    def bio_strategy_inside_mention(beg, end, args):
        bio = ["B"] + ["I"] * (end - beg - 1)
        if len(args) >= 2: #strategy: BIO + POS / BIO + POS + parse
            #find tag type
            pos = args[1]
            if (len(args) == 3) and (bool(re.search("\(VP\*[)]*$", args[2][beg]))):
                tag_type = "V"
            elif pos[beg] == "PRP":
                tag_type = "P"
            elif (pos[beg] in ["PRP$", "DT"]) and (len(pos) == 1): #found one-word mention starting with PRP$ or DT
                tag_type = "P"
            else:
                tag_type = "N"
            #set tags
            bio[0] += "-" + tag_type
            j = beg + 1
            while j < end:
                bio[j - beg] += "-" + tag_type
                j += 1
        return bio
    
    
    l = args[0]
    l_bio = list(l)
    i = 0
    is_in_max_mention = False
    nb_stacked_mentions = 0
    pos_type = ""
    
    while i < len(l):
        
        #look for beginning

        if l[i] == "-": #didn't find beginning
            l_bio[i] = "O"
            i += 1
        
        else: #found beginning
            
            is_in_max_mention = True
            j = i 
            #j: start of the mention
            #i: end of the mention
            
            while is_in_max_mention:
                nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
                is_in_max_mention = (nb_stacked_mentions != 0)
                i += 1
                
            l_bio[j:i] = bio_strategy_inside_mention(j, i, args) #only replace tags for the mention's words
            
    return l_bio

In [42]:
def annotation_to_dict(l, l_docs, l_parts, l_strings):
    #function which translates last column annotation for coreference resolution to dictionary of coreferent mentions
    #INPUT: l:   list of annotations for a single part of document
    #INPUT: s: set you are working on
    #OUTPUT: datacorefids_dict: hierarchical dictionary containing for each entity its coreferent mentions
    #OUTPUT: datacorefstrings_dict: hierarchical dictionary containing for each entity its coreferent mentions
    #note that BIO tagging only considers maximal mentions
    
    datacorefids_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    datacorefstrings_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    i = 0
    is_in_mention = False
    entity_id = -1
    nb_stacked_mentions = 0
    beginnings_re = '\([0-9]+'
    numbers_re = '[0-9]+'
    
    while i < len(l):
        
        if l[i] != "-":
        
            for beginning in re.findall(beginnings_re, l[i]): #loop over mention beginnings
                ##find entity id
                entity_id = re.findall(numbers_re, beginning)[0]
                
                ##look for mention end (j being its index)
                is_in_mention = True
                j = i
                
                #computing nb_stacked_mentions is different for the first word's annotation of each mention
                popped_annotation = l[i][l[i].find(entity_id):]
                nb_stacked_mentions = 1 + len(popped_annotation.split("(")) - len(popped_annotation.split(")"))

                j += 1
                
                if nb_stacked_mentions <= 0: #found one-word mention
                    is_in_max_mention = False
                    
                else:
                
                    while is_in_mention:

                        nb_stacked_mentions += len(l[j].split("(")) - len(l[j].split(")"))
                        j += 1

                        if nb_stacked_mentions <= 0: #stop condition: add mention to output dictionaries
                            is_in_mention = False
                            
                #row = dataframes_dict[s].iloc[i]
                #datacoref_dict[row['col0']][row['col1']][entity_id].append(list(range(i, j)))
                datacorefids_dict[l_docs[i]][l_parts[i]][entity_id].append(list(range(i, j)))
                mention = " ".join(l_strings[i:j])
                datacorefstrings_dict[l_docs[i]][l_parts[i]][entity_id].append(mention)                
        
        i += 1
    
    return datacorefids_dict, datacorefstrings_dict

### Examples

In [43]:
test_coref_data = ["-", "(4)", "-", "(1", "(4)", "(6", "-", "(2", "(7)|2)|6)", "-", "1)", "-"]

In [44]:
test_coref_data

['-', '(4)', '-', '(1', '(4)', '(6', '-', '(2', '(7)|2)|6)', '-', '1)', '-']

In [45]:
annotation_to_bio_simple(test_coref_data)

['O', 'B', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O']

In [46]:
annotation_to_bio(test_coref_data)

['O', 'B', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O']

In [47]:
pos = ["UH", "PRP", "VB", "DT", "NNP", "JJ", "CC", "JJ", "NN", "RB", "RB", "SYM"]
parse = ["", "", "", "(VP*)", "", "", "", "", "", "", "", ""]

In [48]:
annotation_to_bio_pos(test_coref_data, pos)

['O', 'B-P', 'O', 'B-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'O']

In [49]:
annotation_to_bio(test_coref_data, pos)

['O', 'B-P', 'O', 'B-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'I-N', 'O']

In [50]:
annotation_to_bio(test_coref_data, pos, parse)

['O', 'B-P', 'O', 'B-V', 'I-V', 'I-V', 'I-V', 'I-V', 'I-V', 'I-V', 'I-V', 'O']

In [51]:
docs = [2]*12
parts = [1]*12
strings = ["hey", "michael", "is", "a", "Queen", "fan", "with", "bad", "hairstyle", "even", "today", "!"]

In [52]:
clusters, cluster_strings = annotation_to_dict(test_coref_data, docs, parts, strings)

In [53]:
clusters

defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>()>,
            {2: defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>.<locals>.<lambda>()>,
                         {1: defaultdict(list,
                                      {'4': [[1], [4]],
                                       '1': [[3, 4, 5, 6, 7, 8, 9, 10]],
                                       '6': [[5, 6, 7, 8]],
                                       '2': [[7, 8]],
                                       '7': [[8]]})})})

In [54]:
cluster_strings

defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>()>,
            {2: defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>.<locals>.<lambda>()>,
                         {1: defaultdict(list,
                                      {'4': ['michael', 'Queen'],
                                       '1': ['a Queen fan with bad hairstyle even today'],
                                       '6': ['fan with bad hairstyle'],
                                       '2': ['bad hairstyle'],
                                       '7': ['hairstyle']})})})

### Application of functions and addition to dataframes

#### `annotation_to_bio_simple`

In [33]:
for i in range (3):
    dataframes_dict[i]["col12"] = pd.Series(annotation_to_bio_simple(list(dataframes_dict[i]["col11"])))

In [36]:
dataframes_dict[0]

Unnamed: 0,col0,col0.1,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14
0,ANCOR-2AP0307,AN,000,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-,O,O,O
1,ANCOR-2AP0307,AN,000,1,monsieur,N,*))),-,-,-,-,-,(1335791777248),B,B-N,B-N
2,ANCOR-2AP0307,AN,000,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-,O,O,O
3,ANCOR-2AP0307,AN,000,1,ce,N,*,-,-,-,-,-,-,O,O,O
4,ANCOR-2AP0307,AN,000,2,que,N,*,-,-,-,-,-,-,O,O,O
5,ANCOR-2AP0307,AN,000,3,je,N,*,-,-,-,-,-,(1335791804829),B,B-N,B-N
6,ANCOR-2AP0307,AN,000,4,voudrais,N,*,-,-,-,-,-,-,O,O,O
7,ANCOR-2AP0307,AN,000,5,c',N,*,-,-,-,-,-,-,O,O,O
8,ANCOR-2AP0307,AN,000,6,est,N,*,-,-,-,-,-,-,O,O,O
9,ANCOR-2AP0307,AN,000,7,e,N,*,-,-,-,-,-,-,O,O,O


#### `annotation_to_bio`

In [55]:
# Using POS and parse trees
for i in range (3):
    dataframes_dict[i]["col13"] = pd.Series(annotation_to_bio(list(dataframes_dict[i]["col11"]), list(dataframes_dict[i]["col4"])))
    dataframes_dict[i]["col14"] = pd.Series(annotation_to_bio(list(dataframes_dict[i]["col11"]), list(dataframes_dict[i]["col4"]), list(dataframes_dict[i]["col5"])))

In [56]:
dataframes_dict[0][1629:1639][["col3", "col4", "col5", "col12", "col13", "col14"]]

Unnamed: 0,col3,col4,col5,col12,col13,col14
1629,placé,N,*,O,O,O
1630,que,N,*,O,O,O
1631,je,N,*,B,B-N,B-N
1632,me,N,*,O,O,O
1633,présente,N,*,O,O,O
1634,en,N,*,O,O,O
1635,polo,N,*,B,B-N,B-N
1636,devant,N,*,O,O,O
1637,un,N,*,B,B-N,B-N
1638,client,N,*,I,I-N,I-N


#### `annotation_to_dict`

In [57]:
datacoref_dict = defaultdict(lambda: defaultdict())
for i in range(3):
    datacoref_dict[i]["indexes"], datacoref_dict[i]["strings"] = annotation_to_dict(
        list(dataframes_dict[i]["col11"]), 
        list(dataframes_dict[i]["col0"]), 
        list(dataframes_dict[i]["col1"]), 
        list(dataframes_dict[i]["col3"]))

In [58]:
#look at entities indexes
new_def_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for s in range(3):
    for doc in datacoref_dict[s]["strings"].keys():
        for part in datacoref_dict[s]["strings"][doc].keys():
            new_def_dict[s][doc][part].append(list(datacoref_dict[s]["strings"][doc][part].keys()))

In [59]:
new_def_dict[0]['ANCOR-1AG0499']

defaultdict(list,
            {'000': [['1329471169811',
               '1329471496224',
               '1329468279226',
               '1329468331186',
               '1329468291496',
               '1329468349806',
               '1329471177128',
               '1330360555583',
               '1329471182354',
               '1329471185630',
               '1329468371556',
               '1329468427107',
               '1329471535022',
               '1329468444057',
               '1329471544694',
               '1329471228296',
               '1329471557439',
               '1329471236751',
               '1329471549467',
               '1329468467225',
               '1329468587706',
               '1329838099171',
               '1329838124775',
               '1329471250198',
               '1329469817732',
               '1329469875795',
               '1329838136857',
               '1329838144156',
               '1329469955339',
               '1329838151197',
               

In [60]:
new_list = [int(i) for _, part in new_def_dict[0]['wb/eng/00/eng_0000'].items() for i in part[0]]
new_list.sort()
new_list == list(range(len(new_list)))

True

In [61]:
datacoref_dict[0]["indexes"]

defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>()>,
            {'ANCOR-2AP0307': defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'000': defaultdict(list,
                                      {'1335791777248': [[1]],
                                       '1335791804829': [[5]],
                                       '1335791863017': [[10, 11, 12]],
                                       '1335791886495': [[14, 15, 16, 17]],
                                       '1335791906603': [[17]],
                                       '1335791949378': [[18, 19]],
                                       '1337442607875': [[21, 22]],
                                       '1335792019469': [[26]],
                                       '1335792059031': [[31,
                                         32,
                                         33,
                                         34,
                      

**For each document, entities are indexed starting from 0.**

In [62]:
datacoref_dict[0]["strings"]

defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>()>,
            {'ANCOR-2AP0307': defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'000': defaultdict(list,
                                      {'1335791777248': ['monsieur'],
                                       '1335791804829': ['je'],
                                       '1335791863017': ['les e brochures'],
                                       '1335791886495': ['les gîtes de France'],
                                       '1335791906603': ['France'],
                                       '1335791949378': ['les brochures'],
                                       '1337442607875': ['les gîtes'],
                                       '1335792019469': ['je'],
                                       '1335792059031': ['un numéro de téléphone des gîtes de France'],
                                       '1337593257257': ['téléphone'],
            

In [63]:
datacoref_dict[0]["indexes"]

defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>()>,
            {'ANCOR-2AP0307': defaultdict(<function __main__.annotation_to_dict.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'000': defaultdict(list,
                                      {'1335791777248': [[1]],
                                       '1335791804829': [[5]],
                                       '1335791863017': [[10, 11, 12]],
                                       '1335791886495': [[14, 15, 16, 17]],
                                       '1335791906603': [[17]],
                                       '1335791949378': [[18, 19]],
                                       '1337442607875': [[21, 22]],
                                       '1335792019469': [[26]],
                                       '1335792059031': [[31,
                                         32,
                                         33,
                                         34,
                      

### Save dataframes, reprise

In [64]:
data_parent_path = "../../data/structured_ANCORE/"

for i in range(3):
    data_path = data_parent_path + sets[i] + ".csv"
    dataframes_dict[i].to_csv(data_path)

## Features extraction

* A mention is represented by a list of successive word indexes, such as those in `datacoref_dict[0]['indexes']`.
* A word index identifies it within a set (not only within a document or a part).
* For features related to two mentions, these should be in the same document and in the same part (there is no use in comparing two mentions from different documents since they cannot be coreferent).

In [348]:
def get_mention_length(m):
    #INPUT: a list of successive word indexes
    return (1 + m[-1] - m[0])

In [352]:
def get_mentions_distance(m1, m2):
    #INPUTS: two lists of successive word indexes each
    return (m2[0] - m1[-1] - 1) if (m2[0] >= m1[0]) else (m1[0] - m2[-1] - 1)

In [None]:
def get_mention_gender(m):
    return

In [356]:
def get_mention_number(m):
    return

## Draft

### Goal here is to try and find a representation for the different lengths of datalists_dict structure

In [172]:
def get_nb_of_sons(d):
    first_element = list(d.keys())[0]
    print(d.keys())
    if type(d[first_element]) is defaultdict:
        value = sum([get_nb_of_sons(d[k]) for k in d.keys()])
    else:
        value = len(d.keys())
    return value

In [145]:
test_dict = {1:{1:{1:0, 2:0},
                   2:{1:0, 2:0, 3:0}},
               2:{1:{1:0, 2:0},
                  2:{1:0, 2:0}}}

In [146]:
get_nb_of_sons(test_dict)

dict_keys([1, 2])


2

In [147]:
test_dict.keys()

dict_keys([1, 2])

In [173]:
get_nb_of_sons(datalists_dict)

dict_keys([0, 1, 2, 'nw/wsj/24/wsj_2408', 'nw/wsj/24/wsj_2412', 'nw/xinhua/01/chtb_0150', 'bn/cnn/03/cnn_0360', 'nw/xinhua/03/chtb_0300', 'bn/cnn/03/cnn_0300', 'nw/wsj/24/wsj_2430', 'nw/xinhua/02/chtb_0280', 'nw/xinhua/02/chtb_0220', 'bn/voa/00/voa_0080', 'nw/wsj/24/wsj_2401', 'nw/wsj/00/wsj_0089', 'wb/c2e/00/c2e_0030', 'bn/cnn/02/cnn_0280', 'bn/cnn/01/cnn_0120', 'bn/cnn/01/cnn_0140', 'bn/cnn/04/cnn_0410', 'bn/cnn/04/cnn_0430', 'wb/eng/00/eng_0000', 'nw/wsj/24/wsj_2405', 'nw/wsj/24/wsj_2418', 'bn/cnn/00/cnn_0060', 'bn/pri/01/pri_0100', 'nw/wsj/24/wsj_2425', 'wb/a2e/00/a2e_0010', 'pt/nt/46/nt_4610', 'bn/cnn/00/cnn_0010', 'nw/wsj/22/wsj_2278', 'bn/pri/00/pri_0040', 'bc/phoenix/00/phoenix_0000', 'nw/xinhua/00/chtb_0020', 'wb/c2e/00/c2e_0020', 'bn/cnn/01/cnn_0130', 'bn/cnn/02/cnn_0270', 'nw/wsj/24/wsj_2422', 'mz/sinorama/10/ectb_1010', 'nw/wsj/24/wsj_2400', 'nw/wsj/24/wsj_2445', 'bn/cnn/02/cnn_0210', 'nw/wsj/24/wsj_2444', 'nw/wsj/24/wsj_2428', 'mz/sinorama/10/ectb_1060', 'mz/sinorama/10/ec

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', 0])
dict_keys([0])
dict_keys(['0', 0])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', 0])
dict_keys([0])
dict_keys(['0', 0])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', 0])
dict_keys([0])
dict_keys(['0', 0])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', 0])

dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', 0])
dict_keys([0])
dict_keys(['0', 0])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', 0])
dict_keys([0])
dict_keys(['0', 0])
dict_keys(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', 0])
dict_keys([0])
dict_keys(['0', '1', '2'

164201

In [171]:
get_nb_of_sons(datalists_dict[0]['nw/wsj/24/wsj_2408'])

defaultdict(<function <lambda>.<locals>.<lambda>.<locals>.<lambda> at 0x7f15566998c8>, {'0': defaultdict(<class 'list'>, {'0': ['nw/wsj/24/wsj_2408', '0', '0', 'Robert', 'NNP', '(TOP(S(NP(NP(NP*', '-', '-', '-', '-', '(PERSON*', '-'], '1': ['nw/wsj/24/wsj_2408', '0', '1', 'E.', 'NNP', '*', '-', '-', '-', '-', '*', '-'], '2': ['nw/wsj/24/wsj_2408', '0', '2', 'Lyons', 'NNP', '*', '-', '-', '-', '-', '*', '-'], '3': ['nw/wsj/24/wsj_2408', '0', '3', 'III', 'NNP', '*)', '-', '-', '-', '-', '*)', '-'], '4': ['nw/wsj/24/wsj_2408', '0', '4', ',', ',', '*', '-', '-', '-', '-', '*', '-'], '5': ['nw/wsj/24/wsj_2408', '0', '5', '39', 'CD', '(NP*))', '-', '-', '-', '-', '(DATE)', '-'], '6': ['nw/wsj/24/wsj_2408', '0', '6', ',', ',', '*', '-', '-', '-', '-', '*', '-'], '7': ['nw/wsj/24/wsj_2408', '0', '7', 'who', 'WP', '(SBAR(WHNP*)', '-', '-', '-', '-', '*', '-'], '8': ['nw/wsj/24/wsj_2408', '0', '8', 'headed', 'VBD', '(S(VP*', 'head', '01', '2', '-', '*', '-'], '9': ['nw/wsj/24/wsj_2408', '0', '9'

37

### Goal here is to try and save data for MD's use while keeping spaces at sentences end

In [72]:
marco_parent_path = "/run/user/71447/gvfs/sftp:host=decore0.imag.fr,user=sfeirj/home/getalp/sfeirj/data/ANCORE/data_for_marco"

In [71]:
for i in range(3):
    with f.open("{}/{}.txt".format(marco_parent_path, sets[i], "a")
        dataframes_dict[i][["col3", "col12"]]

Unnamed: 0,col3,col12
0,bonjour,O
1,monsieur,B
2,bonjour,O
3,ce,O
4,que,O
5,je,B
6,voudrais,O
7,c',O
8,est,O
9,e,O


In [73]:
sets

['dev', 'test', 'train']

In [99]:
parent_path = "/run/user/71447/gvfs/sftp:host=decore0.imag.fr,user=sfeirj/home/getalp/sfeirj/data/ANCORE/"
my_dataframes_dict = {}
data_list = []

for i in range(3):
    intermediate_path = parent_path + sets[i] + "/"

    for filename in tqdm(os.listdir(intermediate_path)):

        with open(intermediate_path + filename, 'r') as f:
            iter_readlines = iter(f.readlines())
            for line in iter_readlines:
                if (line == "\n"):
                    data_list.append("")
                elif ("#end document" not in line) and ("#begin document" not in line): #valid line
                    splitted_line = line[:-1].split()
                    useful_line = splitted_line[:11] + [splitted_line[-1]]
                    data_list.append(useful_line)

    print("Finished reading " + sets[i])
    my_dataframes_dict[i] = pd.DataFrame(data_list, columns=['col' + str(elem) for elem in range(12)])
    print("Finished building dataframe for " + sets[i])
    data_list = []

100%|██████████| 39/39 [00:00<00:00, 79.06it/s]
  0%|          | 0/120 [00:00<?, ?it/s]

Finished reading dev
Finished building dataframe for dev


100%|██████████| 120/120 [00:01<00:00, 75.56it/s]
  0%|          | 0/295 [00:00<?, ?it/s]

Finished reading test
Finished building dataframe for test


100%|██████████| 295/295 [00:03<00:00, 86.19it/s] 


Finished reading train
Finished building dataframe for train


In [100]:
my_dataframes_dict[0][:100]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11
0,ANCOR-2AP0307,000,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-
1,ANCOR-2AP0307,000,1,monsieur,N,*))),-,-,-,-,-,(1335791777248)
2,,,,,,,,,,,,
3,ANCOR-2AP0307,000,0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-
4,ANCOR-2AP0307,000,1,ce,N,*,-,-,-,-,-,-
5,ANCOR-2AP0307,000,2,que,N,*,-,-,-,-,-,-
6,ANCOR-2AP0307,000,3,je,N,*,-,-,-,-,-,(1335791804829)
7,ANCOR-2AP0307,000,4,voudrais,N,*,-,-,-,-,-,-
8,ANCOR-2AP0307,000,5,c',N,*,-,-,-,-,-,-
9,ANCOR-2AP0307,000,6,est,N,*,-,-,-,-,-,-


In [112]:
def my_annotation_to_bio_simple(l):
    #function which translates last column annotation for coreference resolution to BIO tagging
    #INPUT:  l: list of annotations for a single part of document
    #OUTPUT: l_bio: list of BIO tags of the same length than l
    #note that BIO tagging only considers maximal mentions
    
    l_bio = list(l)
    i = 0
    is_in_max_mention = False
    nb_stacked_mentions = 0
    
    while i < len(l):
        
        #look for end of sentences
        if l[i] == None:
            print("PARTY")
            l_bio[i] = ""
            i += 1
        
        #look for beginning
        elif l[i] == "-": #didn't find beginning
            l_bio[i] = "O"
            i += 1
        
        else: #found beginning
            is_in_max_mention = True
            nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
            l_bio[i] = "B"
            i += 1
            
            if nb_stacked_mentions == 0: #found one-word mention
                is_in_max_mention = False
            
            else: #at least one opened (not closed) mention
            
                while is_in_max_mention:

                    nb_stacked_mentions += len(l[i].split("(")) - len(l[i].split(")"))
                    l_bio[i] = "I"
                    if nb_stacked_mentions == 0:
                        is_in_max_mention = False
                    
                    i += 1
    
    return l_bio

In [113]:
for i in range (3):
    my_dataframes_dict[i]["col12"] = pd.Series(my_annotation_to_bio_simple(list(my_dataframes_dict[i]["col11"])))

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PARTY
PART

In [115]:
my_dataframes_dict[0][:35]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12
0,ANCOR-2AP0307,0.0,0.0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-,O
1,ANCOR-2AP0307,0.0,1.0,monsieur,N,*))),-,-,-,-,-,(1335791777248),B
2,,,,,,,,,,,,,
3,ANCOR-2AP0307,0.0,0.0,bonjour,N,(TOP(S(NP*,-,-,-,-,-,-,O
4,ANCOR-2AP0307,0.0,1.0,ce,N,*,-,-,-,-,-,-,O
5,ANCOR-2AP0307,0.0,2.0,que,N,*,-,-,-,-,-,-,O
6,ANCOR-2AP0307,0.0,3.0,je,N,*,-,-,-,-,-,(1335791804829),B
7,ANCOR-2AP0307,0.0,4.0,voudrais,N,*,-,-,-,-,-,-,O
8,ANCOR-2AP0307,0.0,5.0,c',N,*,-,-,-,-,-,-,O
9,ANCOR-2AP0307,0.0,6.0,est,N,*,-,-,-,-,-,-,O


In [77]:
#write files for Marco
for i in range(3):
    my_dataframes_dict[i][["col3", "col12"]].to_csv("{}/{}.txt".format(marco_parent_path, sets[i]), index=False, header=False)

In [119]:
for i in range(3):
    with open("{}/{}.txt".format(marco_parent_path, sets[i]), 'w') as f:
        for idx,row in my_dataframes_dict[i].iterrows():
            if row["col3"] != None:
                f.write("{},{}\n".format(row["col3"],row["col12"]))
            else:
                f.write("\n")