In [343]:
import pandas as pd
import numpy as np
import os
import re
import string
import xml.etree.ElementTree as ET
import json
import random
from collections import defaultdict

# Exploring

In [172]:
positionings = pd.read_csv("positionings.csv", delimiter='\t')

In [173]:
positionings_6_columns = positionings.iloc[:, :6]

In [174]:
positionings_6_columns.head()

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw
0,14,wsj_1485,320..324,Expansion.Conjunction,,also
1,14,wsj_1461,402..406,Expansion.Conjunction,,with
2,14,wsj_1461,782..785,Comparison.Concession.Arg2-as-denier,,but
3,14,wsj_1461,1123..1125,Comparison.Similarity,,as
4,14,wsj_1416,331..343,Contingency.Cause.Result,,consequently


In [175]:
positionings_6_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24238 entries, 0 to 24237
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   wsj_section  24238 non-null  int64 
 1   wsj_doc      24238 non-null  object
 2   DcOffset     24238 non-null  object
 3   Label1       24238 non-null  object
 4   Label2       1167 non-null   object
 5   DcRaw        24238 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.1+ MB


In [179]:
# Identified wrong entry in the dataframe where there is multiple offsets but only one connector
dd = positionings_6_columns[positionings_6_columns['DcOffset'].str.contains(';')].copy()
filtered_df = dd[dd['DcRaw'].apply(lambda x: len(x.split()) == 1)]
filtered_df.head()

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw
935,14,wsj_1469,6474..6486;6497..6501,Temporal.Synchronous,Contingency.Cause.Reason,when


In [180]:
#modify row 935 to only have one offset
positionings_6_columns.loc[935, "DcOffset"] = "6497..6501"
positionings_6_columns.loc[935]

wsj_section                          14
wsj_doc                        wsj_1469
DcOffset                     6497..6501
Label1             Temporal.Synchronous
Label2         Contingency.Cause.Reason
DcRaw                              when
Name: 935, dtype: object

In [181]:
filtered_df = positionings_6_columns[positionings_6_columns['DcOffset'].str.count(';') >= 2]
filtered_df

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw
3738,21,wsj_2155,3107..3115;3152..3155;3160..3164,Expansion.Conjunction,,not only but also
10477,10,wsj_1002,5828..5836;5877..5880;5885..5889,Expansion.Conjunction,,not only but also
18790,11,wsj_1187,4899..4907;4965..4968;4979..4983,Expansion.Conjunction,,not only but also


In [182]:
len(positionings_6_columns)

24238

# Preprocess

In [453]:
positionings_6_columns_clean = positionings_6_columns.copy()
#cloning the DcRaw column for preserving statistics purposes 
positionings_6_columns_clean['DcRaw_og'] = positionings_6_columns_clean['DcRaw'].copy()

In [454]:
positionings_6_columns_clean

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw,DcRaw_og
0,14,wsj_1485,320..324,Expansion.Conjunction,,also,also
1,14,wsj_1461,402..406,Expansion.Conjunction,,with,with
2,14,wsj_1461,782..785,Comparison.Concession.Arg2-as-denier,,but,but
3,14,wsj_1461,1123..1125,Comparison.Similarity,,as,as
4,14,wsj_1416,331..343,Contingency.Cause.Result,,consequently,consequently
...,...,...,...,...,...,...,...
24233,24,wsj_2404,2939..2949,Expansion.Conjunction,,separately,separately
24234,24,wsj_2441,880..884,Expansion.Conjunction,,also,also
24235,24,wsj_2441,866..873,Contingency.Cause.Reason,,because,because
24236,24,wsj_2436,421..424,Comparison.Concession.Arg2-as-denier,,but,but


In [471]:
filtered_df = positionings_6_columns_clean[positionings_6_columns_clean['Label1'].str.endswith(('Reason+SpeechAct'), na=False)]
filtered_df

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw,DcRaw_og
17207,15,wsj_1560,1437..1440,Contingency.Cause+SpeechAct.Reason+SpeechAct,,but,but


## Cropping the third label

In [290]:
# crop the third label in Label1 column
positionings_6_columns_clean['Label1'] = positionings_6_columns_clean['Label1'].str.split('.').str[1]
positionings_6_columns_clean['Label2'] = positionings_6_columns_clean['Label2'].str.split('.').str[1]
positionings_6_columns_unsplitted = positionings_6_columns_clean.copy()
positionings_6_columns_clean[['Label1','Label2']]

Unnamed: 0,Label1,Label2
0,Conjunction,
1,Conjunction,
2,Concession,
3,Similarity,
4,Cause,
...,...,...
24233,Conjunction,
24234,Conjunction,
24235,Cause,
24236,Concession,


In [292]:
positionings_6_columns_clean["Label1"].nunique()

21

In [293]:
dd = positionings_6_columns[positionings_6_columns['DcOffset'].str.contains(';')].copy()
len(dd)

123

In [294]:
positionings_6_columns_clean[positionings_6_columns_clean['DcRaw'].str.contains(' ')]

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw,DcRaw_og
5,14,wsj_1416,1221..1228,Conjunction,,in fact,in fact
12,14,wsj_1416,2452..2463,Concession,,even though,even though
16,14,wsj_1424,748..759,Instantiation,,for example,for example
20,14,wsj_1424,2806..2812,Cause,,due to,due to
21,14,wsj_1424,2886..2896,Synchronous,,as long as,as long as
...,...,...,...,...,...,...,...
24145,24,wsj_2428,7083..7090,Level-of-detail,,in that,in that
24153,24,wsj_2428,8475..8486,Conjunction,,in addition,in addition
24167,24,wsj_2417,807..818,Concession,,even though,even though
24184,24,wsj_2417,3986..3997,Conjunction,,in addition,in addition


In [295]:
# step 1 split the offset column
def splitting_multiple_offsets(raw_data_frame, is_cz = False):
    # New row splitting approach
    split_rows = raw_data_frame[raw_data_frame['DcRaw'].str.contains(' ')].copy()
    split_rows['DcRaw'] = split_rows['DcRaw'].str.split()

    # first step is splitting all the rows that have ";"" and then make 1 pass to make all of the dataframe have 1 offset
    split_rows['DcOffset'] = split_rows['DcOffset'].apply(lambda x: x.split(';') if ';' in x else [x]) 
    dfs = []
    orignal_indicies = []
    for index, row in split_rows.iterrows():
        # we will first work on only the rows that have multiple offsets (2 and 3 offsets seperated by ";")
            if len(row["DcOffset"]) > 1:
                # save the row data
                if not is_cz:    
                    wsj_section = row['wsj_section']
                    wsj_doc = row['wsj_doc']
                    label1 = row['Label1']
                    label2 = row['Label2']
                    dcRaw = row["DcRaw_og"]
                else:
                    label1 = row['Label1']
                # initialize helpful variables
                word_pointer = 0
                connectors = row["DcRaw"]
                # iterate over offsets
                for offset in row["DcOffset"]:
                    start_offset = int(offset.split("..")[0])
                    end_offset = int(offset.split("..")[1])
                    # list to store the connectors in their respective offset
                    words_per_offset = []
                    # variable to check if you exceeded the offet length
                    offset_length_count = start_offset
                    # loop over the connectors in the row and see if adding the length of the word to the count will fit 
                    while word_pointer < len(connectors):
                        if offset_length_count + len(connectors[word_pointer]) <= end_offset:
                            # if it does add it to the list and point to the next word and update counter
                            words_per_offset.append(connectors[word_pointer])                                                
                            offset_length_count += len(connectors[word_pointer]) + 1 # considering the space length
                            word_pointer += 1

                        else:
                            # add the list to the offset in a single row
                            if not is_cz:
                                row_adjusted = {'wsj_section': wsj_section,
                                                'wsj_doc': wsj_doc,
                                                'DcOffset': offset,
                                                'Label1': label1,
                                                'Label2': label2,
                                                'DcRaw_og': dcRaw,
                                                'DcRaw': ' '.join(words_per_offset)}
                            else:
                                row_adjusted = {'DcOffset': offset,
                                                'Label1': label1,
                                                'DcRaw': ' '.join(words_per_offset)}

                            dfs.append(row_adjusted)
                            words_per_offset = []
                            break

                    # if not empty add the last word in the list
                    if words_per_offset:
                        if not is_cz:
                                row_adjusted = {'wsj_section': wsj_section,
                                                'wsj_doc': wsj_doc,
                                                'DcOffset': offset,
                                                'Label1': label1,
                                                'Label2': label2,
                                                'DcRaw_og': dcRaw,
                                                'DcRaw': ' '.join(words_per_offset)}
                        else:
                            row_adjusted = {'DcOffset': offset,
                                            'Label1': label1,
                                            'DcRaw': ' '.join(words_per_offset)}

                        dfs.append(row_adjusted)
            # If it's only 1 offset leave it as is in the df and deal with it in the second pass(save the indicies and append it in the end)
            else:
                orignal_indicies.append(index)
                
    # dataframe with all the adjusted rows that have ;
    result_df = pd.DataFrame(dfs)

    # Add the 1 offset rows
    for index in orignal_indicies:
        result_df = pd.concat([result_df, raw_data_frame.iloc[index].to_frame().T] , ignore_index=True)

    return result_df

In [296]:
result_df = splitting_multiple_offsets(positionings_6_columns_clean)
result_df

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw_og,DcRaw
0,14,wsj_1424,4740..4744,Conjunction,,both and,both
1,14,wsj_1424,4790..4793,Conjunction,,both and,and
2,14,wsj_1457,4597..4612,Contrast,,on the one hand on the other,on the one hand
3,14,wsj_1457,4672..4684,Contrast,,on the one hand on the other,on the other
4,14,wsj_1467,4780..4786,Disjunction,,either or,either
...,...,...,...,...,...,...,...
1733,24,wsj_2428,7083..7090,Level-of-detail,,in that,in that
1734,24,wsj_2428,8475..8486,Conjunction,,in addition,in addition
1735,24,wsj_2417,807..818,Concession,,even though,even though
1736,24,wsj_2417,3986..3997,Conjunction,,in addition,in addition


In [297]:
# step 2 split the DcRaw column

# now you have a dataframe, for each row you only have 1 offset but can have multiple consecutive connectors within this offset
# break it down even more to have one word in DcRaw per row.
            
# subsequent connectors case, adjust the offseet such that num1..num1 + len(word) and num1 + len(word)+1..num2 +...
# Function to break down entries with more than one word.
def break_down_row(row, is_cz = False):
    if len(row['DcRaw'].split()) > 1:
        words = row['DcRaw'].split()
        start_offset = int(row['DcOffset'].split('..')[0])
        end_offset = int(row['DcOffset'].split('..')[1])
        offset_increment = (end_offset - start_offset) // len(words)
        new_rows = []
        current_offset = start_offset
        for word in words:
            if not is_cz:
                new_rows.append({
                    'wsj_section': row['wsj_section'],
                    'wsj_doc': row['wsj_doc'],
                    'DcOffset': f"{current_offset}..{current_offset + len(word)}",
                    'Label1': row['Label1'],
                    'Label2': row['Label2'],
                    'DcRaw_og': row['DcRaw_og'],
                    'DcRaw': word
                })
            else:
                new_rows.append({
                    'DcOffset': f"{current_offset}..{current_offset + len(word)}",
                    'Label1': row['Label1'],
                    'DcRaw': word
                })
                
            current_offset += len(word) + 1
        return new_rows
    else:
        return [row.to_dict()]

# Apply the function to each row and concatenate the results
new_rows = []
for _, row in result_df.iterrows():
    new_rows.extend(break_down_row(row))

# Create a new DataFrame with the modified rows
splitted_rows_df = pd.DataFrame(new_rows)

# Reset index
splitted_rows_df.reset_index(drop=True, inplace=True)
splitted_rows_df.head()

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw_og,DcRaw
0,14,wsj_1424,4740..4744,Conjunction,,both and,both
1,14,wsj_1424,4790..4793,Conjunction,,both and,and
2,14,wsj_1457,4597..4599,Contrast,,on the one hand on the other,on
3,14,wsj_1457,4600..4603,Contrast,,on the one hand on the other,the
4,14,wsj_1457,4604..4607,Contrast,,on the one hand on the other,one


In [298]:
# step 3 replace rows that got splitted with the splitted rows
def replace_splitted_rows(og_df, new_splitted_rows, is_cz = False):
    # Drop the rows that were extracted for splitting
    for index, row in og_df.iterrows():
        # Check if the 'DcRaw' column contains a space
        if ' ' in row['DcRaw']:
            # Remove the row
            og_df.drop(index, inplace=True)
    # Reset the index after removing rows
    og_df.reset_index(drop=True, inplace=True)
    # Concatenate the splitted_rows_df with positionings_6_columns_clean
    og_df = pd.concat([og_df, new_splitted_rows], ignore_index=True)
    # Sort the DataFrame by 'wsj_doc'
    if not is_cz:
        og_df.sort_values(by='wsj_doc', inplace=True)
    # Reset the index
    og_df.reset_index(drop=True, inplace=True)
    return og_df

In [299]:
positionings_6_columns_clean = replace_splitted_rows(positionings_6_columns_clean, splitted_rows_df)
positionings_6_columns_clean

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw,DcRaw_og
0,0,wsj_0003,3522..3525,Conjunction,,and,and
1,0,wsj_0003,3904..3907,Concession,,but,but
2,0,wsj_0003,1440..1443,Conjunction,,and,and
3,0,wsj_0003,534..542,Contrast,,although,although
4,0,wsj_0003,284..288,Cause+Belief,Level-of-detail,with,with
...,...,...,...,...,...,...,...
26308,24,wsj_2454,558..561,Conjunction,,and,and
26309,24,wsj_2454,469..471,Synchronous,,as,as
26310,24,wsj_2454,465..468,Conjunction,,and,and
26311,24,wsj_2454,4628..4633,Purpose,,order,in order


## validity check


In [300]:
#result_df[["wsj_doc","DcOffset", "DcRaw"]].head(50)
filtered_df = positionings_6_columns[positionings_6_columns['DcRaw'].apply(lambda x: len(x.split(" ")) == 4)]
filtered_df

Unnamed: 0,wsj_section,wsj_doc,DcOffset,Label1,Label2,DcRaw
700,14,wsj_1464,1907..1923,Temporal.Synchronous,,at the same time
1022,7,wsj_0748,1286..1302,Temporal.Synchronous,,at the same time
1087,7,wsj_0742,7600..7617,Comparison.Contrast,,on the other hand
1176,7,wsj_0799,5809..5817;5878..5886,Expansion.Conjunction,,not only but also
1223,7,wsj_0764,3031..3048,Comparison.Contrast,,on the other hand
...,...,...,...,...,...,...
22766,16,wsj_1634,5097..5114,Comparison.Contrast,,on the other hand
23192,16,wsj_1642,276..293,Comparison.Contrast,,on the other hand
23355,16,wsj_1613,758..775,Comparison.Contrast,,on the other hand
23383,16,wsj_1693,989..1005,Temporal.Synchronous,,at the same time


In [306]:
unique_labels = positionings_6_columns_clean['Label1'].unique()
# Create label_map
label_map = {label: idx + 1 for idx, label in enumerate(unique_labels)}
print(label_map)

{'Conjunction': 1, 'Concession': 2, 'Contrast': 3, 'Cause+Belief': 4, 'Asynchronous': 5, 'Cause': 6, 'Instantiation': 7, 'Condition': 8, 'Synchronous': 9, 'Substitution': 10, 'Disjunction': 11, 'Purpose': 12, 'Level-of-detail': 13, 'Manner': 14, 'Negative-condition': 15, 'Concession+SpeechAct': 16, 'Condition+SpeechAct': 17, 'Similarity': 18, 'Exception': 19, 'Equivalence': 20, 'Cause+SpeechAct': 21}


# Parsing the file

In [411]:
# sort the tokens and ids lists to training or testing or dev set depending on the folder number
def sort_data(folder_num, tokens_list, id_list, train_tokens, train_ids, test_tokens, test_ids, dev_tokens, dev_ids):
    folder_num = int(folder_num)
    if folder_num == 22:
        dev_tokens.append(tokens_list)
        dev_ids.append(id_list)
    elif folder_num == 23:
        test_tokens.append(tokens_list)
        test_ids.append(id_list)
    else:
        train_tokens.append(tokens_list)
        train_ids.append(id_list)

In [412]:
def convert_to_jsonl(tokens_list, ids_list, output_file):
    # Ensure the lengths of tokens_list and ids_list are the same
    if len(tokens_list) != len(ids_list):
        raise ValueError("The lengths of tokens_list and ids_list must be the same.")
    
    # Open the output file in write mode
    with open(output_file, 'w') as file:
        for tokens, ids in zip(tokens_list, ids_list):
                # Create a dictionary for each entry
                entry = {"tokens": tokens, "ner_tags": ids}
                # Convert the dictionary to a JSON string and write it to the file
                file.write(json.dumps(entry) + '\n')

In [413]:
def convert_to_json(tokens_list, ids_list, output_file):
    # Ensure the lengths of tokens_list and ids_list are the same
    if len(tokens_list) != len(ids_list):
        raise ValueError("The lengths of tokens_list and ids_list must be the same.")
    
    # Create a list to hold all entries
    entries = []
    for tokens, ids in zip(tokens_list, ids_list):
        # Create a dictionary for each entry
        entry = {"tokens": tokens, "ner_tags": ids}
        # Append each entry to the list
        entries.append(entry)
    
    # Save all entries as a JSON array in the output file
    with open(output_file, 'w') as file:
        json.dump(entries, file, indent=4)

In [414]:
# 0s and multiple labels for relation classification task
def check_offset_relation_recognition_task(word, offsets_labels, label_map, word_offset):
    # Check if the word is within any offset
    flag = 0
    for offset, label in offsets_labels:
        start, end = map(int, offset.split('..'))
        # adjust offset if there is punctuation
        if word[-1] in string.punctuation and flag == 0:
            #print(word_offset)
            #print(word)
            flag = 1
            word_offset = word_offset - 1
        if end == word_offset:
            # If the word is within an offset, replace it with the corresponding label
            if flag == 1:
                flag = 0
            return label_map[label]
    # If not within offset, replace the word with 0
    return 0

In [415]:
# only 0s and 1s for only connective detection task
def check_offset_connevtive_detection_task(word, offsets_labels, label_map, word_offset):
    # Check if the word is within any offset
    flag = 0
    for offset, label in offsets_labels:
        start, end = map(int, offset.split('..'))
        # adjust offset if there is punctuation
        if word[-1] in string.punctuation and flag == 0:
            #print(word_offset)
            #print(word)
            flag = 1
            word_offset = word_offset - 1
        if end == word_offset:
            # If the word is within an offset, replace it with the corresponding label
            if flag == 1:
                flag = 0
            return 1
    # If not within offset, replace the word with 0
    return 0

In [416]:
def parse_text(text, offsets_labels):
    processed_text = []
    current_word = ''  
    token_list =[]
    for current_position, char in enumerate(text):
        if char == ' ':  # Check if the character is a space(word delimiter)                
            # Process the current word
            if current_word != '':
                processed_text.append(check_offset_relation_recognition_task(current_word, offsets_labels, label_map, current_position))
                token_list.append(current_word)
            current_word = ''
            
        elif char == "\n": # another delimiter but we include it in our tokens list with id = 0
            if current_word != '':
                processed_text.append(check_offset_relation_recognition_task(current_word, offsets_labels, label_map, current_position))
                token_list.append(current_word)
                
            current_word = ''
            processed_text.append(0)
            token_list.append("\n")
            
        else:
            current_word += char
    # Process the last word 
    if current_word:
        processed_text.append(check_offset_relation_recognition_task(current_word, offsets_labels, label_map, current_position))
        token_list.append(current_word)
    return processed_text, token_list

In [417]:
    base_dir = './PDTB3.0_raw'
    # files_list to keep track of the iterated files
    files_list = [positionings_6_columns_clean["wsj_doc"][0]]
    # tuples of connectors offsets with their coressponding label
    connector_offsets_label = []
    folder_name = f"{positionings_6_columns_clean['wsj_section'][0]:02d}"
    file_name = positionings_6_columns_clean['wsj_doc'][0]
    folder_path = os.path.join(base_dir, folder_name)
    file_path = os.path.join(folder_path, file_name)
    count = 0
    
    # lists to store all the processed text and their respective tokens
    all_tokens_list = []
    all_parsed_text_list = []
    train_tokens = []
    train_ids = []
    test_tokens = []
    test_ids = []
    dev_tokens = []
    dev_ids = []
    
    for index, row in positionings_6_columns_clean.iterrows():
        file_name = row['wsj_doc']
        if file_name not in files_list:
            #if new file then we need to parse the previous one using the offset-label tuples we have.
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='latin-1') as file:
                    # Parse the text
                    text = file.read()
                    parsed_text, tokens_list = parse_text(text, connector_offsets_label)
                    all_tokens_list.append(tokens_list)
                    all_parsed_text_list.append(parsed_text)
                    # split and sort the data
                    sort_data(folder_name, tokens_list, parsed_text, train_tokens, train_ids, test_tokens, test_ids, dev_tokens, dev_ids)
                    # Reset the list for the new file
                    connector_offsets_label = []
                    connector_offsets_label.append((row['DcOffset'], row["Label1"]))
                    files_list.append(file_name)
            else:
                print(f"File not found: {file_path}")
                
            folder_name = f"{row['wsj_section']:02d}"
            folder_path = os.path.join(base_dir, folder_name)
            file_path = os.path.join(folder_path, file_name)
        else:
            # if iterating over the same file keep track of the offsets and their corresponding labels
            if not pd.isna(row['Label2']):
                # use label1_label2
                #connector_offsets_label.append((row['DcOffset'], f"{row['Label1']}_{row['Label2']}"))
                
                # use only label1 as the model only works on int ids cant use label1_label2
                connector_offsets_label.append((row['DcOffset'], f"{row['Label1']}"))
            else:
                connector_offsets_label.append((row['DcOffset'], f"{row['Label1']}"))

# last File parsing
if os.path.exists(file_path):
        with open(file_path, 'r', encoding='latin-1') as file:
            # Parse the text
            text = file.read()
            parsed_text, tokens_list = parse_text(text, connector_offsets_label)
            all_tokens_list.append(tokens_list)
            all_parsed_text_list.append(parsed_text)
            sort_data(folder_name, tokens_list, parsed_text, train_tokens, train_ids, test_tokens, test_ids, dev_tokens, dev_ids)

else:
        print(f"Last file not found: {file_path}")

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6


## Partition articles into sentences by using line breaks

In [353]:
# Function to segment lists based on a line break
def segment_lists(tokens, ids):
    flat_tokens = [token for sublist in tokens for token in sublist]
    flat_ids = [id for sublist in ids for id in sublist]
    token_segments = []
    id_segments = []
    current_tokens = []
    current_ids = []
    for token, id in zip(flat_tokens, flat_ids):
        if token == "\n":
            if current_tokens:  
                token_segments.append(current_tokens)
                id_segments.append(current_ids)
                current_tokens = []
                current_ids = []
        else:
            current_tokens.append(token)
            current_ids.append(id)
    # Add the last segment if there's any
    if current_tokens:
        token_segments.append(current_tokens)
        id_segments.append(current_ids)

    return token_segments, id_segments

In [422]:
train_tokens_sentences, train_ids_sentences = segment_lists(train_tokens, train_ids)
test_tokens_sentences, test_ids_sentences = segment_lists(test_tokens, test_ids)
dev_tokens_sentences, dev_ids_sentences = segment_lists(dev_tokens, dev_ids)

# Statistics about the English dataset

## Map of Relation --> Connector

In [72]:
result = positionings_6_columns_unsplitted.groupby('Label1')['DcRaw'].value_counts().reset_index(name='count')
# Export to CSV file
result.to_csv('Labels_stats_en(relation --> connector).csv', index=False)

result

Unnamed: 0,Label1,DcRaw,count
0,Arg1-as-cond,and,22
1,Arg1-as-cond,then,1
2,Arg1-as-denier,although,205
3,Arg1-as-denier,while,201
4,Arg1-as-denier,though,91
...,...,...,...
359,Synchronous,however,1
360,Synchronous,if,1
361,Synchronous,if and when,1
362,Synchronous,in the meanwhile,1


## Map Connector ---> Relation

In [46]:
def append_dict_token(token_dict, token, identifier):
     # If the token is not already in the token_counts dictionary, add it
    if token not in token_counts:
        token_counts[token] = {}
    # Increment the count for the corresponding identifier
    if identifier not in token_counts[token]:
        token_counts[token][identifier] = 1
    else:
        token_counts[token][identifier] += 1

In [448]:
# Initialize a defaultdict to store counts
token_counts = defaultdict(lambda: defaultdict(int))
unique_connectors = positionings_6_columns_clean["DcRaw"].unique()

# Initialize a variable to store tokens
token_buffer = []
# Iterate through tokens and identifiers simultaneously
for tokens, parsed_text in zip(all_tokens_list, all_parsed_text_list):
    for token, identifier in zip(tokens, parsed_text):
         # Check if the token is in the set of unique tokens
        if token in unique_connectors:
            # check if we have consecutive tokens and they have the same identifier then join tokens together
            if token_buffer and identifier == identifier_consecutive:
                # add it in a buffer after checking consecutive case
                token_buffer.append(token)
                identifier_consecutive = identifier
                
            # no buffer so this is a first encountered connector 
            elif not token_buffer:
                token_buffer.append(token)
                identifier_consecutive = identifier
                
            # case for consecutive connectors but with different identifiers
            else:
                # add the token_buffer
                token_previous = ' '.join(token_buffer)
                append_dict_token(token_counts, token_previous, identifier_consecutive)
                token_buffer = []
                # add the current token to a new empty buffer
                token_buffer.append(token)
                identifier_consecutive = identifier
        # non connector token and token_buffer is not empty, then append it to dict
        elif token_buffer:
            token = ' '.join(token_buffer) 
            token_buffer = []
            append_dict_token(token_counts, token, identifier_consecutive)    

for token, counts in token_counts.items():
    print("Special Token:", token)
    for identifier, count in counts.items():
        print("Identifier:", identifier, "Count:", count)

Special Token: of
Identifier: 0 Count: 17940
Identifier: 25 Count: 13
Identifier: 6 Count: 1
Special Token: once
Identifier: 0 Count: 70
Identifier: 5 Count: 47
Identifier: 10 Count: 1
Special Token: to
Identifier: 0 Count: 22248
Special Token: a
Identifier: 0 Count: 13856
Special Token: more than
Identifier: 0 Count: 411
Special Token: is
Identifier: 0 Count: 4872
Identifier: 16 Count: 1
Special Token: the
Identifier: 0 Count: 23086
Special Token: with
Identifier: 4 Count: 8
Identifier: 0 Count: 2868
Identifier: 16 Count: 111
Identifier: 7 Count: 50
Identifier: 9 Count: 7
Identifier: 1 Count: 41
Identifier: 17 Count: 6
Identifier: 11 Count: 2
Identifier: 10 Count: 1
Identifier: 3 Count: 1
Special Token: even
Identifier: 0 Count: 252
Special Token: that
Identifier: 0 Count: 6475
Identifier: 7 Count: 4
Identifier: 12 Count: 1
Identifier: 5 Count: 1
Identifier: 8 Count: 1
Special Token: later
Identifier: 0 Count: 82
Identifier: 8 Count: 77
Special Token: in
Identifier: 0 Count: 11017
Ide

Identifier: 10 Count: 33
Identifier: 26 Count: 1
Identifier: 11 Count: 4
Identifier: 5 Count: 3
Identifier: 14 Count: 1
Identifier: 20 Count: 1
Special Token: by the end of
Identifier: 0 Count: 28
Special Token: both
Identifier: 0 Count: 169
Identifier: 1 Count: 6
Special Token: previously
Identifier: 5 Count: 40
Identifier: 0 Count: 71
Special Token: on that
Identifier: 0 Count: 24
Special Token: as a result of an
Identifier: 0 Count: 2
Special Token: of an
Identifier: 0 Count: 162
Special Token: and by
Identifier: 0 Count: 21
Special Token: as well as to
Identifier: 0 Count: 7
Special Token: still of
Identifier: 0 Count: 1
Special Token: is in a
Identifier: 0 Count: 13
Special Token: one of the next
Identifier: 0 Count: 1
Special Token: is still
Identifier: 0 Count: 65
Special Token: time such an
Identifier: 0 Count: 1
Special Token: for by an
Identifier: 0 Count: 1
Special Token: of such
Identifier: 0 Count: 45
Special Token: is the only
Identifier: 0 Count: 12
Special Token: before

Identifier: 0 Count: 9
Special Token: of that
Identifier: 0 Count: 74
Special Token: in case
Identifier: 10 Count: 4
Special Token: in a case
Identifier: 0 Count: 8
Special Token: on the case by the end of the
Identifier: 0 Count: 1
Special Token: with the same
Identifier: 0 Count: 8
Special Token: therefore
Identifier: 12 Count: 23
Special Token: or less
Identifier: 0 Count: 7
Special Token: in by the end of the
Identifier: 0 Count: 1
Special Token: and in
Identifier: 0 Count: 50
Special Token: is in
Identifier: 0 Count: 77
Special Token: to less than
Identifier: 0 Count: 8
Special Token: from about
Identifier: 0 Count: 26
Special Token: due to
Identifier: 0 Count: 44
Identifier: 7 Count: 1
Special Token: of one
Identifier: 0 Count: 27
Special Token: upon
Identifier: 0 Count: 26
Identifier: 10 Count: 1
Identifier: 11 Count: 3
Identifier: 5 Count: 1
Special Token: as such
Identifier: 0 Count: 1
Special Token: is not the
Identifier: 0 Count: 12
Special Token: at any
Identifier: 0 Count:

Identifier: 0 Count: 2
Special Token: if not the
Identifier: 0 Count: 1
Special Token: not because of
Identifier: 0 Count: 3
Special Token: addition or
Identifier: 0 Count: 1
Special Token: instead of less
Identifier: 0 Count: 1
Special Token: with whether
Identifier: 0 Count: 2
Special Token: on the long
Identifier: 0 Count: 1
Special Token: of fact
Identifier: 0 Count: 2
Special Token: one for
Identifier: 0 Count: 5
Special Token: but because of the
Identifier: 0 Count: 1
Special Token: and less
Identifier: 0 Count: 11
Special Token: is next for
Identifier: 0 Count: 1
Special Token: more and more
Identifier: 0 Count: 6
Special Token: the end
Identifier: 0 Count: 4
Identifier: 1 Count: 1
Identifier: 16 Count: 1
Identifier: 23 Count: 1
Identifier: 12 Count: 1
Identifier: 8 Count: 2
Special Token: result in further
Identifier: 0 Count: 2
Special Token: of both the
Identifier: 0 Count: 9
Special Token: is not the time to
Identifier: 0 Count: 1
Special Token: is finally
Identifier: 0 Coun

Identifier: 0 Count: 5
Special Token: or more that
Identifier: 0 Count: 1
Special Token: is even
Identifier: 0 Count: 6
Special Token: both on
Identifier: 0 Count: 4
Special Token: neither the
Identifier: 0 Count: 6
Special Token: so long that the
Identifier: 0 Count: 1
Special Token: on the other
Identifier: 0 Count: 8
Special Token: in a previously
Identifier: 0 Count: 2
Special Token: a hand in
Identifier: 0 Count: 2
Special Token: much the
Identifier: 0 Count: 7
Special Token: in a given
Identifier: 0 Count: 3
Special Token: is about
Identifier: 0 Count: 23
Special Token: as in
Identifier: 0 Count: 12
Special Token: in more
Identifier: 0 Count: 7
Special Token: whether that is the
Identifier: 0 Count: 1
Special Token: that is the
Identifier: 0 Count: 7
Special Token: a result of the
Identifier: 0 Count: 5
Special Token: to further
Identifier: 0 Count: 15
Special Token: despite an
Identifier: 0 Count: 2
Special Token: that only a
Identifier: 0 Count: 5
Special Token: in more than
Id

Special Token: on that case
Identifier: 0 Count: 1
Special Token: than is
Identifier: 0 Count: 2
Special Token: fact of the matter is
Identifier: 0 Count: 1
Special Token: at that than at the
Identifier: 0 Count: 1
Special Token: in with
Identifier: 0 Count: 7
Special Token: where to
Identifier: 0 Count: 4
Special Token: now that
Identifier: 0 Count: 4
Identifier: 5 Count: 3
Identifier: 11 Count: 6
Identifier: 7 Count: 6
Special Token: not just for
Identifier: 0 Count: 2
Special Token: but for
Identifier: 0 Count: 6
Special Token: is one of
Identifier: 0 Count: 17
Special Token: if the
Identifier: 0 Count: 23
Special Token: and yet
Identifier: 1 Count: 1
Special Token: short of the
Identifier: 0 Count: 5
Special Token: the earlier
Identifier: 0 Count: 6
Special Token: more to
Identifier: 0 Count: 18
Special Token: and the only
Identifier: 0 Count: 1
Special Token: upon by the
Identifier: 0 Count: 2
Special Token: and so on
Identifier: 0 Count: 3
Special Token: or later the
Identifier: 

Identifier: 0 Count: 1
Special Token: result from the
Identifier: 0 Count: 2
Special Token: not matter to
Identifier: 0 Count: 1
Special Token: so long as
Identifier: 10 Count: 2
Special Token: that the more
Identifier: 0 Count: 3
Special Token: just one
Identifier: 0 Count: 6
Special Token: in the case of the
Identifier: 0 Count: 3
Special Token: the same from
Identifier: 0 Count: 1
Special Token: of whether
Identifier: 0 Count: 7
Special Token: that particular
Identifier: 0 Count: 2
Special Token: on a particular
Identifier: 0 Count: 2
Special Token: whatever
Identifier: 0 Count: 11
Identifier: 6 Count: 1
Special Token: end on
Identifier: 0 Count: 1
Special Token: even an
Identifier: 0 Count: 2
Special Token: a while
Identifier: 0 Count: 7
Special Token: on and
Identifier: 0 Count: 4
Special Token: just as
Identifier: 0 Count: 14
Special Token: time with
Identifier: 0 Count: 1
Special Token: that more than
Identifier: 0 Count: 7
Special Token: along with other
Identifier: 0 Count: 2


Special Token: or not for
Identifier: 0 Count: 1
Special Token: as well as other
Identifier: 0 Count: 5
Special Token: at or
Identifier: 0 Count: 4
Special Token: more of the
Identifier: 0 Count: 7
Special Token: that the other
Identifier: 0 Count: 2
Special Token: with neither
Identifier: 0 Count: 1
Special Token: in a case that
Identifier: 0 Count: 1
Special Token: a more than
Identifier: 0 Count: 2
Special Token: case in
Identifier: 0 Count: 5
Special Token: also a
Identifier: 0 Count: 2
Special Token: that time the
Identifier: 0 Count: 1
Special Token: else to
Identifier: 0 Count: 6
Special Token: again that
Identifier: 0 Count: 2
Special Token: for a more
Identifier: 0 Count: 1
Special Token: to end a
Identifier: 0 Count: 8
Special Token: by less than
Identifier: 0 Count: 2
Special Token: time in as
Identifier: 0 Count: 2
Special Token: time in the next
Identifier: 0 Count: 1
Special Token: in by the
Identifier: 0 Count: 1
Special Token: time that
Identifier: 0 Count: 3
Special To

Identifier: 0 Count: 2
Special Token: either to
Identifier: 0 Count: 1
Special Token: a long and
Identifier: 0 Count: 1
Special Token: or that of
Identifier: 0 Count: 1
Special Token: is not for
Identifier: 0 Count: 3
Special Token: much time
Identifier: 0 Count: 2
Special Token: for so long
Identifier: 0 Count: 1
Special Token: to so
Identifier: 0 Count: 1
Special Token: else such
Identifier: 0 Count: 1
Special Token: and in fact the
Identifier: 0 Count: 1
Special Token: but more
Identifier: 0 Count: 2
Special Token: in particular as
Identifier: 0 Count: 1
Special Token: but to
Identifier: 0 Count: 6
Special Token: of the the
Identifier: 0 Count: 4
Special Token: where more than
Identifier: 0 Count: 1
Special Token: any such
Identifier: 0 Count: 6
Special Token: and with a
Identifier: 0 Count: 3
Special Token: case on
Identifier: 0 Count: 2
Special Token: and on an
Identifier: 0 Count: 2
Special Token: that in a
Identifier: 0 Count: 4
Special Token: that not
Identifier: 0 Count: 2
Spe

Special Token: plus or
Identifier: 0 Count: 1
Special Token: from less than
Identifier: 0 Count: 4
Special Token: of a well now
Identifier: 0 Count: 1
Special Token: more next
Identifier: 0 Count: 2
Special Token: but about
Identifier: 0 Count: 2
Special Token: one time for
Identifier: 0 Count: 1
Special Token: now after
Identifier: 0 Count: 1
Special Token: less than at
Identifier: 0 Count: 1
Special Token: well a
Identifier: 0 Count: 1
Special Token: on as much as
Identifier: 0 Count: 2
Special Token: that is in the
Identifier: 0 Count: 1
Special Token: where one
Identifier: 0 Count: 2
Special Token: time when
Identifier: 0 Count: 1
Special Token: such long
Identifier: 0 Count: 2
Special Token: is the same
Identifier: 0 Count: 6
Special Token: is not in the
Identifier: 0 Count: 2
Special Token: is contrary both to
Identifier: 0 Count: 1
Special Token: from the fact that the
Identifier: 0 Count: 4
Special Token: the one that
Identifier: 0 Count: 2
Special Token: in the one
Identifier:

Identifier: 0 Count: 1
Special Token: is due to
Identifier: 0 Count: 6
Special Token: is not just a
Identifier: 0 Count: 1
Special Token: well on a
Identifier: 0 Count: 1
Special Token: any particular
Identifier: 0 Count: 2
Special Token: or for the
Identifier: 0 Count: 1
Special Token: much less the
Identifier: 0 Count: 1
Special Token: or less for
Identifier: 0 Count: 1
Special Token: on more than
Identifier: 0 Count: 3
Special Token: more such
Identifier: 0 Count: 2
Special Token: for example or the
Identifier: 0 Count: 1
Special Token: when when the
Identifier: 0 Count: 1
Special Token: after such
Identifier: 0 Count: 1
Special Token: likewise
Identifier: 1 Count: 2
Special Token: case to
Identifier: 0 Count: 3
Special Token: the case on
Identifier: 0 Count: 1
Special Token: only on one
Identifier: 0 Count: 1
Special Token: again to
Identifier: 0 Count: 2
Special Token: than as
Identifier: 0 Count: 1
Special Token: case that
Identifier: 0 Count: 1
Special Token: in one hand and a
I

Identifier: 0 Count: 1
Special Token: later than
Identifier: 0 Count: 2
Special Token: and less on
Identifier: 0 Count: 1
Special Token: for that matter the
Identifier: 0 Count: 1
Special Token: as previously
Identifier: 0 Count: 4
Special Token: but in no
Identifier: 0 Count: 1
Special Token: less than a particular
Identifier: 0 Count: 1
Special Token: not even
Identifier: 0 Count: 7
Special Token: and ultimately more
Identifier: 0 Count: 1
Special Token: until the end of the
Identifier: 0 Count: 1
Special Token: and when
Identifier: 0 Count: 2
Special Token: to an earlier
Identifier: 0 Count: 1
Special Token: of only the
Identifier: 0 Count: 1
Special Token: such as one
Identifier: 0 Count: 1
Special Token: but rather
Identifier: 0 Count: 1
Special Token: but instead for the
Identifier: 0 Count: 1
Special Token: without due
Identifier: 0 Count: 2
Special Token: in addition to any other
Identifier: 0 Count: 1
Special Token: where that
Identifier: 0 Count: 2
Special Token: and otherwis

Identifier: 0 Count: 1
Special Token: is such that
Identifier: 0 Count: 2
Special Token: in fact is
Identifier: 0 Count: 1
Special Token: along without
Identifier: 0 Count: 1
Special Token: is not at
Identifier: 0 Count: 2
Special Token: for alternative
Identifier: 0 Count: 1
Special Token: just a matter of time
Identifier: 0 Count: 1
Special Token: or otherwise
Identifier: 14 Count: 2
Special Token: than on an
Identifier: 0 Count: 1
Special Token: the same as in a
Identifier: 0 Count: 1
Special Token: that even a
Identifier: 0 Count: 1
Special Token: after only a
Identifier: 0 Count: 2
Special Token: but in the end
Identifier: 2 Count: 1
Special Token: after the fact that
Identifier: 0 Count: 1
Special Token: the matter with a
Identifier: 0 Count: 1
Special Token: in a fact
Identifier: 0 Count: 1
Special Token: in the case of
Identifier: 0 Count: 2
Special Token: in the fact
Identifier: 0 Count: 1
Special Token: only one other
Identifier: 0 Count: 1
Special Token: the case a
Identifie

Identifier: 0 Count: 1
Special Token: no time
Identifier: 0 Count: 1
Special Token: and then on
Identifier: 0 Count: 1
Special Token: subsequently to
Identifier: 0 Count: 1
Special Token: is that by
Identifier: 0 Count: 1
Special Token: both in a
Identifier: 0 Count: 1
Special Token: words or less about
Identifier: 0 Count: 1
Special Token: once in a while
Identifier: 0 Count: 1
Special Token: in one fact
Identifier: 0 Count: 1
Special Token: as for
Identifier: 0 Count: 4
Special Token: both on and
Identifier: 0 Count: 1
Special Token: later with
Identifier: 0 Count: 1
Special Token: and after a
Identifier: 0 Count: 1
Special Token: additionally
Identifier: 1 Count: 1
Special Token: that even as
Identifier: 0 Count: 1
Special Token: both a
Identifier: 0 Count: 3
Special Token: by both the
Identifier: 0 Count: 1
Special Token: that no other
Identifier: 0 Count: 1
Special Token: that the fact that no
Identifier: 0 Count: 1
Special Token: not only with
Identifier: 0 Count: 2
Special Token

Special Token: only at the
Identifier: 0 Count: 1
Special Token: of order to
Identifier: 0 Count: 1
Special Token: order a
Identifier: 0 Count: 1
Special Token: order is one of
Identifier: 0 Count: 1
Special Token: again with the same
Identifier: 0 Count: 1
Special Token: the other order
Identifier: 0 Count: 1
Special Token: at or by a
Identifier: 0 Count: 1
Special Token: any given
Identifier: 0 Count: 1
Special Token: just at the
Identifier: 0 Count: 1
Special Token: of the words
Identifier: 0 Count: 1
Special Token: on that one
Identifier: 0 Count: 1
Special Token: in other less
Identifier: 0 Count: 1
Special Token: time along with a
Identifier: 0 Count: 1
Special Token: time hence
Identifier: 0 Count: 1
Special Token: and finally
Identifier: 0 Count: 1
Special Token: beyond any
Identifier: 0 Count: 1
Special Token: where just
Identifier: 0 Count: 1
Special Token: the fact
Identifier: 0 Count: 2
Special Token: as no
Identifier: 0 Count: 2
Special Token: than that from
Identifier: 0 

Identifier: 0 Count: 1
Special Token: hand from
Identifier: 0 Count: 1
Special Token: but less time to
Identifier: 0 Count: 1
Special Token: earlier by
Identifier: 0 Count: 1
Special Token: not the end of the
Identifier: 0 Count: 1
Special Token: on hand because of a
Identifier: 0 Count: 1
Special Token: both for the
Identifier: 0 Count: 1
Special Token: at just
Identifier: 0 Count: 2
Special Token: no more a
Identifier: 0 Count: 1
Special Token: before in
Identifier: 0 Count: 2
Special Token: for example the
Identifier: 0 Count: 1
Special Token: as long as that for
Identifier: 0 Count: 1
Special Token: like any other
Identifier: 0 Count: 1
Special Token: no more or less than
Identifier: 0 Count: 1
Special Token: in a while the
Identifier: 0 Count: 1
Special Token: case than
Identifier: 0 Count: 1
Special Token: or one or more of
Identifier: 0 Count: 1
Special Token: and that
Identifier: 0 Count: 1
Special Token: in comparison
Identifier: 0 Count: 1
Special Token: where the only
Identi

Identifier: 0 Count: 1
Special Token: in until
Identifier: 0 Count: 1
Special Token: than an earlier one
Identifier: 0 Count: 1
Special Token: a later
Identifier: 0 Count: 1
Special Token: instead of the previously
Identifier: 0 Count: 1
Special Token: is due for a
Identifier: 0 Count: 1
Special Token: after other
Identifier: 0 Count: 1
Special Token: or separately
Identifier: 0 Count: 2
Special Token: due and
Identifier: 0 Count: 1
Special Token: or less of the
Identifier: 0 Count: 1
Special Token: due by
Identifier: 0 Count: 2
Special Token: and of an
Identifier: 0 Count: 1
Special Token: that then
Identifier: 0 Count: 2
Special Token: so much more
Identifier: 0 Count: 1
Special Token: short the
Identifier: 0 Count: 1
Special Token: about the only
Identifier: 0 Count: 2
Special Token: the addition to
Identifier: 0 Count: 1
Special Token: for or
Identifier: 0 Count: 1
Special Token: with then
Identifier: 0 Count: 1
Special Token: or otherwise depending on
Identifier: 0 Count: 1
Specia

Identifier: 0 Count: 1
Special Token: and from other
Identifier: 0 Count: 1
Special Token: the case that no
Identifier: 0 Count: 1
Special Token: that whatever the
Identifier: 0 Count: 1
Special Token: that where
Identifier: 0 Count: 1
Special Token: and accurately than
Identifier: 0 Count: 1
Special Token: than from
Identifier: 0 Count: 2
Special Token: much from
Identifier: 0 Count: 1
Special Token: now is to
Identifier: 0 Count: 1
Special Token: for the time to
Identifier: 0 Count: 1
Special Token: that even after
Identifier: 0 Count: 1
Special Token: at about one
Identifier: 0 Count: 1
Special Token: in contrast to the
Identifier: 0 Count: 1
Special Token: once that
Identifier: 0 Count: 1
Special Token: else is one to
Identifier: 0 Count: 1
Special Token: not yet a
Identifier: 0 Count: 1
Special Token: or as one
Identifier: 0 Count: 1
Special Token: for now to the
Identifier: 0 Count: 1
Special Token: so even
Identifier: 0 Count: 1
Special Token: as on the
Identifier: 0 Count: 1
Sp

Identifier: 0 Count: 1
Special Token: though only a
Identifier: 0 Count: 1
Special Token: still an
Identifier: 0 Count: 1
Special Token: of the time is
Identifier: 0 Count: 1
Special Token: on after
Identifier: 0 Count: 1
Special Token: not as an
Identifier: 0 Count: 1
Special Token: by of
Identifier: 0 Count: 1
Special Token: then of
Identifier: 0 Count: 2
Special Token: is one to
Identifier: 0 Count: 1
Special Token: or long
Identifier: 0 Count: 1
Special Token: only or
Identifier: 0 Count: 1
Special Token: when both the
Identifier: 0 Count: 1
Special Token: well or
Identifier: 0 Count: 1
Special Token: plus any further
Identifier: 0 Count: 1
Special Token: now than in the
Identifier: 0 Count: 1
Special Token: either on the
Identifier: 0 Count: 1
Special Token: or on a
Identifier: 0 Count: 1
Special Token: by such a
Identifier: 0 Count: 1
Special Token: of other more
Identifier: 0 Count: 1
Special Token: on of the
Identifier: 0 Count: 1
Special Token: of rather
Identifier: 0 Count: 1

In [151]:
df_dict = {}
for token, values in token_counts.items():
    identifiers = {}
    for identifier, count in values.items():
        identifiers[identifier] = int(count)
    df_dict[token] = identifiers

stats_english = pd.DataFrame.from_dict(df_dict, orient='index')

# Fill NaN values with 0
stats_english = stats_english.fillna(0)
stats_english = stats_english.astype(int)

# Create a new column by summing all columns except the first one
stats_english['sum_non_zero_tags'] = stats_english.iloc[:, 1:].sum(axis=1)
# Invert the dictionary to get a mapping from values to keys
inverted_dict = {v: k for k, v in label_map.items()}

# Rename the columns using the inverted dictionary
stats_english.rename(columns=inverted_dict, inplace=True)

stats_english.to_csv('Labels_stats_en(Connector ---> Relation).csv', index=True)

stats_english.head(50)

Unnamed: 0,0,Arg1-as-subst,Arg1-as-denier,Succession,Arg2-as-cond,Arg2-as-detail,Reason+Belief,Reason,Arg2-as-instance,Conjunction,...,Arg2-as-excpt,Disjunction,Arg1-as-negCond,Equivalence,Result+Belief,Arg2-as-negCond,Arg1-as-instance,Arg1-as-manner,Arg1-as-excpt,sum_non_zero_tags
of,17907,13,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
once,82,0,0,47,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,48
to,22311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a,13954,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
more than,409,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
is,4860,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
the,23131,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
with,2857,0,0,0,1,111,8,50,7,41,...,0,0,0,0,0,0,0,0,0,227
even,258,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
that,6590,0,0,1,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,7


# Parsing Deutsch text files

In [153]:
# Auxiliary function to transform the xml file to a dict for easier handling
def xml_to_dict(element):
    result = {}
    if element.attrib:
        result.update({element.tag: element.attrib})
    if element.text and element.text.strip():  # Check if element has non-whitespace text content
        result.update({element.tag: element.text})
    for child in element:
        child_dict = xml_to_dict(child)
        if child.tag in result:
            # If the tag already exists, convert it to a list
            if not isinstance(result[child.tag], list):
                result[child.tag] = [result[child.tag]]
            result[child.tag].append(child_dict)
        else:
            result.update({child.tag: child_dict})
    return result

In [465]:
# loop over all files to get the Deutsch feature vector and targets
# Specify the directory path
folder_path = "./connectives"
# to keep track with statistics
token_count = defaultdict(lambda: defaultdict(int))

deutsch_tokens = []
deutsch_parsed_text = []
# Loop over all files in the folder
for filename in os.listdir(folder_path):
    # Check if the path is a file
    if os.path.isfile(os.path.join(folder_path, filename)):
        file_path = os.path.join(folder_path, filename)
        # Parse the XML file
        tree = ET.parse(file_path)
        root = tree.getroot()
        # Convert XML tree to dictionary
        xml_dict = xml_to_dict(root)
        
        # Extract tokens to a list
        tokens_dict = xml_dict['tokens']["token"]
        tokens_list = [token_dict["token"] for token_dict in tokens_dict]
        deutsch_tokens.append(tokens_list)
        # Get a list of relations
        list_of_relations = xml_dict["relations"]["relation"]
        
        # Initiate the parsed list with all zeroes
        parsed_text = [0] * len(tokens_list)
        # Get the ids of connective tokens
        for entry in list_of_relations:
            # Check if type is "explicit"
            if entry['relation']['type'] == 'explicit':
                # get current entry relation
                pdtb3_sense = entry['relation']['pdtb3_sense'].split('.')[-1]
                
                # Extract the id of the connective token
                connective_id = None
                if 'connective_token' in entry['connective_tokens']:
                    connective_token_entry = entry['connective_tokens']['connective_token']
                    if isinstance(connective_token_entry, list):
                        connective_id = [token['connective_token']['id'] for token in connective_token_entry]
                        for single_id in connective_id:
                            index = int(single_id) - 1
                            parsed_text[index] = label_map[pdtb3_sense]
                            token_count[pdtb3_sense][tokens_list[index].lower()] += 1
                                

                    else:
                        connective_id = entry['connective_tokens']['connective_token']['connective_token']['id']
                        index = int(connective_id) - 1
                        parsed_text[index] = label_map[pdtb3_sense]
                        token_count[pdtb3_sense][tokens_list[index].lower()] += 1
        
        deutsch_parsed_text.append(parsed_text)

In [467]:
for list_of_tokens in deutsch_tokens:
    for i, token in enumerate(list_of_tokens):
        if token == ".":
            print(deutsch_parsed_text[0][i])
            print(token)
    break

0
.
0
.
0
.
0
.
0
.
0
.
0
.
0
.
0
.
0
.
0
.


In [483]:
def split_lists_on_dot(tokens_list, parsed_text_list):
    # Initialize the resulting lists
    new_tokens_list = []
    new_parsed_text_list = []
    
    # Iterate through both lists simultaneously
    for tokens, parsed in zip(tokens_list, parsed_text_list):
        # Temporary lists to hold the current sublist
        temp_tokens = []
        temp_parsed = []
        
        for token, parsed_token in zip(tokens, parsed):
            if token == ".":
                # Append the current sublist and reset temporary lists
                new_tokens_list.append(temp_tokens)
                new_parsed_text_list.append(temp_parsed)
                
                temp_tokens = []
                temp_parsed = []
            else:
                # Continue building the current sublist
                temp_tokens.append(token)
                temp_parsed.append(parsed_token)
        
        # If there's any remaining items in temp lists, add them as well
        if temp_tokens:
            new_tokens_list.append(temp_tokens)
            new_parsed_text_list.append(temp_parsed)
    
    return new_tokens_list, new_parsed_text_list

splitted_deutsch_tokens, splitted_parsed_deutsch = split_lists_on_dot(deutsch_tokens, deutsch_parsed_text)

print(splitted_deutsch_tokens[4])
print(splitted_parsed_deutsch[4])

['Und', 'da', 'sieht', 'es', 'immer', 'schlechter', 'aus']
[1, 0, 0, 0, 0, 0, 0]


In [485]:
convert_to_jsonl(splitted_deutsch_tokens, splitted_parsed_deutsch, 'splitted_deutsch_data.jsonl')

## Split Deutsche dataset into training and testing and convert to jsonl format.

In [197]:
def split_data(tokens_list, ids_list, train_ratio=0.6):
    # Ensure the lengths of tokens_list and ids_list are the same
    if len(tokens_list) != len(ids_list):
        raise ValueError("The lengths of tokens_list and ids_list must be the same.")
    # Combine tokens and ids into a single list
    combined_data = list(zip(tokens_list, ids_list))
    # Shuffle the combined data to ensure randomness
    random.shuffle(combined_data)
    
    # Determine the split index
    split_index = int(len(combined_data) * train_ratio)
    
    # Split the data into training and testing sets
    train_data = combined_data[:split_index]
    test_data = combined_data[split_index:]
    # Unzip the training and testing data
    train_tokens, train_ids = zip(*train_data)
    test_tokens, test_ids = zip(*test_data)
    
    return list(train_tokens), list(train_ids), list(test_tokens), list(test_ids)

In [198]:
deutsch_train_tokens, deutsch_train_ids, deutsch_test_tokens, deutsch_test_ids = split_data(deutsch_tokens, deutsch_parsed_text)

In [199]:
# Convert the training data to JSONL
convert_to_jsonl(deutsch_train_tokens, deutsch_train_ids, 'train_deutsch_data.jsonl')

# Convert the testing data to JSONL
convert_to_jsonl(deutsch_test_tokens, deutsch_test_ids, 'test_deutsch_data.jsonl')

In [223]:
#same for french
french_train_tokens, french_train_ids, french_test_tokens, french_test_ids = split_data(all_processed_tokens_list_french, all_conn_list_french)

In [226]:
len(french_test_ids)

959

In [227]:
# Convert the training data to JSONL
convert_to_jsonl(french_train_tokens, french_train_ids, 'train_french_data.jsonl')

# Convert the testing data to JSONL
convert_to_jsonl(french_test_tokens, french_test_ids, 'test_french_data.jsonl')

# Deutsch Statistics
## Map of Relation --> Connector

In [200]:
# Prepare data for DataFrame
data = {'Sense': [], 'Connector Token': [], 'Count': []}
for sense, token_dict in token_count.items():
    for token, count in token_dict.items():
        data['Sense'].append(sense)
        data['Connector Token'].append(token)
        data['Count'].append(count)

# Create a DataFrame from the dictionary
deutsch_stat_df = pd.DataFrame(data)
with open('Deutsch(relation --> connector).txt', 'w') as f:
    f.write(deutsch_stat_df.to_string(index=False))
deutsch_stat_df

Unnamed: 0,Sense,Connector Token,Count
0,Conjunction,und,232
1,Conjunction,auch,30
2,Conjunction,aber,2
3,Conjunction,es,1
4,Conjunction,sondern,7
...,...,...,...
181,Equivalence,worten,1
182,Equivalence,gleichfalls,1
183,Arg1-as-manner,so,1
184,Arg1-as-manner,somit,1


In [158]:
# Initialize a defaultdict to store counts of identifiers for each special token
token_identifier_counts = defaultdict(lambda: defaultdict(int))

# Get unique connector words
unique_connectors = deutsch_stat_df["Connector Token"].unique()

# Iterate over tokens and parsed text
for tokens, parsed_text in zip(deutsch_tokens, deutsch_parsed_text):
    for i, token in enumerate(tokens):
        if token in unique_connectors:
            identifier = parsed_text[i]
            token_identifier_counts[token][identifier] += 1

# Print the counts for each special token and its corresponding identifiers
for token, identifier_counts in token_identifier_counts.items():
    print(f"Special Token: {token}")
    for identifier, count in identifier_counts.items():
        print(f"Identifier: {identifier}, Count: {count}")
token_identifier_counts

Special Token: und
Identifier: 1, Count: 154
Identifier: 0, Count: 363
Identifier: 8, Count: 7
Identifier: 3, Count: 3
Identifier: 16, Count: 1
Special Token: dessen
Identifier: 0, Count: 6
Identifier: 13, Count: 1
Identifier: 12, Count: 1
Special Token: mit
Identifier: 0, Count: 222
Special Token: zu
Identifier: 0, Count: 363
Identifier: 15, Count: 6
Special Token: da
Identifier: 0, Count: 30
Identifier: 7, Count: 3
Special Token: es
Identifier: 0, Count: 232
Identifier: 1, Count: 1
Special Token: inzwischen
Identifier: 0, Count: 9
Identifier: 11, Count: 1
Special Token: auch
Identifier: 0, Count: 180
Identifier: 1, Count: 21
Identifier: 6, Count: 5
Identifier: 2, Count: 1
Identifier: 29, Count: 1
Special Token: als
Identifier: 0, Count: 137
Identifier: 11, Count: 9
Identifier: 16, Count: 1
Special Token: noch
Identifier: 0, Count: 120
Identifier: 1, Count: 2
Special Token: aber
Identifier: 2, Count: 59
Identifier: 1, Count: 1
Identifier: 3, Count: 7
Identifier: 0, Count: 9
Identifier

defaultdict(<function __main__.<lambda>()>,
            {'und': defaultdict(int, {1: 154, 0: 363, 8: 7, 3: 3, 16: 1}),
             'dessen': defaultdict(int, {0: 6, 13: 1, 12: 1}),
             'mit': defaultdict(int, {0: 222}),
             'zu': defaultdict(int, {0: 363, 15: 6}),
             'da': defaultdict(int, {0: 30, 7: 3}),
             'es': defaultdict(int, {0: 232, 1: 1}),
             'inzwischen': defaultdict(int, {0: 9, 11: 1}),
             'auch': defaultdict(int, {0: 180, 1: 21, 6: 5, 2: 1, 29: 1}),
             'als': defaultdict(int, {0: 137, 11: 9, 16: 1}),
             'noch': defaultdict(int, {0: 120, 1: 2}),
             'aber': defaultdict(int, {2: 59, 1: 1, 3: 7, 0: 9, 6: 2}),
             'auf': defaultdict(int, {0: 179}),
             'zumal': defaultdict(int, {7: 3, 16: 1}),
             'um': defaultdict(int, {0: 70, 15: 16, 26: 1}),
             'deshalb': defaultdict(int, {0: 2, 12: 8}),
             'wenn': defaultdict(int, {10: 49, 11: 4, 0: 2, 6: 11,

In [159]:
# Create DataFrame
df_dict = {}
for token, values in token_identifier_counts.items():
    identifiers = {}
    for identifier, count in values.items():
        identifiers[identifier] = int(count)
    df_dict[token] = identifiers
stats_df = pd.DataFrame.from_dict(df_dict, orient='index')

# Fill NaN values with 0
stats_df = stats_df.fillna(0)
stats_df = stats_df.astype(int) 

# Export to plain text file
with open('Labels_stats(Connector ---> Relation).txt', 'w') as f:
    f.write(stats_df.to_string(index=True))
    
stats_df

Unnamed: 0,1,0,8,3,16,13,12,15,7,11,...,10,17,14,31,9,30,5,22,18,25
und,154,363,7,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
es,1,232,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
auch,21,180,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
noch,2,120,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aber,1,9,0,7,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
z.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
nachdem,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
desto,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
andernfalls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Parsing Czech text files

In [113]:
def clean_dataframe(df):
    # Remove the columns with all nan values
    positionings_cz_cleaned = df.dropna(axis=1, how='all')
    # only select explicit connectors 
    positionings_cz_cleaned = positionings_cz_cleaned[positionings_cz_cleaned.iloc[:, 0] == "Explicit"].reset_index(drop=True)
    # only select and rename the 3 columns we are concerned with {1:offset, 7:connector word, 8: relation Label}
    positionings_cz_cleaned = positionings_cz_cleaned[[1,7,8]]
    positionings_cz_cleaned.rename(columns={1: "DcOffset", 7: "DcRaw", 8:"Label1"}, inplace=True)
    # Drop rows where DcOffset or DcRaw is not present
    positionings_cz_cleaned = positionings_cz_cleaned[(positionings_cz_cleaned['DcOffset'] != '') & (positionings_cz_cleaned['DcRaw'] != '')]
    return positionings_cz_cleaned

In [114]:
df = pd.read_csv("./Czech_data_folder/data/column/gold/03/mf920922_016",delimiter='|', header=None, error_bad_lines=False)
positionings_cz_cleaned = clean_dataframe(df)
positionings_cz_cleaned



  df = pd.read_csv("./Czech_data_folder/data/column/gold/03/mf920922_016",delimiter='|', header=None, error_bad_lines=False)
b'Skipping line 2: expected 44 fields, saw 46\n'


Unnamed: 0,DcOffset,DcRaw,Label1
0,342..348,přitom,Expansion.Level-of-detail.Arg2-as-detail
1,1172..1177,proto,Contingency.Cause.Result
2,1076..1080,však,Comparison.Concession.Arg2-as-denier
3,1398..1399,a,Expansion.Conjunction
4,1671..1676,spíše,Expansion.Substitution.Arg2-as-subst


In [148]:
# Iterate over all CSV files in every folder and parse the corresponding file in the raw text
directory = './Czech_data_folder/data/column/gold'
for root, dirs, files in os.walk(directory):
    # Iterate over all files in the current directory
    empty_files_count = 0
    for file in files:
        if file == '.DS_Store':
            continue
        # Join the current directory path with the file name to get the full file path
        file_path = os.path.join(root, file)
        print(file_path)
        #positionings_cz = pd.read_csv(file_path, delimiter='|', header=None, on_bad_lines='skip')
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            if text == "":
                empty_files_count += 1
                continue
            # Split the string into rows based on newline character
            rows = text.strip().split('\n')
            # Split each row into columns based on the delimiter '|'
            data = [row.split('|') for row in rows]
            positionings_cz = pd.DataFrame(data)
        positionings_cz_cleaned = clean_dataframe(positionings_cz) 
        
        # same processing like we did for the English
        result_cz = splitting_multiple_offsets(positionings_cz_cleaned, is_cz= True)
        #Apply the function to each row and concatenate the results
        new_rows = []
        for _, row in result_cz.iterrows():
            new_rows.extend(break_down_row(row, is_cz = True))
            
        # Create a new DataFrame with the modified rows
        splitted_rows_cz = pd.DataFrame(new_rows)
        # Reset index
        splitted_rows_cz.reset_index(drop=True, inplace=True)
        positionings_cz_cleaned = replace_splitted_rows(positionings_cz_cleaned, splitted_rows_cz,is_cz=True)
        
        # Get the corresponding raw text file in the 
        raw_file_path = file_path.replace('gold', 'raw')
        print(raw_file_path)
        with open(raw_file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            # get the offsets- labels list of tuples
            connector_offsets_label_cz = list(zip(positionings_cz_cleaned['DcOffset'], positionings_cz_cleaned['Label1'].str.split('.').str[-1]))
            print(connector_offsets_label_cz)
            print(positionings_cz_cleaned)
            parsed,tokens = parse_text(text, connector_offsets_label_cz)
            print(parsed)
            print(tokens)
            for i, num in enumerate(parsed):
                if num != 0:
                    print(tokens[i])
            print(positionings_cz_cleaned)

./Czech_data_folder/data/column/gold/03/ln95045_002
./Czech_data_folder/data/column/raw/03/ln95045_002
[('190..191', 'Conjunction'), ('488..489', 'Conjunction'), ('521..527', 'Contrast'), ('610..611', 'Conjunction')]
   DcOffset   DcRaw                 Label1
0  190..191       a  Expansion.Conjunction
1  488..489       a  Expansion.Conjunction
2  521..527  naopak    Comparison.Contrast
3  610..611       a  Expansion.Conjunction
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Saldo', 'zahraničního', 'obchodu', 'se', 'loni', 'propadlo', 'do', 'pasívaGRAFPraha', 'Obrat', 'zahraničního', 'obchodu', 'se', 'loni', 'zvýši

      DcOffset    DcRaw                                    Label1
0     753..758    totiž  Expansion.Level-of-detail.Arg1-as-detail
1     809..810        :  Expansion.Level-of-detail.Arg2-as-detail
2   1275..1276        a                     Expansion.Conjunction
3   1532..1536     však                       Comparison.Contrast
4   1441..1444      kdy                  Contingency.Cause.Reason
5   1983..1988    totiž                  Contingency.Cause.Reason
6   2616..2620     také                     Expansion.Conjunction
7   2788..2795  protože                  Contingency.Cause.Reason
8   3144..3148     tedy                  Contingency.Cause.Reason
9   3262..3269  protože                  Contingency.Cause.Reason
10  3306..3309      -li        Contingency.Condition.Arg2-as-cond
11  3370..3374     však                       Comparison.Contrast
12  3416..3417        a                     Expansion.Conjunction
13  3562..3566     tedy  Expansion.Level-of-detail.Arg1-as-detail
14    136.

./Czech_data_folder/data/column/raw/03/mf930709_118
[('125..130', 'Result'), ('400..404', 'Result'), ('440..441', 'Conjunction'), ('711..712', 'Conjunction'), ('775..778', 'Arg2-as-cond'), ('886..887', 'Conjunction'), ('888..893', 'Condition+SpeechAct'), ('1174..1175', 'Conjunction'), ('1485..1488', 'Arg2-as-cond'), ('1786..1789', 'Conjunction'), ('2040..2041', 'Conjunction'), ('2042..2049', 'Reason'), ('2321..2322', 'Conjunction'), ('2633..2634', 'Conjunction'), ('2727..2731', 'Reason'), ('3054..3057', 'Arg2-as-goal'), ('3085..3086', 'Conjunction'), ('3198..3199', 'Conjunction'), ('3379..3380', 'Conjunction'), ('3547..3552', 'Result'), ('3578..3579', 'Conjunction'), ('3604..3607', 'Arg2-as-goal'), ('3931..3936', 'Result'), ('3822..3826', 'Equivalence'), ('2161..2167', 'Reason'), ('2168..2170', 'Reason'), ('3758..3760', 'Arg2-as-cond'), ('3761..3767', 'Arg2-as-cond'), ('3768..3770', 'Arg2-as-cond'), ('4095..4096', 'Arg2-as-denier'), ('4097..4099', 'Arg2-as-denier')]
      DcOffset    D

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

24  2528..2531      tak      Comparison.Concession.Arg2-as-denier
hey
Profit [('193..194', 'Reason+Belief'), ('222..223', 'Conjunction'), ('258..259', 'Conjunction'), ('341..342', 'Arg2-as-detail'), ('403..407', 'Contrast'), ('491..492', 'Conjunction'), ('559..563', 'Arg2-as-denier'), ('764..765', 'Arg2-as-detail'), ('862..867', 'Result'), ('1152..1157', 'Result'), ('958..963', 'Arg2-as-cond'), ('1092..1099', 'Reason'), ('1464..1465', 'Conjunction'), ('1959..1960', 'Conjunction'), ('3000..3001', 'Conjunction'), ('4037..4038', 'Conjunction'), ('4227..4234', 'Reason'), ('885..889', 'Arg2-as-denier'), ('905..908', 'Arg2-as-denier'), ('3176..3178', 'Arg2-as-subst'), ('3233..3239', 'Arg2-as-subst'), ('683..686', 'Result'), ('687..692', 'Result'), ('2526..2527', 'Arg2-as-denier'), ('2528..2531', 'Arg2-as-denier')]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,

./Czech_data_folder/data/column/raw/03/ln95046_067
[('190..193', 'Arg2-as-denier'), ('500..505', 'Reason'), ('666..667', 'Conjunction'), ('931..936', 'Contrast'), ('994..997', 'Arg1-as-denier'), ('1115..1119', 'Reason'), ('1120..1122', 'Reason')]
     DcOffset  DcRaw                                Label1
0    190..193    ale  Comparison.Concession.Arg2-as-denier
1    500..505  totiž              Contingency.Cause.Reason
2    666..667      a                 Expansion.Conjunction
3    931..936  zatím                   Comparison.Contrast
4    994..997    byť  Comparison.Concession.Arg1-as-denier
5  1115..1119   tím,              Contingency.Cause.Reason
6  1120..1122     že              Contingency.Cause.Reason
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0

./Czech_data_folder/data/column/raw/03/ln94211_37
[('45..52', 'Arg1-as-denier'), ('162..169', 'Contrast'), ('615..620', 'Arg2-as-denier'), ('1109..1116', 'Synchronous'), ('1196..1200', 'Arg2-as-denier'), ('1244..1248', 'Result'), ('1363..1368', 'Arg2-as-cond'), ('1436..1441', 'Reason'), ('1600..1605', 'Reason')]
     DcOffset    DcRaw                                Label1
0      45..52  ačkoliv  Comparison.Concession.Arg1-as-denier
1    162..169  zatímco                   Comparison.Contrast
2    615..620    ovšem  Comparison.Concession.Arg2-as-denier
3  1109..1116  zároveň                  Temporal.Synchronous
4  1196..1200     však  Comparison.Concession.Arg2-as-denier
5  1244..1248     tedy              Contingency.Cause.Result
6  1363..1368    pokud    Contingency.Condition.Arg2-as-cond
7  1436..1441    neboť              Contingency.Cause.Reason
8  1600..1605    totiž              Contingency.Cause.Reason
[0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Stranickému', 'tisku', 'spíše', 'NEPředstavitelé', 'politických', 'stran', 'se', 'staví', 'většinou', 'proti', 'vydávání', 'vlastních', 'novinPraha', 'mrk', 'em', 'ODA', 'podle', 'svého', 'místopře

./Czech_data_folder/data/column/raw/03/ln94203_26
[('43..51', 'Arg1-as-denier'), ('525..529', 'Arg2-as-denier')]
   DcOffset     DcRaw                                Label1
0    43..51  přestože  Comparison.Concession.Arg1-as-denier
1  525..529      však  Comparison.Concession.Arg2-as-denier
[0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Dražší', 'autobusové', 'jízdné', 'pro', 'žákyPraha', 'Přestože', 'se', 'ceny', 'žákovského', 'autobusového', 'jízdného', 'a', 'to', 'jak', 'u', 'pravidelného', 'dojíždění', 'týdenní', 'a', 'měsíční', 'žákovské', 'jízdenky', 'tak', 'pro', 'jednotlivé', 'jízdy', 'zvyšují', 'od', '1', 'září', 'o', 'sto', 'procent', 'zaplatí', 'žáci', 'jen', 'čtvrtinu', 'z', 'celkových', 'nák

8  1556..1557        a                 Expansion.Conjunction
./Czech_data_folder/data/column/gold/03/ln95048_104
./Czech_data_folder/data/column/raw/03/ln95048_104
[('617..620', 'Arg2-as-goal'), ('709..710', 'Conjunction'), ('1234..1241', 'Equivalence'), ('1564..1565', 'Arg2-as-detail'), ('1651..1654', 'Conjunction'), ('1783..1786', 'Arg2-as-denier'), ('1817..1821', 'Disjunction'), ('1951..1952', 'Conjunction'), ('2093..2096', 'Precedence'), ('2284..2291', 'Reason'), ('2487..2488', 'Conjunction'), ('2644..2645', 'Conjunction'), ('2760..2767', 'Conjunction'), ('2976..2985', 'Disjunction'), ('3302..3308', 'Arg1-as-denier'), ('3748..3749', 'Conjunction'), ('3793..3798', 'Arg2-as-cond'), ('3978..3981', 'Arg2-as-denier'), ('4046..4053', 'Reason'), ('4205..4206', 'Conjunction'), ('4457..4458', 'Conjunction'), ('4525..4531', 'Arg1-as-denier'), ('4994..4998', 'Disjunction'), ('5111..5112', 'Conjunction'), ('5313..5321', 'Arg2-as-cond'), ('5367..5368', 'Conjunction'), ('5430..5431', 'Conjunctio

3  1158..1159      a                 Expansion.Conjunction
./Czech_data_folder/data/column/gold/03/ln95045_112
./Czech_data_folder/data/column/raw/03/ln95045_112
[('337..338', 'Conjunction'), ('478..479', 'Conjunction'), ('633..640', 'Contrast'), ('748..751', 'Arg2-as-denier')]
   DcOffset    DcRaw                                Label1
0  337..338        a                 Expansion.Conjunction
1  478..479        a                 Expansion.Conjunction
2  633..640  zatímco                   Comparison.Contrast
3  748..751      ale  Comparison.Concession.Arg2-as-denier
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,

[('488..489', 'Conjunction'), ('577..581', 'Conjunction')]
   DcOffset DcRaw                 Label1
0  488..489     a  Expansion.Conjunction
1  577..581  dále  Expansion.Conjunction
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Přijde', 'nová', 'revoluční', 'vlnaFIDEL', 'CASTRO', 'Přijde', 'nová', 'revoluční', 'vlnaHavana', 'Příchod', 'nové', 'vlny', 'marxistických', 'revolucí', 've', 'světě', 'předpověděl', 'kubánský', 'nejvyšší', 'představitel', 'Fidel', 'Castro', 'v', 'rozhovoru', 'který', 'poskytl', 'svému', 'příteli', 'bývalému', 'nikaragujskému', 'ministru', 'vnitra', 'Tomási', 'Borgemu', 'Castro', 'zdůraznil', 'své', 'přesvědčení', 'že', 'nadvláda', 'reakčních', 'sil', 'v', 'tomto', 'svět

21  1218..1220       to                 Expansion.Conjunction
./Czech_data_folder/data/column/gold/03/ln94200_49
./Czech_data_folder/data/column/raw/03/ln94200_49
[('189..190', 'Conjunction'), ('659..660', 'Conjunction'), ('1026..1033', 'Contrast'), ('1365..1372', 'Reason'), ('1452..1459', 'Succession'), ('1515..1516', 'Arg2-as-detail'), ('1675..1680', 'Conjunction'), ('1924..1927', 'Conjunction'), ('2066..2073', 'Arg1-as-denier'), ('2186..2187', 'Conjunction'), ('2281..2282', 'Conjunction'), ('2366..2367', 'Conjunction'), ('2387..2394', 'Conjunction'), ('2428..2432', 'Arg2-as-denier'), ('2947..2950', 'Result'), ('2900..2903', 'Conjunction'), ('3277..3280', 'Result'), ('628..635', 'Arg2-as-denier'), ('642..647', 'Arg2-as-denier'), ('648..653', 'Arg2-as-denier'), ('3411..3415', 'Arg2-as-cond'), ('3427..3430', 'Arg2-as-cond'), ('3431..3435', 'Arg2-as-cond'), ('666..673', 'Arg1-as-denier'), ('674..676', 'Arg1-as-denier'), ('886..887', 'Result'), ('888..891', 'Result'), ('917..918', 'Conju

./Czech_data_folder/data/column/raw/03/ln94204_70
[('411..416', 'Reason'), ('479..484', 'Result')]
   DcOffset  DcRaw                    Label1
0  411..416  neboť  Contingency.Cause.Reason
1  479..484  takže  Contingency.Cause.Result
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Řidič', 'kradl', 'u', 'EiffelovkyÚstí', 'nad', 'Orlicí', 'jop', 'Od', 'včerejška', 'je', 'stíhán', 'pro', 'krádež', 'osmadvacetiletý', 'R', 'H', 'z', 'Hodonína', 'řidič', 'autobusu', 'jedné', 'cestovní', 'kanceláře', 'z', 'Vysokého', 'Mýta', '25', 'srpna', 'v', 'bezprostřední', 'blízkosti', 'Eiffelovy', 'věže', 'v', 'době', 'kdy', 'turisté', 'obdivovali', 'krásy', 'Paříže', 'prohledal', 'jejich', 'osobní', 'věci', 'v', 'autobuse', 'v', '

./Czech_data_folder/data/column/raw/03/ln95046_107
[('448..449', 'Conjunction'), ('509..513', 'Conjunction'), ('545..553', 'Contrast'), ('961..966', 'Arg1-as-excpt'), ('1067..1075', 'Conjunction'), ('1434..1438', 'Arg2-as-denier'), ('1488..1489', 'Conjunction'), ('1575..1579', 'Arg2-as-detail'), ('2072..2073', 'Conjunction'), ('2234..2239', 'Arg2-as-denier'), ('344..348', 'Arg2-as-denier'), ('406..410', 'Arg2-as-denier'), ('728..731', 'Arg2-as-cond'), ('797..800', 'Arg2-as-cond'), ('1580..1585', 'Conjunction'), ('1586..1588', 'Conjunction'), ('1646..1649', 'Conjunction'), ('1650..1657', 'Conjunction'), ('2253..2259', 'Arg2-as-subst'), ('2299..2302', 'Arg2-as-subst'), ('1809..1810', 'Result'), ('1811..1814', 'Result'), ('2124..2125', 'Arg1-as-detail'), ('2126..2129', 'Arg1-as-detail')]
      DcOffset     DcRaw                                    Label1
0     448..449         a                     Expansion.Conjunction
1     509..513      také                     Expansion.Conjunction
2  

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

./Czech_data_folder/data/column/raw/03/ln94207_94
[('614..617', 'Precedence'), ('548..551', 'Arg2-as-denier'), ('1805..1810', 'Result'), ('668..671', 'Contrast'), ('702..703', 'Arg2-as-detail'), ('805..809', 'Arg2-as-denier'), ('823..824', 'Arg2-as-detail'), ('889..890', 'Conjunction'), ('938..939', 'Conjunction'), ('1072..1076', 'Synchronous'), ('1393..1396', 'Arg2-as-denier'), ('1596..1600', 'Conjunction'), ('1698..1699', 'Conjunction'), ('1895..1900', 'Arg2-as-excpt'), ('1973..1974', 'Conjunction'), ('2071..2072', 'Arg2-as-detail'), ('2154..2155', 'Conjunction'), ('2301..2307', 'Conjunction'), ('2326..2328', 'Reason'), ('1472..1475', 'Conjunction'), ('1487..1491', 'Conjunction'), ('2250..2251', 'Conjunction'), ('2279..2284', 'Conjunction'), ('909..916', 'Arg2-as-subst'), ('917..922', 'Arg2-as-subst'), ('1230..1231', 'Arg1-as-denier'), ('1232..1236', 'Arg1-as-denier'), ('2364..2369', 'Arg2-as-denier'), ('2370..2371', 'Arg2-as-denier'), ('2372..2377', 'Arg2-as-denier')]
      DcOffset

./Czech_data_folder/data/column/raw/03/cmpr9415_059
[('362..369', 'Reason'), ('1306..1307', 'Conjunction'), ('1628..1633', 'Result'), ('2831..2838', 'Reason')]
     DcOffset    DcRaw                    Label1
0    362..369  protože  Contingency.Cause.Reason
1  1306..1307        a     Expansion.Conjunction
2  1628..1633    proto  Contingency.Cause.Result
3  2831..2838  protože  Contingency.Cause.Reason
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

./Czech_data_folder/data/column/raw/03/ln94205_76
[('141..142', 'Conjunction'), ('203..211', 'Conjunction'), ('856..859', 'Arg2-as-goal'), ('1302..1309', 'Precedence'), ('1441..1442', 'Conjunction'), ('1576..1577', 'Conjunction'), ('1593..1598', 'Reason'), ('1706..1710', 'Conjunction'), ('1989..1990', 'Conjunction'), ('1630..1632', 'Arg2-as-subst'), ('1663..1666', 'Arg2-as-subst'), ('1667..1670', 'Arg2-as-subst'), ('409..410', 'Precedence'), ('411..414', 'Precedence'), ('1263..1269', 'Similarity'), ('1270..1273', 'Similarity')]
      DcOffset     DcRaw                                Label1
0     141..142         a                 Expansion.Conjunction
1     203..211  současně                 Expansion.Conjunction
2     856..859       aby      Contingency.Purpose.Arg2-as-goal
3   1302..1309   později      Temporal.Asynchronous.Precedence
4   1441..1442         a                 Expansion.Conjunction
5   1576..1577         a                 Expansion.Conjunction
6   1593..1598     neboť 

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 18, 0]
['KrátceSeriálem', 'dvaceti', 'nahrávek', 'operní', 'hudby', 'na', 'kompaktních', 'deskách', 'a', 'na', 'kazetách', 'určených', 'pro', 'americký', 'trh', 'chce', 'Luciano', 'Pavarotti', 'spolu', 's', 'firmou', 'London', 'Records', 'zpřístupnit', 'operní', 'umění', 'ne

./Czech_data_folder/data/column/raw/03/ln94206_67
[('528..536', 'Condition+SpeechAct'), ('742..745', 'Arg2-as-denier'), ('869..870', 'Conjunction'), ('922..928', 'Conjunction'), ('1116..1119', 'Result'), ('1177..1178', 'Conjunction'), ('1297..1298', 'Conjunction'), ('1530..1531', 'Conjunction'), ('1585..1589', 'Conjunction'), ('1577..1581', 'Condition+SpeechAct'), ('1911..1915', 'Result'), ('2387..2388', 'Conjunction'), ('2534..2535', 'Conjunction'), ('2566..2570', 'Conjunction'), ('2853..2856', 'Arg2-as-denier'), ('2974..2975', 'Conjunction'), ('3097..3098', 'Conjunction'), ('3624..3631', 'Reason'), ('2677..2678', 'Result'), ('2694..2699', 'Result'), ('1789..1790', 'Arg2-as-detail'), ('1791..1797', 'Arg2-as-detail'), ('3594..3596', 'Arg2-as-denier'), ('3597..3602', 'Arg2-as-denier'), ('3603..3609', 'Arg2-as-denier'), ('3902..3903', 'Arg1-as-denier'), ('3904..3908', 'Arg1-as-denier')]
      DcOffset     DcRaw                                    Label1
0     528..536  jestliže           

[('295..296', 'Conjunction')]
   DcOffset DcRaw                 Label1
0  295..296     a  Expansion.Conjunction
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Z', 'fotbalových', 'kruhůJiří', 'Lerch', 'z', 'pražské', 'Slavie', 'se', 'definitivně', 'rozhodl', 'neodcházet', 'do', 'Chebu', 'ale', 'zůstat', 'v', 'pražském', 'klubu', 'Utkání', '3', 'kola', 'II', 'ligy', 'Uherské', 'Hradiště', 'Bohumín', 'se', 'koná', 'už', 'v', 'sobotu', 'od', '16', '30', 'h', 'Miroslav', 'Sovič', 'bývalý', 'juniorský', 'reprezentant', 'z', 'Nitry', 'přichází', 'na', 'hostování', 'do', 'Jablonce', 'a', 'už', 'v', 'nedělním', 'ligovém', 'utkání', 'bude', 'připraven', 'nastoupit', 'ber', 'hk']
a
   DcOffset DcRaw                 Label1
0  295..296     a  Expansion.Conjunction
./Czech_data_folder/data/column/gold/03/ln94209_43
./Czech_data_folder/data/column/gold/03/ln94211_46
./Czec

29  1867..1870       aby                Contingency.Purpose.Arg2-as-goal
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[('435..436', 'Conjunction'), ('491..494', 'Result'), ('791..792', 'Conjunction'), ('929..930', 'Conjunction'), ('586..587', 'Conjunction'), ('588..595', 'Conjunction')]
   DcOffset    DcRaw                    Label1
0  435..436        a     Expansion.Conjunction
1  491..494      tak  Contingency.Cause.Result
2  791..792        a     Expansion.Conjunction
3  929..930        a     Expansion.Conjunction
4  586..587        a     Expansion.Conjunction
5  588..595  zároveň     Expansion.Conjunction
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
['Trachta', 'nehlídá', 'hráč

./Czech_data_folder/data/column/raw/03/ln94205_94
[('350..351', 'Conjunction'), ('800..801', 'Conjunction'), ('989..994', 'Arg2-as-denier'), ('1305..1310', 'Reason'), ('1358..1359', 'Conjunction'), ('1599..1602', 'Arg2-as-denier'), ('1615..1616', 'Conjunction'), ('1831..1832', 'Conjunction'), ('1889..1892', 'Arg2-as-goal'), ('1918..1922', 'Disjunction'), ('2101..2102', 'Conjunction'), ('2192..2193', 'Conjunction'), ('2275..2276', 'Conjunction'), ('2326..2327', 'Conjunction'), ('2564..2568', 'Disjunction'), ('2777..2782', 'Reason+Belief'), ('3048..3049', 'Conjunction'), ('3369..3370', 'Conjunction'), ('3634..3636', 'Disjunction'), ('4175..4176', 'Conjunction'), ('4315..4316', 'Conjunction'), ('4345..4346', 'Conjunction'), ('4404..4405', 'Conjunction'), ('4461..4466', 'Reason'), ('4512..4513', 'Conjunction'), ('4590..4591', 'Conjunction'), ('4901..4902', 'Conjunction'), ('5017..5018', 'Conjunction'), ('5154..5155', 'Conjunction'), ('5167..5170', 'Arg2-as-denier'), ('5269..5270', 'Conjunc

./Czech_data_folder/data/column/raw/03/ln95048_016
[('630..638', 'Arg1-as-denier'), ('941..945', 'Arg2-as-denier'), ('1368..1369', 'Conjunction'), ('2215..2220', 'Conjunction'), ('2210..2214', 'Reason+Belief'), ('337..338', 'Reason'), ('339..341', 'Reason'), ('348..353', 'Reason'), ('348..354', 'Reason'), ('355..357', 'Reason'), ('1130..1131', 'Arg2-as-denier'), ('1132..1138', 'Arg2-as-denier')]
      DcOffset     DcRaw                                  Label1
0     630..638  přestože    Comparison.Concession.Arg1-as-denier
1     941..945      však    Comparison.Concession.Arg2-as-denier
2   1368..1369         a                   Expansion.Conjunction
3   2215..2220     navíc                   Expansion.Conjunction
4   2210..2214      když  Contingency.Cause+Belief.Reason+Belief
5     337..338         a                Contingency.Cause.Reason
6     339..341        to                Contingency.Cause.Reason
7     348..353     proto                Contingency.Cause.Reason
8     348..354  

./Czech_data_folder/data/column/raw/03/ln94209_52
[('199..204', 'Arg2-as-cond'), ('583..589', 'Arg2-as-denier'), ('765..772', 'Contrast'), ('1321..1325', 'Disjunction')]
     DcOffset    DcRaw                                Label1
0    199..204    kdyby    Contingency.Condition.Arg2-as-cond
1    583..589   přitom  Comparison.Concession.Arg2-as-denier
2    765..772  zatímco                   Comparison.Contrast
3  1321..1325     nebo                 Expansion.Disjunction
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[('121..128', 'Synchronous'), ('941..948', 'Precedence'), ('1059..1064', 'Conjunction'), ('2004..2005', 'Arg2-as-detail'), ('1972..1973', 'Conjunction'), ('2279..2280', 'Conjunction'), ('3041..3042', 'Conjunction'), ('3138..3141', 'Arg1-as-denier'), ('2321..2325', 'Arg2-as-denier'), ('2512..2516', 'Arg2-as-denier'), ('2734..2738', 'Arg2-as-denier'), ('2746..2749', 'Arg2-as-denier'), ('3302..3303', 'Arg2-as-denier'), ('3304..3307', 'Arg2-as-denier'), ('3308..3312', 'Arg2-as-denier')]
      DcOffset    DcRaw                                    Label1
0     121..128  zatímco                      Temporal.Synchronous
1     941..948  později          Temporal.Asynchronous.Precedence
2   1059..1064    navíc                     Expansion.Conjunction
3   2004..2005        :  Expansion.Level-of-detail.Arg2-as-detail
4   1972..1973        a                     Expansion.Conjunction
5   2279..2280        a                     Expansion.Conjunction
6   3041..3042        a                     Expans

./Czech_data_folder/data/column/raw/03/ln95049_019
[('513..520', 'Synchronous'), ('536..537', 'Conjunction'), ('607..609', 'Arg2-as-denier'), ('610..615', 'Arg2-as-denier'), ('616..622', 'Arg2-as-denier')]
   DcOffset    DcRaw                                Label1
0  513..520  zároveň                  Temporal.Synchronous
1  536..537        a                 Expansion.Conjunction
2  607..609       na  Comparison.Concession.Arg2-as-denier
3  610..615    druhé  Comparison.Concession.Arg2-as-denier
4  616..622   straně  Comparison.Concession.Arg2-as-denier
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Uhde', 'pro', 'dohodu', 'o', 'katedrálePraha', 'Snad', 'jedině', 'rozumná', 'dohoda', 'respektující', 'fak

ValueError: invalid literal for int() with base 10: '855;862'

In [116]:
for i, num in enumerate(parsed):
    if num != 0:
        print(tokens[i])
positionings_cz_cleaned

a
jestliže
pokud
kdyby
jestliže
dokud
pokud


Unnamed: 0,DcOffset,DcRaw,Label1
0,350..351,a,Expansion.Conjunction
1,800..801,a,Expansion.Conjunction
2,989..994,ovšem,Comparison.Concession.Arg2-as-denier
3,1305..1310,přece,Contingency.Cause.Reason
4,1358..1359,a,Expansion.Conjunction
...,...,...,...
76,2135..2141,přitom,Comparison.Concession.Arg1-as-denier
77,3011..3012,a,Temporal.Asynchronous.Precedence
78,3013..3016,pak,Temporal.Asynchronous.Precedence
79,5563..5564,a,Expansion.Conjunction


In [263]:
positionings_cz_cleaned

Unnamed: 0,DcOffset,DcRaw,Label1
0,450..458,přestože,Comparison.Concession.Arg1-as-denier
1,869..876,nicméně,Comparison.Contrast
2,1091..1100,popřípadě,Expansion.Disjunction
3,1650..1653,pak,Contingency.Cause.Result
4,1298..1302,a to,Expansion.Level-of-detail.Arg2-as-detail
5,1534..1538,však,Comparison.Concession.Arg2-as-denier
6,1582..1587,totiž,Contingency.Cause.Reason
7,1687..1688,a,Expansion.Conjunction
8,2070..2075,a tím,Contingency.Cause.Result
9,2137..2140,pak,Contingency.Cause.Reason


# Parsing French Text files 

In [202]:
def process_text_with_conn_tag(tuple_list):
    article_token_list = []
    article_parsed_list = []
    
    for tup in tuple_list:
        french_text = tup[1]
        tokens = french_text.split()
        multi_conn_flag = 0
        processed_tokens = []
        conn_list = []
        print(tokens)
        for token in tokens:
            # single token case
            if "<CONN>" in token and "</CONN>" in token:
                string_between_tags = re.search(r'<CONN>(.*?)</CONN>', token).group(1)
                processed_tokens.append(string_between_tags)
                conn_list.append(1)
                print(f"single worded connector: {token}")
            # multiworded connector case
            elif "<CONN>" in token:
                text_after_tag = re.search(r'<CONN>(.*?)$', token).group(1)
                processed_tokens.append(text_after_tag)
                conn_list.append(1)
                multi_conn_flag = 1
                print(f"multi worded connector: {token}")
            # still itereating multiworded tokens
            elif multi_conn_flag == 1:
                # mark the end of the multiworded connectors
                if "</CONN>" in token:
                    multi_conn_flag = 0
                    text_before_tag = re.search(r'(.*?)</CONN>', token).group(1)
                    processed_tokens.append(text_before_tag)
                    conn_list.append(1)
                    print(f"multi worded connector: {token}")
                # intermediate connectors (connector length > 2)
                else:
                    print(f"large multi worded connector: {token}")
                    processed_tokens.append(token)
                    conn_list.append(1)
                
            else:
                print(f"normal token: {token}")
                processed_tokens.append(token)
                conn_list.append(0)
        article_token_list.append(processed_tokens)
        article_parsed_list.append(conn_list)
        print(processed_tokens)
        print(conn_list)
        print()
        
    return article_token_list, article_parsed_list

In [203]:
# Read the XML file
with open("french_corpus_validated.xml", "r", encoding="utf-8") as file:
    xml_data = file.read()

# Initialize lists to store all tokens and connectors
all_processed_tokens_list_french = []
all_conn_list_french = []

# Define regex patterns
article_pattern = re.compile(r'<ARTICLE id="(.*?)">(.*?)</ARTICLE>', re.DOTALL)
sent_pattern = re.compile(r'<SENT id="(.*?)">(.*?)</SENT>', re.DOTALL)
conn_pattern = re.compile(r'<CONN>(.*?)</CONN>')
# Find all articles
articles = re.findall(article_pattern, xml_data)

# Iterate over articles
for article_id, article_content in articles:
    # Find all sentences in the article
    sentences = re.findall(sent_pattern, article_content)
    #process samples of given article
    processed_tokens, conn_list = process_text_with_conn_tag(sentences)
    # appending the results in the all tokens and connectors list
    for inner_list in processed_tokens:
        all_processed_tokens_list_french.append(inner_list)
    for inner_list in conn_list:
        all_conn_list_french.append(inner_list)

['Dans', 'ce', 'groupe', 'diversifié', '(', 'transports', ',', 'textile', ',', 'télévision', ')', 'dont', 'la', 'devise', 'est', ',', 'selon', 'le', 'mot', 'du', 'PDG', ',', 'Mr', 'Jérôme', 'Seydoux', ',', '"', 'gagner', 'de', "l'", 'argent', ',', "c'", 'est', 'vertueux', '"', ',', "l'", 'activité', 'aérienne', 'a', 'dégagé', 'un', 'résultat', 'bénéficiaire', 'net', 'de', '729', 'millions', 'de', 'francs', ',', 'sur', 'un', 'total', 'consolidé', 'de', '913', 'millions', 'de', 'francs', '.']
normal token: Dans
normal token: ce
normal token: groupe
normal token: diversifié
normal token: (
normal token: transports
normal token: ,
normal token: textile
normal token: ,
normal token: télévision
normal token: )
normal token: dont
normal token: la
normal token: devise
normal token: est
normal token: ,
normal token: selon
normal token: le
normal token: mot
normal token: du
normal token: PDG
normal token: ,
normal token: Mr
normal token: Jérôme
normal token: Seydoux
normal token: ,
normal token:

normal token: pays
normal token: "
normal token: ,
normal token: a
normal token: ajouté
normal token: l'
normal token: agence
normal token: ,
normal token: estimant
normal token: que
normal token: ces
normal token: mesures
normal token: "
normal token: peuvent
normal token: être
normal token: qualifiées
normal token: avec
normal token: certitude
normal token: d'
normal token: état
normal token: d'
normal token: urgence
normal token: "
normal token: .
['"', 'Soucieuses', "d'", 'empêcher', 'des', 'manifestations', 'de', 'protestation', ',', 'les', 'autorités', 'albanaises', 'ont', 'mis', 'en', 'place', 'des', 'mesures', 'extrêmement', 'rigoureuses', 'de', 'contrôle', 'de', 'la', 'situation', 'dans', 'le', 'pays', '"', ',', 'a', 'ajouté', "l'", 'agence', ',', 'estimant', 'que', 'ces', 'mesures', '"', 'peuvent', 'être', 'qualifiées', 'avec', 'certitude', "d'", 'état', "d'", 'urgence', '"', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

normal token: étrangers
normal token: et
normal token: peuplée
normal token: surtout
normal token: de
normal token: Russes
normal token: ,
normal token: notamment
normal token: les
normal token: ouvriers
normal token: d'
normal token: une
normal token: usine
normal token: de
normal token: réparation
normal token: d'
normal token: avions
normal token: qui
normal token: lui
normal token: ont
normal token: exprimé
normal token: leur
normal token: soutien
normal token: ,
normal token: a
normal token: indiqué
normal token: un
normal token: porte
normal token: -
normal token: parole
normal token: officiel
normal token: lituanien
normal token: .
['Vendredi', ',', 'il', "s'", 'était', 'rendu', 'à', 'Shauliai', ',', 'une', 'ville', 'interdite', 'aux', 'étrangers', 'et', 'peuplée', 'surtout', 'de', 'Russes', ',', 'notamment', 'les', 'ouvriers', "d'", 'une', 'usine', 'de', 'réparation', "d'", 'avions', 'qui', 'lui', 'ont', 'exprimé', 'leur', 'soutien', ',', 'a', 'indiqué', 'un', 'porte', '-', 'pa

normal token: fait
normal token: particulière
normal token: .
['Car', 'la', 'situation', 'de', 'la', 'Lituanie', 'est', 'tout', 'à', 'fait', 'particulière', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['La', 'fièvre', 'indépendantiste', "s'", 'y', 'explique', 'en', 'grande', 'partie', 'par', 'le', 'fait', 'que', 'les', 'Lituaniens', 'sont', 'largement', 'majoritaires', '(', '80', '%', ')', 'dans', 'leur', 'République', '<CONN>et</CONN>', "qu'", 'elle', 'ne', 'fait', 'partie', 'de', "l'", 'URSS', ',', 'avec', 'les', 'deux', 'autres', 'Républiques', 'baltes', ',', 'que', 'depuis', 'peu', '.']
normal token: La
normal token: fièvre
normal token: indépendantiste
normal token: s'
normal token: y
normal token: explique
normal token: en
normal token: grande
normal token: partie
normal token: par
normal token: le
normal token: fait
normal token: que
normal token: les
normal token: Lituaniens
normal token: sont
normal token: largement
normal token: majoritaires
normal token: (
normal token: 80
no

normal token: Jalilabad
normal token: ,
normal token: qui
normal token: s'
normal token: était
normal token: déjà
normal token: insurgée
normal token: en
normal token: décembre
normal token: dernier
normal token: .
["L'", 'agence', 'rapporte', 'également', "qu'", 'à', 'Lenkoran', ',', 'où', 'la', 'radio', 'a', 'été', 'occupée', ',', 'de', 'même', 'que', "d'", 'autres', 'bâtiments', 'officiels', ',', '"', 'tout', 'le', 'pouvoir', 'est', 'passé', 'aux', 'mains', 'du', 'front', 'populaire', '"', 'et', 'que', 'la', 'situation', "s'", 'aggrave', 'aussi', 'dans', 'la', 'ville', 'proche', 'de', 'Jalilabad', ',', 'qui', "s'", 'était', 'déjà', 'insurgée', 'en', 'décembre', 'dernier', '.']
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['-', 'Les', 'pourparlers', 'sur', 'le', 'retrait', 'des', 'troupes', 'soviétiques', 'de', 'Tchécoslovaquie', 'devraient', 'commencer',

normal token: des
normal token: parlementaires
normal token: ,
normal token: y
normal token: compris
normal token: communistes
normal token: ,
normal token: qui
normal token: ont
normal token: décidé
normal token: d'
normal token: ouvrir
normal token: une
normal token: enquête
normal token: disciplinaire
normal token: contre
normal token: les
normal token: deux
normal token: hauts
normal token: fonctionnaires
normal token: .
["L'", 'incapacité', '-', 'ou', 'le', 'refus', '-', 'du', 'procureur', 'général', 'adjoint', 'de', "l'", 'état', 'et', 'du', 'président', 'de', 'la', 'Cour', 'suprême', 'à', 'répondre', 'aux', 'questions', 'des', 'députés', 'sur', 'les', 'enquêtes', 'en', 'cours', 'à', 'propos', 'des', 'activités', 'de', "l'", 'ancienne', 'Stasi', 'avait', 'provoqué', 'pendant', 'les', 'débats', 'un', 'tumulte', 'inhabituel', 'sur', 'les', 'bancs', 'des', 'parlementaires', ',', 'y', 'compris', 'communistes', ',', 'qui', 'ont', 'décidé', "d'", 'ouvrir', 'une', 'enquête', 'disciplina

normal token: que
normal token: l'on
normal token: a
normal token: appelé
normal token: le
normal token: "
normal token: Yalta
normal token: aérien
normal token: "
normal token: français
normal token: qui
normal token: ,
normal token: en
normal token: 1963
normal token: ,
normal token: avait
normal token: partagé
normal token: le
normal token: monde
normal token: entre
normal token: Air
normal token: France
normal token: et
normal token: UTA
normal token: et
normal token: réservé
normal token: la
normal token: métropole
normal token: à
normal token: Air
normal token: Inter
normal token: .
["C'", 'est', 'la', 'fin', 'de', 'ce', 'que', "l'on", 'a', 'appelé', 'le', '"', 'Yalta', 'aérien', '"', 'français', 'qui', ',', 'en', '1963', ',', 'avait', 'partagé', 'le', 'monde', 'entre', 'Air', 'France', 'et', 'UTA', 'et', 'réservé', 'la', 'métropole', 'à', 'Air', 'Inter', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['La'

normal token: ;
normal token: ils
normal token: n'
normal token: accèdent
normal token: pas
normal token: non
normal token: plus
normal token: aux
normal token: demandes
normal token: d'
normal token: Air
normal token: France
normal token: d'
normal token: absorber
normal token: Air
normal token: Inter
normal token: .
['Coincés', 'entre', 'leur', 'titre', 'de', 'propriétaire', "d'", 'Air', 'France', 'et', 'le', 'libéralisme', 'montant', ',', 'les', 'gouvernements', 'louvoient', 'à', 'vue', ',', 'empêchant', 'UTA', 'de', 'prendre', 'son', 'essor', 'en', 'lui', 'refusant', 'les', 'nouvelles', 'destinations', "qu'", 'elle', 'lui', 'réclame', ';', 'ils', "n'", 'accèdent', 'pas', 'non', 'plus', 'aux', 'demandes', "d'", 'Air', 'France', "d'", 'absorber', 'Air', 'Inter', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['UTA', 'tente', 'de', 'se', 'sortir', 'des', 'marchés', 'a

normal token: en
normal token: mesure
normal token: de
normal token: poursuivre
normal token: la
normal token: remise
normal token: en
normal token: question
normal token: des
normal token: avantages
normal token: sociaux
normal token: qu'
normal token: il
normal token: avait
normal token: entreprise
normal token: à
normal token: marche
normal token: forcée
normal token: .
['Les', 'syndicats', 'des', 'personnels', 'navigants', 'se', 'réjouiront', 'aussi', 'que', 'Mr', 'René', 'Lapautre', ',', 'PDG', "d'", 'UTA', ',', 'ne', 'soit', 'plus', 'en', 'mesure', 'de', 'poursuivre', 'la', 'remise', 'en', 'question', 'des', 'avantages', 'sociaux', "qu'", 'il', 'avait', 'entreprise', 'à', 'marche', 'forcée', '.']
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["<CONN>S'</CONN>", 'il', 'résout', 'beaucoup', 'de', 'problèmes', ',', "l'", 'accord', 'entre', 'Chargeurs', 'et', 'Air', 'France', 'ne', 'peut', 'prétendre', 'tout', '

normal token: ,
normal token: nombreux
normal token: sont
normal token: ceux
normal token: qui
normal token: ,
normal token: en
normal token: France
normal token: ,
normal token: auraient
normal token: souhaité
normal token: plus
normal token: de
normal token: générosité
normal token: ,
normal token: jugeant
normal token: nécessaire
normal token: un
normal token: taux
normal token: d'
normal token: intérêt
normal token: de
normal token: 10
normal token: %
normal token: .
['Comme', "l'", 'emprunteur', "s'", 'apprête', 'à', 'revenir', 'sur', 'le', 'marché', 'du', 'franc', 'dans', 'une', 'dizaine', 'de', 'jours', 'seulement', ',', 'nombreux', 'sont', 'ceux', 'qui', ',', 'en', 'France', ',', 'auraient', 'souhaité', 'plus', 'de', 'générosité', ',', 'jugeant', 'nécessaire', 'un', 'taux', "d'", 'intérêt', 'de', '10', '%', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Sa', 'prochaine', 'émission', 'devr

normal token: se
normal token: redresser
normal token: un
normal token: peu
normal token: à
normal token: 3
normal token: ,
normal token: 4125
normal token: PP
normal token: ,
single worded connector: <CONN>puis</CONN>
normal token: à
normal token: 3
normal token: ,
normal token: 4140
normal token: PP
normal token: à
normal token: la
normal token: veille
normal token: du
normal token: week
normal token: -
normal token: end
normal token: ,
normal token: en
normal token: liaison
normal token: avec
normal token: l'
normal token: accès
normal token: de
normal token: faiblesse
normal token: du
normal token: dollar
normal token: et
normal token: la
normal token: remontée
normal token: du
normal token: mark
normal token: .
['Le', 'cours', 'du', 'mark', 'à', 'Paris', 'est', ',', 'illico', ',', 'redescendu', 'de', '3', ',', '4175', 'PP', 'à', '3', ',', '4075', 'F', "d'", 'abord', ',', 'avant', 'de', 'se', 'redresser', 'un', 'peu', 'à', '3', ',', '4125', 'PP', ',', 'puis', 'à', '3', ',', '4140',

normal token: ,
normal token: 50
normal token: %
single worded connector: <CONN>et</CONN>
normal token: ,
normal token: on
normal token: commence
normal token: outre-
normal token: Rhin
normal token: ,
normal token: à
normal token: trouver
normal token: des
normal token: emprunts
normal token: privés
normal token: à
normal token: plus
normal token: de
normal token: 8
normal token: %
normal token: .
['De', 'même', ',', 'le', 'rendement', 'des', 'emprunts', "d'", 'état', 'à', 'long', 'terme', ',', 'le', '"', 'Bund', '"', 'dix', 'ans', 'notamment', ',', 'se', 'maintient', 'à', 'un', 'peu', 'plus', 'de', '7', ',', '50', '%', 'et', ',', 'on', 'commence', 'outre-', 'Rhin', ',', 'à', 'trouver', 'des', 'emprunts', 'privés', 'à', 'plus', 'de', '8', '%', '.']
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Aux', 'Etats', '-', 'Unis', ',', "l'", 'événement', 'de', 'la', 'semaine', ',', 'large

normal token: l'
normal token: attention
normal token: sur
normal token: lui
normal token: ,
normal token: le
normal token: titane
normal token: .
["L'", 'année', '1989', 'aura', 'remis', 'en', 'selle', 'un', 'métal', 'qui', "n'", 'avait', 'plus', 'depuis', 'longtemps', 'attiré', "l'", 'attention', 'sur', 'lui', ',', 'le', 'titane', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Les', 'prix', 'ont', 'progressé', 'de', 'quelque', '30', '%', 'en', 'douze', 'mois', ',', 'traduisant', 'de', 'nouvelles', 'tensions', 'sur', 'la', 'demande', 'de', 'minerai', 'comme', 'de', 'métal', '.']
normal token: Les
normal token: prix
normal token: ont
normal token: progressé
normal token: de
normal token: quelque
normal token: 30
normal token: %
normal token: en
normal token: douze
normal token: mois
normal token: ,
normal token: traduisant
normal token: de
normal token: nouvelles
normal token: tensions
normal token: sur
normal token: la
normal token: demande
normal tok

normal token: pour
normal token: lundi
normal token: prochain
normal token: dans
normal token: un
normal token: autre
normal token: quartier
normal token: de
normal token: la
normal token: ville
normal token: .
["D'", 'ores', 'et', 'déjà', ',', 'une', 'autre', 'réunion', 'est', 'prévue', 'pour', 'lundi', 'prochain', 'dans', 'un', 'autre', 'quartier', 'de', 'la', 'ville', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["L'", 'indication', 'donnée', 'par', 'le', 'ministère', ',', 'selon', 'laquelle', 'les', 'manifestants', 'étaient', 'venus', 'discuter', 'de', '"', 'problèmes', 'publics', '"', ',', 'laisse', 'clairement', 'entendre', "qu'", 'ils', "s'", 'étaient', 'réunis', 'pour', 'demander', 'le', 'retour', 'de', 'la', 'vie', 'parlementaire', ',', 'suspendue', 'depuis', '1986', '.']
normal token: L'
normal token: indication
normal token: donnée
normal token: par
normal token: le
normal token: ministère
normal token: ,
normal token: selon
normal token: laquelle
no

normal token: attaques
normal token: sur
normal token: les
normal token: usines
normal token: Renault
normal token: de
normal token: Billancourt
normal token: ,
normal token: le
normal token: 3
normal token: mars
normal token: 1942
normal token: ,
normal token: le
normal token: 4
normal token: avril
normal token: 1943
normal token: et
normal token: les
normal token: 3
normal token: et
normal token: 15
normal token: septembre
normal token: 1943
normal token: .
['Durant', 'les', 'années', 'de', 'guerre', ',', 'les', 'forces', 'aériennes', 'alliées', 'menèrent', 'quatre', 'attaques', 'sur', 'les', 'usines', 'Renault', 'de', 'Billancourt', ',', 'le', '3', 'mars', '1942', ',', 'le', '4', 'avril', '1943', 'et', 'les', '3', 'et', '15', 'septembre', '1943', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Le', 'premier', 'bombardement', 'qui', 'allait', 'ouvrir', "l'", 'ère', 'de', 'la', 'destruction', 'de', 'nombreux', 'si

normal token: soixante
normal token: -
normal token: sept
normal token: ans
normal token: ,
normal token: Louis
normal token: Renault
normal token: était
normal token: en
normal token: mauvais
normal token: état
normal token: physique
normal token: .
['Alors', 'âgé', 'de', 'soixante', '-', 'sept', 'ans', ',', 'Louis', 'Renault', 'était', 'en', 'mauvais', 'état', 'physique', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Il', 'souffrait', 'de', 'troubles', 'urinaires', '.']
normal token: Il
normal token: souffrait
normal token: de
normal token: troubles
normal token: urinaires
normal token: .
['Il', 'souffrait', 'de', 'troubles', 'urinaires', '.']
[0, 0, 0, 0, 0, 0]

['Atteint', "d'", 'aphasie', ',', 'il', 'ne', 'pouvait', "s'", 'exprimer', "qu'", 'avec', 'difficulté', '.']
normal token: Atteint
normal token: d'
normal token: aphasie
normal token: ,
normal token: il
normal token: ne
normal token: pouvait
normal token: s'
normal token: exprimer
normal token: qu'
normal token: a

normal token: demander
normal token: l'
normal token: extradition
normal token: d'
normal token: alois
normal token: Brunner
normal token: ,
normal token: soixante
normal token: -
normal token: dix
normal token: -
normal token: huit
normal token: ans
normal token: ,
normal token: criminel
normal token: de
normal token: guerre
normal token: nazi
normal token: .
['Mr', 'Serge', 'Klarsfeld', ',', 'président', 'de', "l'", 'association', 'des', 'fils', 'et', 'filles', 'de', 'déportés', 'juifs', 'de', 'France', ',', 'a', 'été', 'expulsé', 'de', 'Syrie', ',', 'vendredi', '12', 'mars', ',', 'après', 'avoir', 'tenté', 'de', 'demander', "l'", 'extradition', "d'", 'alois', 'Brunner', ',', 'soixante', '-', 'dix', '-', 'huit', 'ans', ',', 'criminel', 'de', 'guerre', 'nazi', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Arrivé', 'le', '9', 'janvier', 'à', 'Damas', ',', "l'", 'avocat', 

normal token: manifesté
normal token: pacifiquement
normal token: ,
normal token: jeudi
normal token: 11
normal token: janvier
normal token: ,
normal token: dans
normal token: quatre
normal token: Etats
normal token: du
normal token: nord
normal token: du
normal token: Nigéria
normal token: pour
normal token: protester
normal token: contre
normal token: le
normal token: "
normal token: processus
normal token: d'
normal token: islamisation
normal token: "
normal token: du
normal token: pays
normal token: .
['Des', 'milliers', 'de', 'chrétiens', 'ont', 'manifesté', 'pacifiquement', ',', 'jeudi', '11', 'janvier', ',', 'dans', 'quatre', 'Etats', 'du', 'nord', 'du', 'Nigéria', 'pour', 'protester', 'contre', 'le', '"', 'processus', "d'", 'islamisation', '"', 'du', 'pays', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Ils', 'estiment', ',', '<CONN>en', 'effet</CONN>', ',', 'que', 'le', 'remaniement', 'ministériel', 'du', '29', 'décembre'

normal token: la
normal token: parole
normal token: quelques
normal token: heures
normal token: plus
normal token: tôt
normal token: devant
normal token: les
normal token: députés
normal token: pour
normal token: répondre
normal token: à
normal token: ses
normal token: détracteurs
normal token: .
['Soupçonné', 'de', 'vouloir', 'monopoliser', 'le', 'pouvoir', 'au', 'profit', 'du', 'seul', 'PC', ',', 'de', 'ne', 'pas', 'tenir', 'ses', 'promesses', 'sur', 'le', 'démantèlement', 'des', 'structures', 'de', "l'", 'ancien', 'régime', ',', 'le', 'premier', 'ministre', 'avait', 'pris', 'la', 'parole', 'quelques', 'heures', 'plus', 'tôt', 'devant', 'les', 'députés', 'pour', 'répondre', 'à', 'ses', 'détracteurs', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Sur', 'un', 'ton', 'offensif', ',', 'Mr', 'Modrow', 'avait', 'refusé', 'de', 'revenir', 'sur', 'sa', 'décision', 'de', 'remplacer', "l'

normal token: de
normal token: cela
normal token: .
['La', 'mise', 'en', 'cause', 'de', "l'", 'intégrité', 'du', 'gouvernement', 'Modrow', 'est', 'un', 'échec', 'grave', 'pour', 'le', 'Parti', 'communiste', ',', 'qui', "n'", 'avait', 'pas', 'besoin', 'de', 'cela', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Le', 'SED', 'continue', 'à', 'faire', 'eau', 'de', 'toutes', 'parts', 'malgré', 'la', 'volonté', 'réformatrice', 'affichée', 'par', 'sa', 'nouvelle', 'direction', 'élue', 'en', 'décembre', '.']
normal token: Le
normal token: SED
normal token: continue
normal token: à
normal token: faire
normal token: eau
normal token: de
normal token: toutes
normal token: parts
normal token: malgré
normal token: la
normal token: volonté
normal token: réformatrice
normal token: affichée
normal token: par
normal token: sa
normal token: nouvelle
normal token: direction
normal token: élue
normal token: en
normal token: décembre
normal token: .
['Le', 'SED', 

normal token: ,
normal token: les
normal token: partis
normal token: politiques
normal token: Ouest
normal token: -
normal token: allemands
normal token: contre
normal token: une
normal token: immixtion
normal token: trop
normal token: marquée
normal token: dans
normal token: la
normal token: campagne
normal token: électorale
normal token: est
normal token: -
normal token: allemande
normal token: .
['Le', 'premier', 'ministre', 'est', '-', 'allemand', 'a', 'mis', 'en', 'garde', ',', 'jeudi', 'devant', 'le', 'parlement', ',', 'les', 'partis', 'politiques', 'Ouest', '-', 'allemands', 'contre', 'une', 'immixtion', 'trop', 'marquée', 'dans', 'la', 'campagne', 'électorale', 'est', '-', 'allemande', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Bien', 'obligé', 'de', 'tenir', 'compte', 'des', 'réalités', ',', 'il', 'devait', '<CONN>néanmoins</CONN>', 'recevoir', 'successivement', ',', 'vendredi', 'et', 'samedi', ',', 'les', 

['Nous', 'avons', 'fait', 'beaucoup', 'pour', 'vous', ',', 'leur', 'a', '-t-il', 'déclaré', 'en', 'substance', ',', 'nous', 'vous', 'avons', 'favorisé', ';', 'il', 'va', 'falloir', 'bientôt', 'rendre', 'des', 'comptes', 'à', 'la', 'collectivité', ',', 'aux', 'salariés', 'notamment', ',', 'qui', 'ont', 'été', 'maltraités', ',', 'ont', 'vu', 'leur', 'pouvoir', "d'", 'achat', 'réduit', ',', 'beaucoup', 'de', 'leurs', 'emplois', 'supprimés', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Là', 'encore', 'le', 'premier', 'ministre', "n'", 'a', 'pas', 'la', 'tâche', 'facile', '<CONN>car</CONN>', 'il', 'va', 'devoir', 'trancher', 'sur', 'le', 'point', 'de', 'savoir', 'si', 'la', 'politique', "qu'", 'il', 'mène', '-', 'fiscale', 'notamment', '-', 'doit', 'continuer', 'de', 'favoriser', 'les', 'entreprises', 'et', 'leur', 'compétitivité', '<CONN>alors', 'même', 'que</CONN>', '

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['-', 'La', 'dernière', 'rentrée', 'universitaire', 'a', 'fait', 'apparaître', 'de', 'sérieuses', 'difficultés', "d'", 'accueil', 'et', "d'", 'encadrement', 'des', 'étudiants', '.']
normal token: -
normal token: La
normal token: dernière
normal token: rentrée
normal token: universitaire
normal token: a
normal token: fait
normal token: apparaître
normal token: de
normal token: sérieuses
normal token: difficultés
normal token: d'
normal token: accueil
normal token: et
normal token: d'
normal token: encadrement
normal token: des
normal token: étudiants
normal token: .
['-', 'La', 'dernière', 'rentrée', 'universitaire', 'a', 'fait', 'apparaître', 'de', 'sérieuses', 'difficultés', "d'", 'accueil', 'et', "d'", 'encadrement', 'des', 'étudiants', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['<CONN>Or</CONN>', ',', 'celles', '-', 'ci', 'risquent', 'de', "s'", 'accroitre', 'encore', 'avec', "l'", 'augmentation', 'anno

normal token: "
normal token: génération
normal token: sacrifiée
normal token: "
normal token: .
['Je', 'veux', 'agir', 'vite', ',', 'car', ',', 'pour', 'moi', ',', 'il', 'ne', 'saurait', 'y', 'avoir', 'de', '"', 'génération', 'sacrifiée', '"', '.']
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['-', 'Ne', 'risque', '-t-on', 'pas', 'de', 'se', 'retrouver', 'dans', 'la', 'situation', 'des', 'années', '60', 'où', ',', '<CONN>pour</CONN>', 'faire', 'face', 'à', "l'", 'urgence', ',', 'on', 'a', 'dû', 'construire', ',', 'en', 'catastrophe', ',', 'des', 'bâtiments', 'universitaires', 'de', 'qualité', 'médiocre', '?']
normal token: -
normal token: Ne
normal token: risque
normal token: -t-on
normal token: pas
normal token: de
normal token: se
normal token: retrouver
normal token: dans
normal token: la
normal token: situation
normal token: des
normal token: années
normal token: 60
normal token: où
normal token: ,
single worded connector: <CONN>pour</CONN>
normal token: faire


['Il', 'faut', 'voir', 'les', 'situations', 'cas', 'par', 'cas', ',', 'en', 'tenant', 'compte', 'des', 'possibilités', 'locales', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]

['-', 'Il', 'est', 'plus', 'facile', 'de', 'construire', 'des', 'mètres', 'carrés', 'que', 'de', 'recruter', 'des', 'enseignants', '.']
normal token: -
normal token: Il
normal token: est
normal token: plus
normal token: facile
normal token: de
normal token: construire
normal token: des
normal token: mètres
normal token: carrés
normal token: que
normal token: de
normal token: recruter
normal token: des
normal token: enseignants
normal token: .
['-', 'Il', 'est', 'plus', 'facile', 'de', 'construire', 'des', 'mètres', 'carrés', 'que', 'de', 'recruter', 'des', 'enseignants', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Comment', 'comptez', '-vous', 'remédier', 'à', 'la', 'crise', 'actuelle', ',', '<CONN>pour</CONN>', 'assurer', 'un', 'encadrement', 'convenable', 'des', 'étudiants', '?']
normal to

normal token: va
normal token: y
normal token: avoir
normal token: ,
normal token: sur
normal token: ce
normal token: sujet
normal token: ,
normal token: une
normal token: série
normal token: de
normal token: colloques
normal token: dans
normal token: les
normal token: universités
normal token: en
normal token: février
normal token: ,
normal token: suivis
normal token: d'
normal token: un
normal token: colloque
normal token: national
normal token: en
normal token: mars
normal token: .
['Il', 'va', 'y', 'avoir', ',', 'sur', 'ce', 'sujet', ',', 'une', 'série', 'de', 'colloques', 'dans', 'les', 'universités', 'en', 'février', ',', 'suivis', "d'", 'un', 'colloque', 'national', 'en', 'mars', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['À', 'partir', 'de', 'ces', 'discussions', ',', 'nous', 'prendrons', 'des', 'décisions', 'qui', 'pourront', 'être', 'en', 'partie', 'applicables', 'à', 'la', 'rentrée', '1990', '.']
normal token: À
normal token: par

normal token: .
['Ils', 'avaient', 'préféré', 'prendre', 'la', 'fuite', 'en', 'Côte', '-', "d'", 'Ivoire', 'alors', "qu'", 'ils', 'étaient', 'assaillis', 'par', 'les', 'maquisards', 'le', '6', 'janvier', 'à', 'Logata', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["L'", 'ambassadeur', 'du', 'Libéria', 'en', 'Côte', '-', "d'", 'Ivoire', "s'", 'est', 'inquiété', 'de', 'leur', 'sort', '<CONN>et</CONN>', 'demande', "qu'", 'ils', 'soient', 'relâchés', '.', '.']
normal token: L'
normal token: ambassadeur
normal token: du
normal token: Libéria
normal token: en
normal token: Côte
normal token: -
normal token: d'
normal token: Ivoire
normal token: s'
normal token: est
normal token: inquiété
normal token: de
normal token: leur
normal token: sort
single worded connector: <CONN>et</CONN>
normal token: demande
normal token: qu'
normal token: ils
normal token: soient
normal token: relâchés
normal token: .
normal token: .
["L'", 'ambassadeur', 'du', 'Libéria', 'en

normal token: 3
normal token: ,
normal token: 886
normal token: milliards
normal token: de
normal token: francs
normal token: après
normal token: un
normal token: solde
normal token: négatif
normal token: de
normal token: 7
normal token: ,
normal token: 31
normal token: milliards
normal token: en
normal token: octobre
normal token: ,
normal token: le
normal token: solde
normal token: cumulé
normal token: depuis
normal token: janvier
normal token: s'
normal token: élevant
normal token: à
normal token: -
normal token: 44
normal token: ,
normal token: 772
normal token: milliards
normal token: de
normal token: francs
normal token: contre
normal token: -
normal token: 29
normal token: ,
normal token: 519
normal token: milliards
normal token: durant
normal token: la
normal token: même
normal token: période
normal token: de
normal token: 1988
normal token: .
['En', 'données', 'brutes', ',', 'le', 'commerce', 'extérieur', 'français', 'a', 'enregistré', 'un', 'déficit', 'de', '3', ',', '886', '

single worded connector: <CONN>en</CONN>
normal token: ne
normal token: rémunérant
normal token: pas
normal token: correctement
normal token: les
normal token: services
normal token: rendus
normal token: par
normal token: celle
normal token: -
normal token: ci
normal token: .
['Ce', 'faisant', ',', 'elle', "s'", 'est', 'attiré', 'le', 'courroux', 'du', 'chef', 'de', "l'", 'état', 'qui', 'avait', 'dit', 'en', '1989', 'en', 'conseil', 'des', 'ministres', "qu'", 'EDF', 'avait', 'réalisé', 'un', '"', 'hold', '-', 'up', '"', 'sur', 'la', 'compagnie', 'nationale', 'du', 'Rhône', 'en', 'ne', 'rémunérant', 'pas', 'correctement', 'les', 'services', 'rendus', 'par', 'celle', '-', 'ci', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Mr', 'Mitterrand', "s'", 'est', 'souvenu', '<CONN>aussi</CONN>', 'du', 'tour', 'de', 'passe', '-', 'passe', 'du', 'ministère', 'des', 'finances', 'qui

normal token: ,
single worded connector: <CONN>mais</CONN>
normal token: a
normal token: affirmé
normal token: qu'
normal token: il
normal token: était
normal token: urgent
normal token: d'
normal token: autoriser
normal token: l'
normal token: autonomie
normal token: des
normal token: petites
normal token: et
normal token: moyennes
normal token: entreprises
normal token: .
['Le', 'ministre', "n'", 'a', 'pas', 'donné', 'de', 'date', 'précise', 'pour', 'le', 'démantèlement', 'des', 'quelque', '150', 'groupes', 'industriels', 'responsables', 'de', 'la', 'majorité', 'de', 'la', 'production', 'en', 'RDA', ',', 'mais', 'a', 'affirmé', "qu'", 'il', 'était', 'urgent', "d'", 'autoriser', "l'", 'autonomie', 'des', 'petites', 'et', 'moyennes', 'entreprises', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['La', 'veille', ',', 'Mme', 'Luft', 'avait', 'déjà', 'annoncé', 'que', 'des', 'capitaux', 'ouest', '-

normal token: qui
normal token: n'
normal token: avaient
normal token: pas
normal token: été
normal token: consultés
normal token: ,
normal token: présentèrent
single worded connector: <CONN>aussitôt</CONN>
normal token: un
normal token: recours
normal token: .
['Aussi', 'nommèrent', '-ils', 'immédiatement', 'comme', 'successeur', 'de', 'Pedro', 'Toledo', 'son', 'bras', 'droit', ',', 'Mr', 'Alfredo', 'Saelz', ':', 'une', 'décision', 'contre', 'laquelle', 'les', 'représentants', 'de', 'la', 'banque', 'de', 'Bilbao', ',', 'qui', "n'", 'avaient', 'pas', 'été', 'consultés', ',', 'présentèrent', 'aussitôt', 'un', 'recours', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

['Dans', 'ces', 'conditions', ',', 'le', 'recours', 'à', 'la', 'banque', "d'", 'Espagne', 'représentait', "l'", 'ultime', 'solution', 'en', 'dehors', "d'", 'une', 'assemblée', 'générale', 'des', 'actionnaires', 'des', 'deux', 'banques', 'qui',

normal token: actions
normal token: britanniques
normal token: en
normal token: Grande
normal token: -
normal token: Bretagne
normal token: ,
normal token: à
normal token: l'
normal token: exception
normal token: de
normal token: celles
normal token: portant
normal token: sur
normal token: une
normal token: centaine
normal token: de
normal token: petites
normal token: sociétés
normal token: à
normal token: forte
normal token: croissance
normal token: .
['Citicorp', 'abandonne', 'ses', 'activités', 'de', 'courtage', 'sur', 'actions', 'britanniques', 'en', 'Grande', '-', 'Bretagne', ',', 'à', "l'", 'exception', 'de', 'celles', 'portant', 'sur', 'une', 'centaine', 'de', 'petites', 'sociétés', 'à', 'forte', 'croissance', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Toutefois', ',', 'les', 'activités', 'de', 'CSV', 'en', 'matière', "d'", 'options', 'et', 'de', 'gestions', 'de', 'portefeuille', ',', 'ne', 'sont', 'pas', 'affectées', 'par'

normal token: une
normal token: entreprise
normal token: agro-
normal token: industrielle
normal token: .
['Il', 'est', 'payable', 'en', 'devises', ':', 'Interinfo', 'est', 'en', 'effet', 'une', 'entreprise', 'agro-', 'industrielle', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]

['Elle', 'vendra', 'à', "l'", 'extérieur', 'des', 'produits', 'agricoles', '<CONN>pour</CONN>', 'se', 'procurer', 'des', 'devises', "qu'", 'elle', 'utilisera', '<CONN>ensuite</CONN>', 'pour', 'acheter', 'des', 'produits', 'industriels', '.']
normal token: Elle
normal token: vendra
normal token: à
normal token: l'
normal token: extérieur
normal token: des
normal token: produits
normal token: agricoles
single worded connector: <CONN>pour</CONN>
normal token: se
normal token: procurer
normal token: des
normal token: devises
normal token: qu'
normal token: elle
normal token: utilisera
single worded connector: <CONN>ensuite</CONN>
normal token: pour
normal token: acheter
normal token: des
normal token: produit

normal token: les
normal token: aspects
normal token: des
normal token: politiques
normal token: publiques
normal token: peuvent
normal token: être
normal token: examinés
normal token: par
normal token: le
normal token: comité
normal token: interministériel
normal token: ,
normal token: à
normal token: l'
normal token: exception
normal token: des
normal token: sujets
normal token: de
normal token: caractère
normal token: secret
normal token: touchant
normal token: à
normal token: la
normal token: défense
normal token: nationale
normal token: ,
normal token: à
normal token: la
normal token: politique
normal token: extérieure
normal token: et
normal token: à
normal token: la
normal token: sûreté
normal token: intérieure
normal token: et
normal token: extérieure
normal token: de
normal token: l'
normal token: Etat
normal token: .
['Tous', 'les', 'aspects', 'des', 'politiques', 'publiques', 'peuvent', 'être', 'examinés', 'par', 'le', 'comité', 'interministériel', ',', 'à', "l'", 'exception

normal token: national
normal token: de
normal token: normalisation
normal token: ,
normal token: désormais
normal token: confiée
normal token: à
normal token: l'
normal token: association
normal token: française
normal token: de
normal token: normalisation
normal token: .
['Les', 'entreprises', 'seront', 'associées', 'au', 'programme', 'national', 'de', 'normalisation', ',', 'désormais', 'confiée', 'à', "l'", 'association', 'française', 'de', 'normalisation', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["L'", 'opération', '"', 'partenariat', '92', '"', 'lancée', 'en', '1988', 'est', 'reconduite', 'en', '1990', 'avec', 'une', 'dotation', 'de', '100', 'millions', 'de', 'francs', '<CONN>afin', 'de</CONN>', 'soutenir', 'les', 'projets', "d'", 'actions', 'communes', 'à', 'plusieurs', 'entreprises', 'visant', 'à', 'développer', 'les', 'systèmes', 'de', 'normalisation', ',', 'de', 'certification', 'et', "d'", 'essai', ';', 'la', 'création', 'de', 'nouveaux', 'bureaux', 'd

["<CONN>C'", 'est', 'pourquoi</CONN>', 'je', 'propose', 'que', 'le', 'prix', 'Nobel', 'de', 'la', 'paix', '1990', 'soit', 'attribué', 'au', 'peuple', 'roumain', 'tout', 'entier', ',', '<CONN>et</CONN>', 'que', ',', 'pour', 'une', 'fois', ',', 'ce', 'symbole', 'ne', 'soit', 'pas', 'attribué', "qu'", 'à', 'une', 'seule', 'personne', '.']
multi worded connector: <CONN>C'
large multi worded connector: est
multi worded connector: pourquoi</CONN>
normal token: je
normal token: propose
normal token: que
normal token: le
normal token: prix
normal token: Nobel
normal token: de
normal token: la
normal token: paix
normal token: 1990
normal token: soit
normal token: attribué
normal token: au
normal token: peuple
normal token: roumain
normal token: tout
normal token: entier
normal token: ,
single worded connector: <CONN>et</CONN>
normal token: que
normal token: ,
normal token: pour
normal token: une
normal token: fois
normal token: ,
normal token: ce
normal token: symbole
normal token: ne
normal to

normal token: du
normal token: droit
normal token: social
normal token: .
["C'", 'est', 'par', 'exemple', 'le', 'cas', 'du', 'droit', 'social', '.']
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0]

['En', "l'", 'espace', 'de', 'quelques', 'années', ',', 'le', 'code', 'du', 'travail', 'a', 'presque', 'doublé', 'de', 'volume', ':', 'Au', 'fur', 'et', 'à', 'mesure', 'de', "l'", 'évolution', 'des', 'structures', 'des', 'entreprises', ',', 'de', 'nouvelles', 'notions', 'sont', 'apparues', 'telles', 'que', 'les', 'comités', 'de', 'groupe', '.']
normal token: En
normal token: l'
normal token: espace
normal token: de
normal token: quelques
normal token: années
normal token: ,
normal token: le
normal token: code
normal token: du
normal token: travail
normal token: a
normal token: presque
normal token: doublé
normal token: de
normal token: volume
normal token: :
normal token: Au
normal token: fur
normal token: et
normal token: à
normal token: mesure
normal token: de
normal token: l'
normal token: évolution
norma

normal token: la
normal token: signature
normal token: d'
normal token: un
normal token: contrat
normal token: commercial
normal token: peut
normal token: exiger
normal token: une
normal token: décision
normal token: rapide
normal token: qui
normal token: laisse
normal token: peu
normal token: de
normal token: temps
normal token: à
normal token: l'
normal token: épluchage
normal token: des
normal token: textes
normal token: .
['Par', 'exemple', ',', 'la', 'signature', "d'", 'un', 'contrat', 'commercial', 'peut', 'exiger', 'une', 'décision', 'rapide', 'qui', 'laisse', 'peu', 'de', 'temps', 'à', "l'", 'épluchage', 'des', 'textes', '.']
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['"', '<CONN>pour</CONN>', 'faire', 'face', 'à', 'ces', 'situations', ',', 'les', 'entreprises', 'jonglent', 'avec', 'plusieurs', 'systèmes', 'à', 'la', 'fois', '.']
normal token: "
single worded connector: <CONN>pour</CONN>
normal token: faire
normal token: face
normal token: à
n

normal token: à
normal token: la
normal token: mode
normal token: le
normal token: système
normal token: des
normal token: sociétés
normal token: en
normal token: commandite
normal token: par
normal token: actions
normal token: ,
normal token: qui
normal token: assure
normal token: l'
normal token: indépendance
normal token: du
normal token: gestionnaire
normal token: par
normal token: rapport
normal token: au
normal token: capital
normal token: .
['Le', 'bon', 'juriste', 'doit', 'être', 'créatif', 'et', 'préconiser', 'des', 'systèmes', 'qui', 'servent', 'les', 'intérêts', 'de', 'ses', 'clients', '"', ',', 'affirme', 'Me', 'Philippe', 'Ginestié', ',', 'qui', 'se', 'targue', "d'", 'avoir', 'remis', 'à', 'la', 'mode', 'le', 'système', 'des', 'sociétés', 'en', 'commandite', 'par', 'actions', ',', 'qui', 'assure', "l'", 'indépendance', 'du', 'gestionnaire', 'par', 'rapport', 'au', 'capital', '.']
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

normal token: l'
normal token: audience
normal token: est
normal token: maintenue
normal token: ,
normal token: la
normal token: cour
normal token: d'
normal token: appel
normal token: de
normal token: bordeaux
normal token: devrait
normal token: se
normal token: pencher
normal token: sur
normal token: la
normal token: nullité
normal token: de
normal token: l'
normal token: apport
normal token: partiel
normal token: d'
normal token: actifs
normal token: de
normal token: Rémy
normal token: Martin
normal token: à
normal token: Rémy
normal token: et
normal token: associés
normal token: .
['Le', '7', 'février', 'prochain', ',', 'si', "l'", 'audience', 'est', 'maintenue', ',', 'la', 'cour', "d'", 'appel', 'de', 'bordeaux', 'devrait', 'se', 'pencher', 'sur', 'la', 'nullité', 'de', "l'", 'apport', 'partiel', "d'", 'actifs', 'de', 'Rémy', 'Martin', 'à', 'Rémy', 'et', 'associés', '.']
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

normal token: neutralisation
normal token: de
normal token: l'
normal token: Allemagne
normal token: .
['Il', 'va', 'offrir', "l'", 'unification', 'en', 'échange', 'de', 'la', 'neutralisation', 'de', "l'", 'Allemagne', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Il', 'refuse', 'toute', 'unification', 'dans', 'le', 'cadre', 'de', "l'", 'OTAN', 'et', 'du', 'pacte', 'de', 'Varsovie', ',', '<CONN>mais</CONN>', 'il', 'serait', 'prêt', 'à', "l'", 'accepter', 'uniquement', '<CONN>dans', 'le', 'cas', 'où</CONN>', 'ces', 'alliances', 'disparaîtraient', '.', '.']
normal token: Il
normal token: refuse
normal token: toute
normal token: unification
normal token: dans
normal token: le
normal token: cadre
normal token: de
normal token: l'
normal token: OTAN
normal token: et
normal token: du
normal token: pacte
normal token: de
normal token: Varsovie
normal token: ,
single worded connector: <CONN>mais</CONN>
normal token: il
normal token: serait
normal token: prêt
normal token: à
normal token: 

normal token: orthodoxe
normal token: .
['Un', 'pas', 'en', 'avant', 'pour', 'deux', 'pas', 'en', 'arrière', 'est', 'léninisme', 'orthodoxe', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Les', 'Français', 'sont', 'beaucoup', 'plus', 'conscients', 'de', 'cette', 'menace', 'que', 'nous', ',', 'aux', 'Etats', '-', 'Unis', ',', '<CONN>et</CONN>', 'la', 'première', 'des', 'démarches', 'pour', 'nos', 'deux', 'pays', 'est', "d'", 'agir', 'de', 'concert', '<CONN>afin', 'de</CONN>', 'réussir', 'à', 'la', 'contrecarrer', '.']
normal token: Les
normal token: Français
normal token: sont
normal token: beaucoup
normal token: plus
normal token: conscients
normal token: de
normal token: cette
normal token: menace
normal token: que
normal token: nous
normal token: ,
normal token: aux
normal token: Etats
normal token: -
normal token: Unis
normal token: ,
single worded connector: <CONN>et</CONN>
normal token: la
normal token: première
normal token: des
normal token: démarches
normal token: pour
normal

['Une', 'tête', 'de', 'pont', "qu'", 'il', 'qualifie', 'de', '"', 'irremplaçable', '"', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Connaissance', 'et', 'appréciation', 'des', 'hommes', ',', 'mais', 'aussi', 'du', 'monde', 'des', 'transports', ':', '"', "qu'", 'il', "s'", 'agisse', "d'", 'avions', 'ou', "d'", 'un', 'système', 'de', 'transport', 'ferroviaire', 'dans', 'un', 'tunnel', ',', "l'", 'approche', 'reste', 'la', 'même', 'au', 'plan', 'commercial', '"', ',', 'affirme', 'sans', 'crainte', 'du', 'paradoxe', 'apparent', 'Nicolas', 'Gorodiche', ',', 'qui', 'a', 'introduit', 'entre', 'Paris', 'et', 'Londres', 'huit', 'liaisons', 'aériennes', 'nouvelles', '.']
normal token: Connaissance
normal token: et
normal token: appréciation
normal token: des
normal token: hommes
normal token: ,
normal token: mais
normal token: aussi
normal token: du
normal token: monde
normal token: des
normal token: transports
normal token: :
normal token: "
normal token: qu'
normal token: il
normal token: s'


normal token: les
normal token: professionnels
normal token: du
normal token: transport
normal token: routier
normal token: ,
normal token: y
normal token: compris
single worded connector: <CONN>en</CONN>
normal token: les
normal token: invitant
normal token: ,
single worded connector: <CONN>comme</CONN>
normal token: il
normal token: le
normal token: fait
normal token: ,
normal token: sur
normal token: les
normal token: terminaux
normal token: .
['Déjà', ',', 'il', "s'", 'agit', ',', 'en', 'les', 'rencontrant', ',', 'de', 'séduire', 'les', 'professionnels', 'du', 'transport', 'routier', ',', 'y', 'compris', 'en', 'les', 'invitant', ',', 'comme', 'il', 'le', 'fait', ',', 'sur', 'les', 'terminaux', '.']
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

['<CONN>Et</CONN><CONN>puis</CONN>', '-', 'on', 'assure', 'y', 'réfléchir', 'au', 'siège', 'de', 'la', 'société', ',', '-', 'il', 'y', 'a', 'les', '"', 'potentialités', 'toutes', 'neuves'

normal token: voir
normal token: les
normal token: émeutiers
normal token: maitres
normal token: des
normal token: rues
normal token: .
['Toutes', 'les', 'bribes', "d'", 'informations', 'laissent', "d'", 'ailleurs', 'voir', 'les', 'émeutiers', 'maitres', 'des', 'rues', '.']
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]

['Un', 'millier', "d'", 'appartements', 'ont', 'été', 'pillés', '.']
normal token: Un
normal token: millier
normal token: d'
normal token: appartements
normal token: ont
normal token: été
normal token: pillés
normal token: .
['Un', 'millier', "d'", 'appartements', 'ont', 'été', 'pillés', '.']
[0, 0, 0, 0, 0, 0, 0, 0]

['Les', 'Izvestia', 'décrivent', 'des', 'groupes', 'de', 'dix', '-', 'quinze', 'hommes', 'arrêtant', 'des', 'camions', 'sous', 'la', 'menace', '<CONN>pour</CONN>', 'y', 'charger', 'leurs', 'sacs', 'de', 'butin', '.']
normal token: Les
normal token: Izvestia
normal token: décrivent
normal token: des
normal token: groupes
normal token: de
normal token: dix
n

normal token: Il
normal token: faut
normal token: ,
normal token: dit
normal token: -on
normal token: ,
normal token: former
normal token: 1
normal token: 600
normal token: 000
normal token: ou
normal token: 2
normal token: 000
normal token: 000
normal token: d'
normal token: étudiants
normal token: dans
normal token: les
normal token: années
normal token: 2000
normal token: .
['Il', 'faut', ',', 'dit', '-on', ',', 'former', '1', '600', '000', 'ou', '2', '000', '000', "d'", 'étudiants', 'dans', 'les', 'années', '2000', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['<CONN>Mais</CONN>', 'quels', 'sont', 'les', 'objectifs', 'et', 'les', 'niveaux', 'de', 'formation', '?']
single worded connector: <CONN>Mais</CONN>
normal token: quels
normal token: sont
normal token: les
normal token: objectifs
normal token: et
normal token: les
normal token: niveaux
normal token: de
normal token: formation
normal token: ?
['Mais', 'quels', 'sont', 'les', 'objectifs', 'et', 'les', '

normal token: présent
normal token: ,
normal token: eu
normal token: tendance
normal token: à
normal token: privilégier
normal token: l'
normal token: étude
normal token: de
normal token: leur
normal token: environnement
normal token: technique
normal token: et
normal token: économique
normal token: :
normal token: incidence
normal token: des
normal token: nouvelles
normal token: technologies
normal token: sur
normal token: les
normal token: rédactions
normal token: ,
normal token: poids
normal token: des
normal token: groupes
normal token: multimédias
normal token: sur
normal token: la
normal token: confection
normal token: de
normal token: l'
normal token: information
normal token: ...
['La', 'plupart', 'des', 'recherches', 'sur', 'les', 'entreprises', 'de', 'presse', 'ont', ',', "jusqu'", 'à', 'présent', ',', 'eu', 'tendance', 'à', 'privilégier', "l'", 'étude', 'de', 'leur', 'environnement', 'technique', 'et', 'économique', ':', 'incidence', 'des', 'nouvelles', 'technologies', 'sur'

['À', 'noter', 'cependant', "qu'", 'à', "l'", 'instabilité', 'des', 'équipes', 'dirigeantes', 'correspond', 'une', 'stabilité', 'étonnante', 'des', 'animateurs', '-', 'vedettes', ',', 'mais', 'que', "s'", 'instaure', 'ainsi', 'entre', 'eux', 'un', 'décalage', 'dangereux', 'à', "l'", 'origine', 'de', 'nouveaux', 'conflits', 'sur', "l'", 'exercice', 'effectif', 'du', 'pouvoir', '(7)', '.']
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['À', 'un', 'niveau', 'supérieur', '(', 'celui', 'de', 'la', 'séparation', 'des', 'pouvoirs', 'dans', "l'", 'état', ')', ',', 'de', 'tels', 'conflits', 'existent', '-ils', 'encore', '?']
normal token: À
normal token: un
normal token: niveau
normal token: supérieur
normal token: (
normal token: celui
normal token: de
normal token: la
normal token: séparation
normal token: des
normal token: pouvoirs
normal token: dans
normal token: l'
normal token: état
normal token: )
normal

normal token: dire
normal token: -
normal token: esthéticien
normal token: californien
normal token: qui
normal token: a
normal token: versé
normal token: 100
normal token: 000
normal token: dollars
normal token: d'
normal token: acompte
normal token: (
normal token: sur
normal token: 400
normal token: 000
normal token: )
normal token: ,
normal token: soit
normal token: environ
normal token: 570
normal token: 000
normal token: F
normal token: sur
normal token: 2
normal token: 300
normal token: 000
normal token: ,
normal token: à
normal token: un
normal token: garagiste
normal token: local
normal token: pour
normal token: la
normal token: commande
normal token: du
normal token: bolide
normal token: .
['Ainsi', 'cet', 'infortuné', '-', 'si', "l'on", 'peut', 'dire', '-', 'esthéticien', 'californien', 'qui', 'a', 'versé', '100', '000', 'dollars', "d'", 'acompte', '(', 'sur', '400', '000', ')', ',', 'soit', 'environ', '570', '000', 'F', 'sur', '2', '300', '000', ',', 'à', 'un', 'garagiste',

["Jusqu'", 'à', 'leur', 'installation', ',', 'au', 'printemps', 'prochain', ',', 'dans', 'un', 'bâtiment', 'en', 'cours', "d'", 'achèvement', ',', '15', ',', 'rue', 'Falguière', ',', 'près', 'de', 'la', 'gare', 'Montparnasse', ',', 'la', 'rédaction', ',', 'la', 'documentation', 'et', 'une', 'partie', 'des', 'services', 'de', 'fabrication', 'du', 'journal', 'restent', 'rue', 'des', 'Italiens', '.']
normal token: Jusqu'
normal token: à
normal token: leur
normal token: installation
normal token: ,
normal token: au
normal token: printemps
normal token: prochain
normal token: ,
normal token: dans
normal token: un
normal token: bâtiment
normal token: en
normal token: cours
normal token: d'
normal token: achèvement
normal token: ,
normal token: 15
normal token: ,
normal token: rue
normal token: Falguière
normal token: ,
normal token: près
normal token: de
normal token: la
normal token: gare
normal token: Montparnasse
normal token: ,
normal token: la
normal token: rédaction
normal token: ,
nor

['À', 'compter', 'de', 'la', 'fin', 'de', 'cette', 'semaine', ',', 'quelque', '80', '000', 'gendarmes', "d'", 'active', 'en', 'France', 'vont', 'être', 'appelés', 'à', 'choisir', ',', 'selon', 'des', 'procédures', 'originales', ',', 'les', 'membres', 'des', 'structures', 'de', 'participation', 'nouvellement', 'créées', 'pour', 'donner', 'un', 'avis', 'sur', 'les', 'conditions', 'de', 'vie', 'et', 'de', 'travail', 'dans', 'cette', 'arme', 'dont', 'la', 'gestion', 'relève', 'du', 'ministère', 'de', 'la', 'défense', '.']
normal token: À
normal token: compter
normal token: de
normal token: la
normal token: fin
normal token: de
normal token: cette
normal token: semaine
normal token: ,
normal token: quelque
normal token: 80
normal token: 000
normal token: gendarmes
normal token: d'
normal token: active
normal token: en
normal token: France
normal token: vont
normal token: être
normal token: appelés
normal token: à
normal token: choisir
normal token: ,
normal token: selon
normal token: des
no

normal token: l'
normal token: institut
normal token: d'
normal token: histoire
normal token: des
normal token: sciences
normal token: ,
normal token: à
normal token: Paris
normal token: .
['Entre', '1937', 'et', '1939', ',', 'il', 'travaille', 'à', "l'", 'institut', "d'", 'histoire', 'des', 'sciences', ',', 'à', 'Paris', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['En', '1940', ',', 'avec', 'sa', 'femme', 'Fanny', 'et', 'leur', 'fils', 'Uri', ',', 'il', 'quitte', 'Marseille', 'pour', 'Jérusalem', ',', 'par', 'le', 'dernier', 'bateau', '.']
normal token: En
normal token: 1940
normal token: ,
normal token: avec
normal token: sa
normal token: femme
normal token: Fanny
normal token: et
normal token: leur
normal token: fils
normal token: Uri
normal token: ,
normal token: il
normal token: quitte
normal token: Marseille
normal token: pour
normal token: Jérusalem
normal token: ,
normal token: par
normal token: le
normal token: dernier
normal token: bateau
normal token: .
['E

normal token: Bulgarie
normal token: ,
normal token: en
normal token: particulier
normal token: du
normal token: multipartisme
normal token: ,
normal token: de
normal token: la
normal token: séparation
normal token: effective
normal token: des
normal token: fonctions
normal token: du
normal token: Parti
normal token: communiste
normal token: et
normal token: de
normal token: l'
normal token: état
normal token: et
normal token: de
normal token: la
normal token: dissolution
normal token: des
normal token: cellules
normal token: du
normal token: PC
normal token: dans
normal token: les
normal token: administrations
normal token: et
normal token: les
normal token: entreprises
normal token: .
['Initialement', ',', 'les', 'deux', 'délégations', 'auraient', 'dû', 'discuter', 'mardi', 'du', 'système', 'politique', 'en', 'Bulgarie', ',', 'en', 'particulier', 'du', 'multipartisme', ',', 'de', 'la', 'séparation', 'effective', 'des', 'fonctions', 'du', 'Parti', 'communiste', 'et', 'de', "l'", 'état

normal token: agroalimentaire
normal token: britannique
normal token: et
normal token: le
normal token: troisième
normal token: producteur
normal token: français
normal token: de
normal token: cognac
normal token: représentera
normal token: un
normal token: chiffre
normal token: d'
normal token: affaires
normal token: de
normal token: plus
normal token: de
normal token: 6
normal token: milliards
normal token: de
normal token: francs
normal token: .
["L'", 'entité', 'qui', 'résultera', 'de', 'cette', 'fusion', 'entre', 'le', 'géant', 'de', "l'", 'agroalimentaire', 'britannique', 'et', 'le', 'troisième', 'producteur', 'français', 'de', 'cognac', 'représentera', 'un', 'chiffre', "d'", 'affaires', 'de', 'plus', 'de', '6', 'milliards', 'de', 'francs', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['<CONN>Mais</CONN>', 'cet', 'accord', 'permet', 'enfin', 'à', 'Grand', 'Met', ',', 'qui', 'avait', 'échoué', 'en', '1988', 'dans', 's

normal token: "
normal token: entamée
normal token: depuis
normal token: la
normal token: mi
normal token: -
normal token: décembre
normal token: à
normal token: la
normal token: BNP
normal token: ,
normal token: où
normal token: la
normal token: situation
normal token: est
normal token: toujours
normal token: bloquée
normal token: .
['Malgré', 'leur', 'désaccord', ',', 'FO', 'et', 'la', 'CFDT', 'se', 'mobiliseront', 'ensemble', 'dans', 'le', 'secteur', 'bancaire', ',', 'le', '24', 'janvier', ',', 'pour', '"', 'élargir', "l'", 'action', '"', 'entamée', 'depuis', 'la', 'mi', '-', 'décembre', 'à', 'la', 'BNP', ',', 'où', 'la', 'situation', 'est', 'toujours', 'bloquée', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Le', 'Club', 'Méditerranée', ',', 'numéro', 'un', 'mondial', 'des', 'villages', 'de', 'vacances', ',', 'a', 'réalisé', ',', 'en', '1989', ',', 'un', 'bénéfice', 'consolidé', 'de', '408',

normal token: Mr
normal token: Yoshua
normal token: Amishav
normal token: ,
normal token: porte
normal token: -
normal token: parole
normal token: ,
normal token: "
normal token: ont
normal token: été
normal token: réprimandés
normal token: par
normal token: le
normal token: directeur
normal token: général
normal token: du
normal token: ministère
normal token: des
normal token: affaires
normal token: étrangères
normal token: pour
normal token: avoir
normal token: agi
normal token: contrairement
normal token: aux
normal token: règles
normal token: de
normal token: bonne
normal token: administration
normal token: et
normal token: aux
normal token: règlements
normal token: du
normal token: service
normal token: public
normal token: d'
normal token: Israël
normal token: ,
normal token: ainsi
normal token: que
normal token: négligé
normal token: leurs
normal token: fonctions
normal token: et
normal token: leur
normal token: rang
normal token: "
normal token: .
['Répondant', 'à', 'une', 'que

normal token: moitié
normal token: des
normal token: doses
normal token: d'
normal token: AZT
normal token: ,
normal token: un
normal token: médicament
normal token: antiviral
normal token: prescrit
normal token: dans
normal token: le
normal token: traitement
normal token: du
normal token: sida
normal token: ,
multi worded connector: <CONN>afin
multi worded connector: d'</CONN>
normal token: en
normal token: réduire
normal token: les
normal token: effets
normal token: secondaires
normal token: et
normal token: le
normal token: coût
normal token: .
['La', 'Food', 'and', 'drug', 'administration', '(', 'FDA', ')', 'américaine', 'a', 'décidé', ',', 'mardi', '16', 'janvier', ',', 'de', 'recommander', 'la', 'diminution', 'de', 'moitié', 'des', 'doses', "d'", 'AZT', ',', 'un', 'médicament', 'antiviral', 'prescrit', 'dans', 'le', 'traitement', 'du', 'sida', ',', 'afin', "d'", 'en', 'réduire', 'les', 'effets', 'secondaires', 'et', 'le', 'coût', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

normal token: des
normal token: états
normal token: membres
single worded connector: <CONN>et</CONN>
normal token: obligeront
normal token: ceux
normal token: -
normal token: ci
normal token: à
normal token: respecter
normal token: un
normal token: minimum
normal token: de
normal token: "
normal token: bonne
normal token: tenue
normal token: "
normal token: ,
normal token: surtout
normal token: en
normal token: matière
normal token: de
normal token: politique
normal token: budgétaire
normal token: .
['Dans', 'les', 'secteurs', 'importants', ',', 'des', 'dispositions', 'obligatoires', 'complémentaires', 'se', 'révèleront', 'donc', 'nécessaires', ',', 'qui', 'gêneront', 'ou', 'empêcheront', 'les', 'attitudes', 'individualistes', 'des', 'états', 'membres', 'et', 'obligeront', 'ceux', '-', 'ci', 'à', 'respecter', 'un', 'minimum', 'de', '"', 'bonne', 'tenue', '"', ',', 'surtout', 'en', 'matière', 'de', 'politique', 'budgétaire', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,

normal token: masse
normal token: monétaire
normal token: seulement
normal token: quelques
normal token: mois
normal token: après
normal token: l'
normal token: entrée
normal token: en
normal token: vigueur
normal token: de
normal token: ces
normal token: mesures
normal token: ,
single worded connector: <CONN>et</CONN>
normal token: les
normal token: variations
normal token: de
normal token: la
normal token: masse
normal token: monétaire
normal token: n'
normal token: agissent
normal token: que
normal token: quelques
normal token: trimestres
normal token: plus
normal token: tard
normal token: sur
normal token: l'
normal token: évolution
normal token: des
normal token: prix
normal token: .
['Les', 'interventions', 'sur', 'les', 'taux', "d'", 'intérêt', 'exercent', 'leur', 'effet', 'sur', 'la', 'masse', 'monétaire', 'seulement', 'quelques', 'mois', 'après', "l'", 'entrée', 'en', 'vigueur', 'de', 'ces', 'mesures', ',', 'et', 'les', 'variations', 'de', 'la', 'masse', 'monétaire', "n'", 'ag

normal token: Bundesbank
normal token: .
['La', 'politique', 'monétaire', 'est', 'indivisible', ',', 'même', 'dans', 'un', 'système', 'fédéral', 'comme', 'celui', 'de', 'la', 'Fed', 'ou', 'de', 'la', 'Bundesbank', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Sans', 'ce', 'monopole', 'le', 'système', 'européen', 'de', 'banques', 'centrales', 'serait', 'un', '"', 'lion', 'sans', 'griffes', '"', '.']
normal token: Sans
normal token: ce
normal token: monopole
normal token: le
normal token: système
normal token: européen
normal token: de
normal token: banques
normal token: centrales
normal token: serait
normal token: un
normal token: "
normal token: lion
normal token: sans
normal token: griffes
normal token: "
normal token: .
['Sans', 'ce', 'monopole', 'le', 'système', 'européen', 'de', 'banques', 'centrales', 'serait', 'un', '"', 'lion', 'sans', 'griffes', '"', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Un', 'système', 'qui', 'ne', 'sert', "qu'",

normal token: partie
normal token: de
normal token: leur
normal token: souveraineté
normal token: ,
single worded connector: <CONN>comme</CONN>
normal token: ils
normal token: l'
normal token: ont
normal token: déjà
normal token: fait
normal token: dans
normal token: d'
normal token: autres
normal token: domaines
normal token: ,
normal token: moins
normal token: importants
normal token: il
normal token: est
normal token: vrai
normal token: .
['Mais', 'il', 'est', 'très', 'concevable', 'à', 'mon', 'avis', "qu'", 'un', 'système', 'européen', 'de', 'banques', 'centrales', 'indépendant', 'existe', 'déjà', 'avant', 'que', 'ne', 'soit', 'parachevée', "l'", 'union', 'politique', ',', 'à', 'condition', 'que', 'les', 'gouvernements', 'le', 'veuillent', 'et', "qu'", 'ils', 'soient', 'prêts', 'à', 'abandonner', 'une', 'partie', 'de', 'leur', 'souveraineté', ',', 'comme', 'ils', "l'", 'ont', 'déjà', 'fait', 'dans', "d'", 'autres', 'domaines', ',', 'moins', 'importants', 'il', 'est', 'vrai', '.']
[

normal token: conférence
normal token: intergouvernementale
normal token: ,
normal token: dont
normal token: les
normal token: travaux
normal token: dureront
normal token: probablement
normal token: un
normal token: certain
normal token: temps
normal token: .
['Dans', 'ce', 'cadre', ',', 'on', 'peut', 'et', 'on', 'doit', 'parvenir', 'dès', "aujourd'", 'hui', 'à', 'une', 'meilleure', 'coordination', 'des', 'politiques', 'économiques', ',', 'budgétaires', 'et', 'monétaires', ',', 'sans', 'attendre', 'les', 'résultats', 'de', 'la', 'conférence', 'intergouvernementale', ',', 'dont', 'les', 'travaux', 'dureront', 'probablement', 'un', 'certain', 'temps', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['En', 'ce', 'qui', 'concerne', 'la', 'politique', 'monétaire', ',', 'le', 'comité', 'des', 'gouverneurs', 'des', 'banques', 'centrales', 'a', 'décidé', ',', 'lors', 'de', 'sa', 'réunion', 'en', 'janvier', 

normal token: plan
normal token: de
normal token: développement
normal token: de
normal token: l'
normal token: enseignement
normal token: supérieur
normal token: ,
normal token: sur
normal token: la
normal token: part
normal token: que
normal token: doivent
normal token: prendre
normal token: les
normal token: collectivités
normal token: locales
normal token: dans
normal token: les
normal token: décisions
normal token: et
normal token: les
normal token: investissements
normal token: .
['Mr', 'Jospin', 'a', 'insisté', ',', 'en', 'présentant', 'son', 'plan', 'de', 'développement', 'de', "l'", 'enseignement', 'supérieur', ',', 'sur', 'la', 'part', 'que', 'doivent', 'prendre', 'les', 'collectivités', 'locales', 'dans', 'les', 'décisions', 'et', 'les', 'investissements', '.']
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['<CONN>Et</CONN>', 'le', 'gouvernement', 'a', 'décidé', "d'", 'accorder', 'la', 'maîtrise', "d'", 'ouvrage', 'aux', 'c

normal token: de
normal token: l'
normal token: outil
normal token: de
normal token: dissuasion
normal token: "
normal token: ,
normal token: avec
normal token: la
normal token: présence
normal token: à
normal token: l'
normal token: île
normal token: -
normal token: Longue
normal token: des
normal token: sous
normal token: -
normal token: marins
normal token: stratégiques
normal token: ,
single worded connector: <CONN>et</CONN>
normal token: elle
normal token: a
normal token: participé
single worded connector: <CONN>néanmoins</CONN>
normal token: ,
normal token: en
normal token: 1989
normal token: ,
normal token: à
normal token: toutes
normal token: les
normal token: opérations
normal token: menées
normal token: hors
normal token: du
normal token: territoire
normal token: national
normal token: "
normal token: où
normal token: que
normal token: ce
normal token: soit
normal token: ,
normal token: y
normal token: compris
normal token: à
normal token: terre
normal token: "
normal token: 

normal token: le
normal token: précédent
normal token: .
['Il', 'est', 'plus', 'modeste', 'et', 'plus', 'axé', 'sur', 'les', 'loisirs', 'que', 'le', 'précédent', '.']
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Arthur', 'Wilmans', 'se', 'montre', 'ambitieux', ':', '"', 'Hagetmau', 'sera', 'un', 'paradis', 'pour', 'les', 'crocodiles', '"', ',', 'affirme', '-t-il', '.']
normal token: Arthur
normal token: Wilmans
normal token: se
normal token: montre
normal token: ambitieux
normal token: :
normal token: "
normal token: Hagetmau
normal token: sera
normal token: un
normal token: paradis
normal token: pour
normal token: les
normal token: crocodiles
normal token: "
normal token: ,
normal token: affirme
normal token: -t-il
normal token: .
['Arthur', 'Wilmans', 'se', 'montre', 'ambitieux', ':', '"', 'Hagetmau', 'sera', 'un', 'paradis', 'pour', 'les', 'crocodiles', '"', ',', 'affirme', '-t-il', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Il', 'escompte', 'plus', 'de', 'vi

normal token: apparemment
normal token: pas
normal token: le
normal token: cas
normal token: .
['Mais', 'ce', 'ne', 'sera', 'apparemment', 'pas', 'le', 'cas', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0]

['Mme', 'Neiertz', 'a', '<CONN>en', 'effet</CONN>', 'assuré', 'à', 'cette', 'occasion', 'que', 'les', 'règles', "d'", 'attribution', 'des', 'stations', 'autoroutières', 'devraient', 'être', 'bientôt', 'modifiées', '<CONN>de', 'façon', 'à</CONN>', 'permettre', 'à', 'tous', 'les', 'opérateurs', '(', 'raffineurs', ',', 'distributeurs', 'indépendants', ',', 'grandes', 'surfaces', ')', 'de', 'postuler', '.']
normal token: Mme
normal token: Neiertz
normal token: a
multi worded connector: <CONN>en
multi worded connector: effet</CONN>
normal token: assuré
normal token: à
normal token: cette
normal token: occasion
normal token: que
normal token: les
normal token: règles
normal token: d'
normal token: attribution
normal token: des
normal token: stations
normal token: autoroutières
normal token: devraient
n

normal token: la
normal token: capitalisation
normal token: boursière
normal token: ,
normal token: qui
normal token: passe
normal token: de
normal token: 23
normal token: ,
normal token: 2
normal token: milliards
normal token: de
normal token: francs
normal token: en
normal token: 1988
normal token: à
normal token: 31
normal token: ,
normal token: 8
normal token: milliards
normal token: en
normal token: 1989
normal token: ,
normal token: dont
normal token: 20
normal token: ,
normal token: 2
normal token: milliards
normal token: pour
normal token: les
normal token: seules
normal token: actions
normal token: (
normal token: +
normal token: 78
normal token: ,
normal token: 6
normal token: %
normal token: )
normal token: .
['Il', 'convient', 'également', 'de', 'noter', "l'", 'importante', 'progression', 'de', 'la', 'capitalisation', 'boursière', ',', 'qui', 'passe', 'de', '23', ',', '2', 'milliards', 'de', 'francs', 'en', '1988', 'à', '31', ',', '8', 'milliards', 'en', '1989', ',', 'dont'

normal token: à
normal token: proposer
normal token: ,
normal token: c'
normal token: est
normal token: de
normal token: mettre
normal token: au
normal token: service
normal token: de
normal token: la
normal token: paix
normal token: les
normal token: alliances
normal token: de
normal token: la
normal token: guerre
normal token: froide
normal token: .
['Moyennant', 'quoi', ',', 'tout', 'ce', 'que', 'son', 'maître', 'George', 'Bush', 'a', 'trouvé', 'à', 'proposer', ',', "c'", 'est', 'de', 'mettre', 'au', 'service', 'de', 'la', 'paix', 'les', 'alliances', 'de', 'la', 'guerre', 'froide', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Gorbatchev', ',', 'à', 'Malte', ',', 'a', 'sauté', 'sur', "l'", 'occasion', '.']
normal token: Gorbatchev
normal token: ,
normal token: à
normal token: Malte
normal token: ,
normal token: a
normal token: sauté
normal token: sur
normal token: l'
normal token: occasion
normal token: .
['Gorbatchev', ',', 'à

['Autrement', 'dit', ',', 'que', 'le', 'degré', 'de', 'contrainte', 'résultant', 'pour', 'un', 'Etat', 'de', 'son', 'appartenance', 'à', 'une', 'confédération', 'est', 'infiniment', 'moindre', 'que', 'celui', "qu'", 'implique', "l'", 'entrée', 'dans', 'une', 'fédération', '.']
normal token: Autrement
normal token: dit
normal token: ,
normal token: que
normal token: le
normal token: degré
normal token: de
normal token: contrainte
normal token: résultant
normal token: pour
normal token: un
normal token: Etat
normal token: de
normal token: son
normal token: appartenance
normal token: à
normal token: une
normal token: confédération
normal token: est
normal token: infiniment
normal token: moindre
normal token: que
normal token: celui
normal token: qu'
normal token: implique
normal token: l'
normal token: entrée
normal token: dans
normal token: une
normal token: fédération
normal token: .
['Autrement', 'dit', ',', 'que', 'le', 'degré', 'de', 'contrainte', 'résultant', 'pour', 'un', 'Etat', '

[0, 0, 0, 0, 0, 0, 0, 0, 0]

['Elle', 'a', 'rempli', 'des', 'fiches', 'et', 'des', 'papiers', '.']
normal token: Elle
normal token: a
normal token: rempli
normal token: des
normal token: fiches
normal token: et
normal token: des
normal token: papiers
normal token: .
['Elle', 'a', 'rempli', 'des', 'fiches', 'et', 'des', 'papiers', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0]

['Elle', 'a', 'consulté', 'un', 'tas', "d'", 'annuaires', 'et', 'de', 'bouquins', ',', '<CONN>histoire', 'de</CONN>', 'vérifier', 'le', 'tour', 'de', 'taille', ',', 'le', 'numéro', 'de', 'sécurité', 'sociale', 'et', 'le', 'nom', 'de', 'jeune', 'fille', 'de', 'la', 'destinataire', '.']
normal token: Elle
normal token: a
normal token: consulté
normal token: un
normal token: tas
normal token: d'
normal token: annuaires
normal token: et
normal token: de
normal token: bouquins
normal token: ,
multi worded connector: <CONN>histoire
multi worded connector: de</CONN>
normal token: vérifier
normal token: le
normal token: tour
normal to

normal token: Jacky
normal token: Ickx
normal token: ,
normal token: lui
normal token: ,
normal token: a
normal token: trouvé
normal token: une
normal token: faille
normal token: non
normal token: inscrite
normal token: sur
normal token: les
normal token: cartes
normal token: .
['Mais', 'le', 'Belge', 'Jacky', 'Ickx', ',', 'lui', ',', 'a', 'trouvé', 'une', 'faille', 'non', 'inscrite', 'sur', 'les', 'cartes', '.']
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Nous', 'avons', 'réussi', 'à', 'glisser', 'la', 'Lada', '-', 'Poch', 'dans', 'une', 'saignée', 'de', '3', 'mètres', 'de', 'large', ',', 'raconte', 'Christian', 'Tarin', '.']
normal token: Nous
normal token: avons
normal token: réussi
normal token: à
normal token: glisser
normal token: la
normal token: Lada
normal token: -
normal token: Poch
normal token: dans
normal token: une
normal token: saignée
normal token: de
normal token: 3
normal token: mètres
normal token: de
normal token: large
normal token: ,
normal token: ra

['Les', 'autorités', 'municipales', 'de', 'Saragosse', ',', 'ont', 'décrété', 'un', 'deuil', 'officiel', 'de', 'trois', 'jours', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["L'", 'éditeur', 'André', 'Balland', 'vient', 'de', 'quitter', 'la', 'maison', "d'", 'édition', 'du', 'même', 'nom', "qu'", 'il', 'avait', 'créée', 'en', '1967', '<CONN>pour</CONN>', 'se', 'consacrer', 'désormais', 'à', 'la', 'production', 'cinématographique', 'via', 'la', 'société', 'DB', 'Films', 'fondée', 'par', 'son', 'épouse', '.']
normal token: L'
normal token: éditeur
normal token: André
normal token: Balland
normal token: vient
normal token: de
normal token: quitter
normal token: la
normal token: maison
normal token: d'
normal token: édition
normal token: du
normal token: même
normal token: nom
normal token: qu'
normal token: il
normal token: avait
normal token: créée
normal token: en
normal token: 1967
single worded connector: <CONN>pour</CONN>
normal token: se
normal token: consacrer
normal token:

normal token: des
normal token: scènes
normal token: de
normal token: films
normal token: anciens
normal token: que
normal token: les
normal token: héros
normal token: regardent
normal token: au
normal token: cinéma
normal token: ou
normal token: à
normal token: la
normal token: télévision
normal token: .
['Il', 'le', 'fait', 'en', 'cinéphile', ',', 'en', 'insérant', '(', 'ce', 'qui', 'commence', 'à', 'se', 'faire', 'trop', 'souvent', ')', 'des', 'scènes', 'de', 'films', 'anciens', 'que', 'les', 'héros', 'regardent', 'au', 'cinéma', 'ou', 'à', 'la', 'télévision', '.']
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Par', 'maladresse', 'ou', 'délibérément', ',', 'il', 'enchaîne', 'des', 'séquences', 'statiques', '<CONN>et</CONN>', 'laisse', 'de', 'longs', 'silences', 'entre', 'les', 'répliques', ',', 'comme', 'au', 'début', 'du', 'parlant', '.']
normal token: Par
normal token: maladresse
normal token: ou
normal token: délibérémen

normal token: pays
normal token: réunies
normal token: à
normal token: Santa
normal token: Cruz
normal token: ,
normal token: en
normal token: Bolivie
normal token: ,
normal token: sera
normal token: annoncé
normal token: officiellement
normal token: lors
normal token: du
normal token: sommet
normal token: qui
normal token: réunira
normal token: ,
normal token: le
normal token: 15
normal token: février
normal token: à
normal token: Cartagena
normal token: ,
normal token: en
normal token: Colombie
normal token: ,
normal token: le
normal token: président
normal token: George
normal token: Bush
normal token: et
normal token: les
normal token: chefs
normal token: d'
normal token: état
normal token: des
normal token: trois
normal token: pays
normal token: producteurs
normal token: de
normal token: cocaine
normal token: .
["L'", 'accord', ',', 'mis', 'au', 'point', 'par', 'des', 'délégations', 'des', 'quatre', 'pays', 'réunies', 'à', 'Santa', 'Cruz', ',', 'en', 'Bolivie', ',', 'sera', 'annon

normal token: gouvernement
normal token: israélien
normal token: souhaite
normal token: que
normal token: l'
normal token: administration
normal token: américaine
normal token: lui
normal token: garantisse
normal token: une
normal token: série
normal token: de
normal token: prêts
normal token: bancaires
normal token: -
normal token: 400
normal token: à
normal token: 500
normal token: millions
normal token: de
normal token: dollars
normal token: -
normal token: destinés
normal token: à
normal token: financer
normal token: des
normal token: logements
normal token: pour
normal token: les
normal token: Juifs
normal token: soviétiques
normal token: .
['La', 'déclaration', 'de', 'Mr', 'Shamir', 'intervient', 'au', 'moment', 'où', 'le', 'gouvernement', 'israélien', 'souhaite', 'que', "l'", 'administration', 'américaine', 'lui', 'garantisse', 'une', 'série', 'de', 'prêts', 'bancaires', '-', '400', 'à', '500', 'millions', 'de', 'dollars', '-', 'destinés', 'à', 'financer', 'des', 'logements', 'p

normal token: nous
normal token: vivons
normal token: .
['Tous', 'appellent', 'une', 'nouvelle', 'civilisation', 'pleinement', 'humaine', ',', 'en', 'cette', 'heure', 'privilégiée', 'que', 'nous', 'vivons', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Cet', 'immense', 'espoir', 'de', "l'", 'humanité', 'ne', 'doit', 'pas', 'être', 'déçu', '.']
normal token: Cet
normal token: immense
normal token: espoir
normal token: de
normal token: l'
normal token: humanité
normal token: ne
normal token: doit
normal token: pas
normal token: être
normal token: déçu
normal token: .
['Cet', 'immense', 'espoir', 'de', "l'", 'humanité', 'ne', 'doit', 'pas', 'être', 'déçu', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['"', 'Présentant', 'ses', 'voeux', 'à', 'la', 'presse', 'vendredi', '12', 'janvier', ',', 'Mr', 'Georges', 'Sarre', ',', 'secrétaire', "d'", 'état', 'aux', 'transports', 'terrestres', 'et', 'fluviaux', 'et', 'président', 'du', 'groupe', 'socialiste', 'au', 'Conseil', 'de', 'Paris', 

normal token: 1971
normal token: à
normal token: la
normal token: tête
normal token: du
normal token: parti
normal token: communiste
normal token: (
normal token: SED
normal token: )
normal token: ,
normal token: avait
normal token: été
normal token: contraint
normal token: de
normal token: démissionner
normal token: de
normal token: ses
normal token: fonctions
normal token: le
normal token: 18
normal token: octobre
normal token: 1989
normal token: .
['Mr', 'Honecker', ',', 'qui', 'avait', 'succédé', 'à', 'Walter', 'Ulbricht', 'en', '1971', 'à', 'la', 'tête', 'du', 'parti', 'communiste', '(', 'SED', ')', ',', 'avait', 'été', 'contraint', 'de', 'démissionner', 'de', 'ses', 'fonctions', 'le', '18', 'octobre', '1989', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Début', 'décembre', ',', 'il', 'avait', 'été', 'assigné', 'à', 'résidence', 'pour', '"', 'corruption', 'et', 'abus', 'de', 'pouvoir', '"', '.']
normal token: Début


normal token: presse
normal token: en
normal token: Espagne
normal token: ,
normal token: SGEL
normal token: ,
normal token: la
normal token: deuxième
normal token: société
normal token: du
normal token: secteur
normal token: ,
normal token: et
normal token: une
normal token: régie
normal token: publicitaire
normal token: ,
normal token: Hachette
normal token: Interdeco
normal token: SA
normal token: ,
normal token: créée
normal token: en
normal token: novembre
normal token: 1989
normal token: par
normal token: le
normal token: groupe
normal token: de
normal token: presse
normal token: (
normal token: 63
normal token: %
normal token: )
normal token: et
normal token: la
normal token: régie
normal token: Interdeco
normal token: (
normal token: 27
normal token: %
normal token: )
normal token: .
['Mais', 'Hachette', 'a', 'aussi', 'une', 'filiale', 'distribution', 'de', 'presse', 'en', 'Espagne', ',', 'SGEL', ',', 'la', 'deuxième', 'société', 'du', 'secteur', ',', 'et', 'une', 'régie', 'pub

normal token: la
normal token: réunion
normal token: des
normal token: représentants
normal token: des
normal token: cinq
normal token: membres
normal token: permanents
normal token: du
normal token: Conseil
normal token: de
normal token: sécurité
normal token: des
normal token: Nations
normal token: unies
normal token: sur
normal token: le
normal token: Cambodge
normal token: ,
normal token: interviennent
normal token: peu
normal token: de
normal token: temps
normal token: après
normal token: que
normal token: Mr
normal token: Deng
normal token: Xiaoping
normal token: eut
normal token: sévèrement
normal token: critiqué
normal token: ,
normal token: dans
normal token: des
normal token: documents
normal token: internes
normal token: ,
normal token: la
normal token: politique
normal token: de
normal token: Mr
normal token: Gorbatchev
normal token: .
['Les', 'propos', 'du', 'vice-', 'ministre', 'soviétique', ',', 'qui', 'participait', 'lundi', 'à', 'Paris', 'à', 'la', 'réunion', 'des', 'r

['Trois', 'petits', 'délinquants', 'masqués', ',', 'armés', 'seulement', 'de', 'jouets', 'en', 'plastique', 'imitant', 'des', 'armes', 'à', 'feu', ',', 'ont', 'été', 'tués', ',', 'samedi', '13', 'janvier', ',', 'par', 'un', 'commando', 'de', 'tireurs', "d'", 'élite', 'de', "l'", 'armée', 'britannique', 'en', 'plein', 'coeur', 'du', 'quartier', 'catholique', 'de', 'Belfast', ',', 'en', 'Irlande', 'du', 'Nord', '.']
normal token: Trois
normal token: petits
normal token: délinquants
normal token: masqués
normal token: ,
normal token: armés
normal token: seulement
normal token: de
normal token: jouets
normal token: en
normal token: plastique
normal token: imitant
normal token: des
normal token: armes
normal token: à
normal token: feu
normal token: ,
normal token: ont
normal token: été
normal token: tués
normal token: ,
normal token: samedi
normal token: 13
normal token: janvier
normal token: ,
normal token: par
normal token: un
normal token: commando
normal token: de
normal token: tireurs


normal token: .
['Le', '13', 'janvier', ',', 'les', 'forces', 'de', "l'", 'ordre', 'étaient', 'intervenues', 'une', 'seconde', 'fois', 'pour', 'disperser', 'un', 'rassemblement', 'des', 'mêmes', 'étudiants', ',', 'qui', 'bloquaient', 'la', 'route', 'menant', "d'", 'Oran', 'à', "l'", 'aéroport', "d'", 'ès', '-', 'Sénia', ',', 'à', 'proximité', 'de', 'leur', 'école', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Plusieurs', 'personnes', 'furent', 'blessés', 'lors', 'de', 'cette', 'nouvelle', 'intervention', '.']
normal token: Plusieurs
normal token: personnes
normal token: furent
normal token: blessés
normal token: lors
normal token: de
normal token: cette
normal token: nouvelle
normal token: intervention
normal token: .
['Plusieurs', 'personnes', 'furent', 'blessés', 'lors', 'de', 'cette', 'nouvelle', 'intervention', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Les', 'autorités', 'ont', 'ordonné', 'la'

normal token: ne
normal token: seront
normal token: pas
normal token: perceptibles
normal token: "
normal token: avant
normal token: plusieurs
normal token: jours
normal token: "
normal token: ,
multi worded connector: <CONN>étant
large multi worded connector: donné
multi worded connector: que</CONN>
normal token: le
normal token: cours
normal token: d'
normal token: eau
normal token: traverse
normal token: plus
normal token: d'
normal token: un
normal token: millier
normal token: de
normal token: kilomètres
multi worded connector: <CONN>avant
multi worded connector: de</CONN>
normal token: pénétrer
normal token: en
normal token: territoire
normal token: irakien
normal token: .
['On', 'indique', 'que', 'les', 'conséquences', 'de', "l'", 'interruption', 'du', 'débit', 'fluvial', ',', 'qui', 'touchera', 'quelque', '1', ',', '5', 'million', "d'", 'agriculteurs', 'irakiens', 'utilisant', 'les', 'eaux', 'de', "l'", 'Euphrate', ',', 'ne', 'seront', 'pas', 'perceptibles', '"', 'avant', 'plusi

normal token: le
normal token: limogeage
normal token: de
normal token: Mr
normal token: Mazilu
normal token: du
normal token: CFSN
normal token: pour
normal token: "
normal token: atteinte
normal token: grave
normal token: à
normal token: la
normal token: politique
normal token: de
normal token: consensus
normal token: national
normal token: "
normal token: .
['Le', 'quotidien', 'Romania', 'Libera', 'a', 'ouvert', 'les', 'hostilités', 'samedi', 'en', 'réclamant', ',', 'en', 'première', 'page', ',', 'le', 'limogeage', 'de', 'Mr', 'Mazilu', 'du', 'CFSN', 'pour', '"', 'atteinte', 'grave', 'à', 'la', 'politique', 'de', 'consensus', 'national', '"', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

["L'", 'ancien', 'rapporteur', 'de', 'la', 'commission', 'des', 'droits', 'de', "l'", 'homme', 'des', 'nations', 'unies', ',', 'qui', 'a', 'eu', 'les', 'pires', 'ennuis', 'pour', 'avoir', 'tenté', "d'", 'en', 'dénoncer', 'les', 'viola

normal token: ,
normal token: Valentin
normal token: Ceausescu
normal token: ,
normal token: fils
normal token: ainé
normal token: de
normal token: l'
normal token: ancien
normal token: dictateur
normal token: ,
normal token: a
normal token: été
normal token: présenté
normal token: samedi
normal token: ,
normal token: menottes
normal token: aux
normal token: mains
normal token: ,
normal token: à
normal token: la
normal token: télévision
normal token: roumaine
normal token: ,
multi worded connector: <CONN>de
large multi worded connector: même
multi worded connector: que</CONN>
normal token: plusieurs
normal token: proches
normal token: collaborateurs
normal token: de
normal token: son
normal token: père
normal token: .
['Par', 'ailleurs', ',', 'Valentin', 'Ceausescu', ',', 'fils', 'ainé', 'de', "l'", 'ancien', 'dictateur', ',', 'a', 'été', 'présenté', 'samedi', ',', 'menottes', 'aux', 'mains', ',', 'à', 'la', 'télévision', 'roumaine', ',', 'de', 'même', 'que', 'plusieurs', 'proches', 'c

normal token: à
normal token: ce
normal token: jour
normal token: ,
normal token: et
normal token: parmi
normal token: eux
normal token: Pierre
normal token: Durand
normal token: .
['Ils', 'sont', 'une', 'dizaine', 'à', "l'", 'avoir', 'fait', 'à', 'ce', 'jour', ',', 'et', 'parmi', 'eux', 'Pierre', 'Durand', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['Narcotique', ',', 'la', 'jument', 'du', 'champion', 'olympique', ',', 'appelée', 'sans', 'doute', 'à', 'succéder', 'un', 'jour', 'à', 'Jappeloup', ',', 'se', 'trouve', 'depuis', 'la', 'fin', 'octobre', 'au', 'haras', 'de', 'la', 'Gisloterie', ',', 'situé', 'en', 'plein', 'coeur', 'du', 'Calvados', ',', 'à', 'Sainte', '-', 'Marie', '-', "d'", 'elle', ',', '<CONN>pour</CONN>', 'y', 'subir', 'un', 'transfert', "d'", 'embryon', '.']
normal token: Narcotique
normal token: ,
normal token: la
normal token: jument
normal token: du
normal token: champion
normal token: olympique
normal token: ,
normal token: appelée
normal token: 

AttributeError: 'NoneType' object has no attribute 'group'

# Bracketing Annotation Parsing

## [ ] Tagging 

In [338]:
def tagging_connective_tokens(tokens_list, id_lists):
    tagged_tokens_list = []
    for i, id_list in enumerate(id_lists):
        tagged_tokens = []
        for j, id_ in enumerate(id_list):
            if id_ != 0:
                new_token = "[" + tokens_list[i][j] + "]"
                tagged_tokens.append(new_token)
            else:
                tagged_tokens.append(tokens_list[i][j])
        tagged_tokens_list.append(tagged_tokens)

    return tagged_tokens_list

In [339]:
def tagging_connective_tokens_relations(tokens_list, id_lists):
    tagged_tokens_list = []
    for i, id_list in enumerate(id_lists):
        tagged_tokens = []
        for j, id_ in enumerate(id_list):
            if id_ != 0:
                new_token = "[" + tokens_list[i][j] + "]"
                print(id_)
                relation = next((key for key, value in label_map.items() if value == id_), None)
                relation_token = "(" + relation + ")"
                tagged_tokens.append(new_token)
                tagged_tokens.append(relation_token)

            else:
                tagged_tokens.append(tokens_list[i][j])
        tagged_tokens_list.append(tagged_tokens)

    return tagged_tokens_list

In [427]:
# for Connective detection
train_tagged_tokens = tagging_connective_tokens(train_tokens_sentences, train_ids_sentences)
test_tagged_tokens = tagging_connective_tokens(test_tokens_sentences, test_ids_sentences)
dev_tagged_tokens = tagging_connective_tokens(dev_tokens_sentences, dev_ids_sentences)

In [240]:
# Convert tokens and ids to JSON lines files
output_file = 'train_connective_detection_tagging.jsonl'
convert_to_jsonl(train_tokens_sentences, train_tagged_tokens, output_file)

output_file = 'test_connective_detection_tagging.jsonl'
convert_to_jsonl(test_tokens_sentences, test_tagged_tokens, output_file)

output_file = 'dev_connective_detection_tagging.jsonl'
convert_to_jsonl(dev_tokens_sentences, dev_tagged_tokens, output_file)