In [23]:
# Column-width settings (token / tag / language). They are not referenced by
# any of the visible cells -- presumably meant for aligned printing; TODO confirm.
token_width, tag_width, lang_width = 20, 10, 10

def parse_cupt(file_path):
    """Read a tab-separated .cupt file and group its token rows by sentence.

    Lines are stripped of surrounding whitespace first. Lines starting with
    ``#`` are ignored, blank lines close the sentence currently being
    collected, and every other line is split on tabs into a list of columns.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences; each sentence is a list of rows and each row is
        the list of column strings of one line. Empty sentences are never
        emitted.
    """
    parsed = []
    current = []

    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                # A blank line marks a sentence boundary; consecutive blank
                # lines must not produce empty sentences.
                if current:
                    parsed.append(current)
                    current = []
            elif not text.startswith("#"):
                current.append(text.split('\t'))

    # The file may end without a trailing blank line.
    if current:
        parsed.append(current)

    return parsed

def tranform_cupt_to_tsv(input_path, output_path, lang):
    """Convert a .cupt file into a BIO-tagged, three-column TSV file.

    Every written row has the form ``FORM<TAB>TAG<TAB>lang``, where FORM is
    the second column of the token row and TAG is derived from its last
    column:

    * ``O``       -- the last column is ``*`` (no MWE) or ``_`` (unannotated),
    * ``B-IDIOM`` -- the token introduces an MWE id not yet seen in the sentence,
    * ``I-IDIOM`` -- the token only continues MWE ids already seen.

    Rows whose form is ``_`` are skipped entirely. No separator is written
    between sentences (unchanged from the previous behaviour).

    Args:
        input_path: Path of the .cupt file to read (via ``parse_cupt``).
        output_path: Path of the TSV file to create or overwrite.
        lang: Language label written verbatim as the third column.
    """
    cupt_data = parse_cupt(input_path)

    with open(output_path, "w", encoding="utf-8") as file:
        for sentence in cupt_data:
            # MWE ids already started in this sentence. Tracking ids (rather
            # than a single flag) lets a second, separate MWE in the same
            # sentence begin with B-IDIOM again, while a discontinuous MWE
            # still continues with I-IDIOM after intervening O tokens.
            seen_mwe_ids = set()
            for token in sentence:
                # Rows without a surface form are not written at all.
                if token[1] == "_":
                    continue

                mwe_field = token[-1]
                if mwe_field in ("*", "_"):
                    # "_" is assumed to mean "not annotated" (as in blind
                    # files); it must not be tagged as an idiom.
                    tag = "O"
                else:
                    # Assumes the PARSEME MWE column layout: ";"-separated
                    # entries, each "ID" or "ID:CATEGORY" -- TODO confirm
                    # against the data in use.
                    token_ids = {
                        entry.split(":", 1)[0] for entry in mwe_field.split(";")
                    }
                    tag = "I-IDIOM" if token_ids <= seen_mwe_ids else "B-IDIOM"
                    seen_mwe_ids |= token_ids

                file.write(token[1] + "\t" + tag + "\t" + lang + "\n")


In [24]:
# Load the Turkish training split and print the raw column list of every
# token row in its first sentence.
cupt_data = parse_cupt(r"./sharedtask-1.1-TR/train.cupt")
first_sentence = cupt_data[0]
for token in first_sentence:
    print(token)

['1', 'Şu', 'şu', 'Det', '_', '_', '2', 'DETERMINER', '_', '_', '*']
['2', 'anda', 'an', 'Noun', '_', 'A3sg|Loc|Pnon', '12', 'MODIFIER', '_', '_', '*']
['3', 'bir', 'bir', 'Adj', 'Num', '_', '5', 'DETERMINER', '_', '_', '*']
['4', 'tek', 'tek', 'Adj', '_', '_', '5', 'MODIFIER', '_', '_', '*']
['5', 'grup', 'grup', 'Noun', '_', 'A3sg|Nom|Pnon', '12', 'MODIFIER', '_', '_', '*']
['6', 'KOBİ', 'Kobi', 'Noun', 'Prop', 'A3sg|Nom|Pnon', '9', 'POSSESSOR', '_', '_', '*']
['7', "'lere", "'lere", '?', '_', '_', '8', 'ARGUMENT', '_', '_', '*']
['8', 'yönelik', 'yönelik', 'Postp', 'PCDat', '_', '9', 'MODIFIER', '_', '_', '*']
['9', 'faaliyetlerini', 'faaliyet', 'Noun', '_', 'A3pl|Acc|P3pl', '10', 'OBJECT', '_', '_', '*']
['10', '_', 'artır', 'Verb', '_', 'Pos', '11', 'DERIV', '_', '_', '*']
['11', 'artırmaya', '_', 'Noun', 'Inf2', 'A3sg|Dat|Pnon', '12', 'MODIFIER', '_', '_', '*']
['12', 'çalışıyor', 'çalış', 'Verb', '_', 'A3sg|Pos|Prog1', '0', 'PREDICATE', '_', '_', '*']
['13', ':', ':', 'Punc', '_

In [25]:
# Print the raw column list of every token row in the second parsed sentence.
second_sentence = cupt_data[1]
for token in second_sentence:
    print(token)

['1', 'Türkiye', 'Türkiye', 'Noun', 'Prop', 'A3sg|Nom|Pnon', '17', 'SUBJECT', '_', '_', '*']
['2', "'de", "'de", '?', '_', '_', '3', 'MODIFIER', '_', '_', '*']
['3', 'bankaların', 'banka', 'Noun', '_', 'A3pl|Gen|Pnon', '6', 'POSSESSOR', '_', '_', '*']
['4', 'bir', 'bir', 'Adj', 'Num', '_', '6', 'DETERMINER', '_', '_', '*']
['5', 'siyasi', 'siyasi', 'Adj', '_', '_', '6', 'MODIFIER', '_', '_', '*']
['6', 'gücü', 'güç', 'Noun', 'NAdj', 'A3sg|Nom|P3sg', '8', 'SUBJECT', '_', '_', '*']
['7', 'de', 'de', 'Conj', '_', '_', '6', 'INTENSIFIER', '_', '_', '*']
['8', 'var', 'var', 'Adj', '_', '_', '17', 'COORDINATION', '_', '_', '*']
['9', ',', ',', 'Punc', '_', '_', '8', 'PUNCTUATION', '_', '_', '*']
['10', 'bu', 'bu', 'Det', '_', '_', '11', 'DETERMINER', '_', '_', '*']
['11', 'nedenle', 'neden', 'Noun', '_', 'A3sg|Ins|Pnon', '13', 'MODIFIER', '_', '_', '*']
['12', 'ilerde', 'ilerd', 'Noun', '_', 'A3sg|Dat|Pnon', '13', 'MODIFIER', '_', '_', '*']
['13', '_', 'örgütle', 'Verb', '_', 'Pass|Pos', '14

In [26]:
import os

# (folder suffix, language label) pairs, in processing order: Turkish, then Italian.
language_specs = (("TR", "tr"), ("IT", "it"))

# Ensure both output folders exist before any file is written.
for folder_suffix, _label in language_specs:
    os.makedirs("./transformed-" + folder_suffix, exist_ok=True)

# Transform the train/dev/test split of each language.
for folder_suffix, lang_label in language_specs:
    for split_name in ("train", "dev", "test"):
        tranform_cupt_to_tsv(
            "./sharedtask-1.1-" + folder_suffix + "/" + split_name + ".cupt",
            "./transformed-" + folder_suffix + "/" + split_name + ".tsv",
            lang_label,
        )


In [None]:
def combine_tsv_files(tr_path, it_path, output_path):
    """Concatenate two TSV files into one output file.

    The content of ``tr_path`` is written first, followed by the content of
    ``it_path``. If an input does not end with a newline, one is added so
    that its last row is never fused with the first row of the next input.

    Args:
        tr_path: Path of the first (Turkish) TSV file.
        it_path: Path of the second (Italian) TSV file.
        output_path: Path of the combined file to create or overwrite.

    Raises:
        OSError: If any of the three files cannot be opened.
    """
    # All three files are opened up front (inputs first) so that a missing
    # input fails before the output file is created or truncated.
    with open(tr_path, "r", encoding="utf-8") as f_tr, \
         open(it_path, "r", encoding="utf-8") as f_it, \
         open(output_path, "w", encoding="utf-8") as f_out:

        for f_in in (f_tr, f_it):
            last_line = ""
            # Stream line by line instead of loading both files into memory.
            for line in f_in:
                f_out.write(line)
                last_line = line
            # Guard the boundary between inputs against a missing final newline.
            if last_line and not last_line.endswith("\n"):
                f_out.write("\n")

# Create output folders.
# NOTE(review): "./combined-tsv" is not written to by this cell; it is still
# created in case something outside this view relies on it -- confirm and drop.
os.makedirs("./combined-tsv", exist_ok=True)

# The combined files are written here, so this folder has to exist; without
# it the first open() for writing fails on a fresh checkout.
combined_dir = "../resources/PARSAME"
os.makedirs(combined_dir, exist_ok=True)

# Combine the Turkish and Italian TSVs of each split (train, dev, test).
for split_name in ("train", "dev", "test"):
    combine_tsv_files(
        "./transformed-TR/" + split_name + ".tsv",
        "./transformed-IT/" + split_name + ".tsv",
        combined_dir + "/" + split_name + ".tsv",
    )