In [1]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

# Load the verb vocabulary from the JSON file.
# The encoding is pinned to UTF-8 (the standard JSON text encoding) so the
# result does not depend on the platform's default locale encoding.
with open("../data_extraction/json/isc_vocabulary_verbs.json", "r", encoding="utf-8") as file:
    vocabulary = json.load(file)

In [2]:
# Suffix table used by generate_conjugations: one (suffix, morphological tags)
# pair per generated form, listed in output order. The same surface suffix can
# appear more than once with different tags (e.g. "akea", "ma", "mai").
_SUFFIX_TAGS = (
    # ASPECT
    ("a", "V;PFV"),  # perfective, past indicative transitive
    ("i", "V;IPFV"),  # imperfective, present indicative intransitive
    ("mis", "V;HAB_A"),  # habitual, remote past indicative
    ("misa", "V;PFV;HAB_A"),  # habitual, remote past indicative
    ("isi", "V;IPFV;PROG"),  # progressive, present indicative
    # TENSE
    ("ri", "V;PST.yet"),  # recent past indicative ('-ri' suffix, 'still/yet')
    ("bai", "V;PST.DUR"),
    ("yantana", "V;PFV;PST.yesterday"),
    ("shina", "V;PST.days"),  # past, specific to recent days
    ("paoni", "V;PST.REM"),  # remote past indicative
    ("nonkota", "V;FUT.INM"),  # immediate future indicative
    ("i kai", "V;IPFV;FUT.INM"),  # immediate future indicative
    # MOOD
    ("bira", "V;PFV;DUB"),  # dubitative, present
    ("birai", "V;IPFV;DUB"),  # dubitative, present
    ("koinsi", "V;ASER"),  # assertive, present
    ("anaka", "V;EST;HAB_M"),  # abilitive, present
    ("katsi iki", "V;DES"),  # desiderative, present
    # NUMBER
    ("rabea", "V;PFV;DUAL"),
    ("rabei", "V;IPFV;DUAL"),
    ("botana", "V;EST;PLU"),
    # NEGATION
    ("ma", "V;PFV;NEG"),  # negative, present indicative
    ("mai", "V;IPFV;NEG"),  # negative, present indicative
    # NOTE(review): the original note also said "present indicative" here, but
    # the tag marks PST.days -- confirm which is intended.
    ("mashina", "V;PST.days;NEG"),
    ("yohma", "V;NEG.never"),  # emphatic negation ('never'), present
    ("kean", "V;NEG.FRUS"),  # frustrative, present indicative
    # MOVEMENT
    ("berana", "V;TR;PFV;MOV.VEN"),  # venitive
    ("kerana", "V;INTR;PFV;MOV.VEN"),  # venitive
    ("baina", "V;TR;PFV;MOV.ANDA"),  # andative
    ("kaina", "V;INTR;PFV;MOV.ANDA"),  # andative
    ("bokona", "V;TR;PFV;MOV.around"),  # going around
    ("akea", "V;INTR;PFV;MOV.around"),  # going around
    ("akea", "V;PFV;MOV.passing"),  # passing by
    ("pakea", "V;PFV;MOV.down"),  # going down
    ("ina", "V;PFV;MOV.up"),  # going up
    ("tana", "V;PFV;MOV.goto"),  # going to
    # OTHER
    ("hona", "V;PFV;BEN"),  # benefactive
    ("arana", "V;PFV;MAL"),  # malefactive
    ("makarana", "V;PFV;CAU;MAL"),  # causative + malefactive
    ("mai", "V;IPFV;CAU"),  # causative
    ("ma", "V;PFV;CAU"),  # causative
    ("hakoa", "V;PFV;DIM"),  # diminutive
    ("hekoi", "V;IPFV;DIM"),  # diminutive
    ("aranhakoa", "V;PFV;MAL;DIM"),  # malefactive + diminutive
    ("hekoni", "V;PST.REM;DIM"),  # diminutive + remote past
)


def generate_conjugations(verb, base_form):
    """Build the rule-based inflections of a verb.

    Args:
        verb: The verb as listed in the vocabulary; copied unchanged into the
            first element of every returned tuple.
        base_form: The stem every suffix is appended to.

    Returns:
        A list of ``(verb, inflected_form, tags)`` tuples, one per entry of the
        suffix table and in the same order. Each inflected form is
        ``base_form`` immediately followed by the suffix; no further
        adjustment is made at the stem/suffix boundary here.
    """
    return [(verb, f"{base_form}{suffix}", tags) for suffix, tags in _SUFFIX_TAGS]


In [3]:
# Turn the loaded vocabulary into a DataFrame.
vocabulary = pd.DataFrame(vocabulary)

# Keep only single-word verbs: drop every entry whose 'verb' contains a space.
# `regex=False` matches the space literally, and `na=True` makes entries with a
# missing verb count as multi-word, so they are dropped as well.
# The shape is printed before and after to show how many rows were removed.
print(vocabulary.shape)
is_multiword = vocabulary["verb"].str.contains(" ", regex=False, na=True)
vocabulary = vocabulary[~is_multiword]
print(vocabulary.shape)


(485, 4)
(319, 4)


In [4]:
# Column layout shared by the headerless, tab-separated inflection files.
UNIMORPH_COLUMNS = ["Root", "InflectedForm", "MorphologicalTags"]

# Doubled letters that are collapsed to a single letter after suffixation,
# applied in this order.
_DOUBLED_LETTERS = ("ii", "aa", "kk", "nn", "rr", "mm", "hh", "tt", "ss")


def _collapse_doubled_letters(form):
    """Replace each doubled letter listed in _DOUBLED_LETTERS by a single one.

    The replacements are applied one after another, in list order, to the
    whole string.

    NOTE(review): because the whole form is processed, doubled letters inside
    the verb stem are collapsed too, not only those created at the stem/suffix
    boundary -- confirm this is intended.
    """
    for doubled in _DOUBLED_LETTERS:
        form = form.replace(doubled, doubled[0])
    return form


# Read the real inflections.
# `dtype=str` and `keep_default_na=False` keep every field as literal text, so a
# word that happens to spell "nan", "null", "NA", ... is not silently converted
# into a missing value.
real_inflections = pd.read_csv(
    "unimorph_output.tsv",
    sep="\t",
    header=None,
    dtype=str,
    keep_default_na=False,
)
real_inflections.columns = UNIMORPH_COLUMNS

# Map each root to the set of (inflected form, tags) pairs present in the real data.
real_inflections_dict = defaultdict(set)
for root, inflected_form, tags in real_inflections.itertuples(index=False, name=None):
    real_inflections_dict[root].add((inflected_form, tags))

# Accumulators for the generated data.
generated_unknown_verbs = []  # Inflections for verbs not in real inflections
generated_new_forms = []      # Inflections for known verbs but with new characteristics

# Generate data in Unimorph format.
for verb in vocabulary["verb"]:
    # Stem: drop a final 'a' or 'i' when the verb is longer than two characters.
    base_form = verb[:-1] if len(verb) > 2 and verb.endswith(("a", "i")) else verb
    for root, form, tags in generate_conjugations(verb, base_form):
        form = _collapse_doubled_letters(form)
        if root not in real_inflections_dict:
            # The verb does not appear in the real data at all.
            generated_unknown_verbs.append((root, form, tags))
        elif (form, tags) not in real_inflections_dict[root]:
            # Known verb, but this exact (form, tags) pair is not in the real data.
            generated_new_forms.append((root, form, tags))

# Make sure the output directory exists; to_csv does not create it.
Path("synthetic").mkdir(parents=True, exist_ok=True)

# Convert the lists to DataFrames and save them as headerless TSV files.
df_unknown_verbs = pd.DataFrame(generated_unknown_verbs, columns=UNIMORPH_COLUMNS)
df_unknown_verbs.to_csv("synthetic/generated_unknown_verbs.tsv", sep="\t", index=False, header=False)

df_new_forms = pd.DataFrame(generated_new_forms, columns=UNIMORPH_COLUMNS)
df_new_forms.to_csv("synthetic/generated_new_forms.tsv", sep="\t", index=False, header=False)

print("Generated inflections for unknown verbs saved to 'synthetic/generated_unknown_verbs.tsv'.")
print("Generated inflections for known verbs with new characteristics saved to 'synthetic/generated_new_forms.tsv'.")

Generated inflections for unknown verbs saved to 'synthetic/generated_unknown_verbs.tsv'.
Generated inflections for known verbs with new characteristics saved to 'synthetic/generated_new_forms.tsv'.


In [5]:
# Column layout shared by the headerless, tab-separated inflection files.
UNIMORPH_COLUMNS = ["Root", "InflectedForm", "MorphologicalTags"]


def read_unimorph_tsv(path):
    """Read a headerless, tab-separated (root, form, tags) file into a DataFrame.

    Every field is kept as literal text (``dtype=str``, ``keep_default_na=False``)
    so that words spelling "nan", "null", "NA", ... are not turned into missing
    values and then written back out as empty fields.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A DataFrame with the columns in UNIMORPH_COLUMNS. A completely empty
        file yields an empty DataFrame with those columns instead of raising
        ``pandas.errors.EmptyDataError``.
    """
    try:
        frame = pd.read_csv(path, sep="\t", header=None, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # An empty input file (no rows were generated) is a valid case.
        return pd.DataFrame(columns=UNIMORPH_COLUMNS, dtype=str)
    frame.columns = UNIMORPH_COLUMNS
    return frame


# Load the data from the existing files.
real_data = read_unimorph_tsv("unimorph_output.tsv")
synthetic_known_data = read_unimorph_tsv("synthetic/generated_new_forms.tsv")
synthetic_unknown_data = read_unimorph_tsv("synthetic/generated_unknown_verbs.tsv")

# Make sure the output directory exists; to_csv does not create it.
Path("combinations").mkdir(parents=True, exist_ok=True)

# 1. Real data only
real_data.to_csv("combinations/real_only.tsv", sep="\t", index=False, header=False)
print("Archivo 'combinations/real_only.tsv' generado con solo datos reales.")

# 2. Real data + synthetic data (known and unknown verbs)
real_and_synthetic_data = pd.concat([real_data, synthetic_known_data, synthetic_unknown_data], ignore_index=True)
real_and_synthetic_data.to_csv("combinations/real_and_synthetic.tsv", sep="\t", index=False, header=False)
print("Archivo 'combinations/real_and_synthetic.tsv' generado con datos reales y sintéticos.")

# 3. Synthetic data only (known and unknown verbs)
synthetic_only_data = pd.concat([synthetic_known_data, synthetic_unknown_data], ignore_index=True)
synthetic_only_data.to_csv("combinations/synthetic_only.tsv", sep="\t", index=False, header=False)
print("Archivo 'combinations/synthetic_only.tsv' generado con solo datos sintéticos.")


Archivo 'combinations/real_only.tsv' generado con solo datos reales.
Archivo 'combinations/real_and_synthetic.tsv' generado con datos reales y sintéticos.
Archivo 'combinations/synthetic_only.tsv' generado con solo datos sintéticos.
