In [1]:
# %pip install PyPDF2
# %pip install pdfplumber

In [2]:
import re
import PyPDF2
import pandas as pd
import pdfplumber
from collections import defaultdict
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Configurar opciones de visualización de pandas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Funciones

In [4]:
## Open PDF file
def open_pdf(file_path):
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        content = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            page_text = page.extract_text()
            if page_text:
                # Add custom logic here to add line breaks if necessary
                content += page_text + "\n"
        return content

# Extraer Verbos del Vocabulario

In [5]:

# Open the PDF file
with pdfplumber.open('../pdf/vocabulario_iskonawa.pdf') as pdf:
    content_plumber = ""
    for page in pdf.pages:
        # Get page dimensions
        width = page.width
        height = page.height

        # Define the column boundaries (adjust these values based on page dimensions)
        left_column = (0, 0, width / 2, height)  # Example for left half of the page
        right_column = (width / 2, 0, width, height)  # Example for right half of the page

        def extract_column_text(page, column_bounds):
            left, top, right, bottom = column_bounds
            if right <= width and bottom <= height:
                return page.within_bbox((left, top, right, bottom)).extract_text()
            else:
                return ""

        # Extract text from both columns
        left_text = extract_column_text(page, left_column)
        right_text = extract_column_text(page, right_column)

        # Combine and add to content
        content_plumber += left_text + "\n" + right_text + "\n"  # Adjust separator if needed

    # print(content_plumber)


In [6]:
content_plumber = re.sub(r'\s*\b\d{2,3}\b\s*', '\n', content_plumber)

In [7]:
# Función para dividir la traducción en español e inglés
def split_translation(category, translation):
    # Eliminar el numero entre paréntesis al final de la categoría
    category = re.sub(r'\(\d+\)', '', category)
    # Dividir la traducción usando la categoría como referencia
    parts = translation.split(f"{category} ")
    spanish_translation = parts[0].strip()
    english_translation = parts[1].strip() if len(parts) > 1 else ""
    return spanish_translation, english_translation

# Funcion para poner en Mayuscula la primera letra despues de un punto
def capitalize_after_period(text):
    # Encuentra todos los puntos y la letra siguiente
    def capitalize_match(match):
        return match.group(1) + match.group(2).upper()
    # Usar una expresión regular para encontrar los puntos seguidos de una letra
    capitalized_text = re.sub(r'([.!?]\s+)(\w)', capitalize_match, text)
    return capitalized_text

def extract_entries(text):
    # Expresión regular para encontrar las entradas del vocabulario y sus categorías
    pattern = r'\b([a-záéíóúñA-ZÁÉÍÓÚÑ][^\n]*?)\s+(pron\.|v\.|V\.|\(.\) v\.|\(.\) V\.|adj\.|n\.|N\.|\(.\) n\.|\(.\) N\.|num\.|adv\.|int\.)'

    # Buscar todas las coincidencias en el texto
    matches = list(re.finditer(pattern, text))

    # Filtrar entradas que tienen puntuación extra al final
    filtered_matches = [match for match in matches if not re.search(r'[^\w\s]', match.group(1).strip()[-1])]

    # Crear un diccionario para almacenar las traducciones asociadas a las entradas
    vocabulary_entries = defaultdict(lambda: {'category': [], 'spanish_meaning': [], 'english_meaning': []})
    for i, match in enumerate(filtered_matches):
        entry, category = match.groups()
        entry = entry.strip()  # Limpiar espacios extra
        start_pos = match.end()  # Posición justo después de la entrada

        # Determinar la posición de la siguiente entrada o el final del texto
        if i < len(filtered_matches) - 1:
            next_start_pos = filtered_matches[i + 1].start()
        else:
            next_start_pos = len(text)

        # Extraer la traducción desde la posición de la entrada hasta la siguiente entrada o el final del texto
        translation = text[start_pos:next_start_pos].strip()
        translation = re.sub(r'\s+', ' ', translation)  # Eliminar espacios adicionales

        # Guardar en el diccionario
        if entry and category:
            spanish_translation, english_translation = split_translation(category, translation)
            vocabulary_entries[entry]['category'].append(category)
            vocabulary_entries[entry]['spanish_meaning'].append(capitalize_after_period(spanish_translation))
            vocabulary_entries[entry]['english_meaning'].append(capitalize_after_period(english_translation))

    # Convertir el diccionario en una lista de diccionarios
    data = []
    for entry, details in vocabulary_entries.items():
        data.append({
            'verb': entry,
            'category': details['category'],
            'spanish_meaning': details['spanish_meaning'],
            'english_meaning': details['english_meaning']
        })

    return data


data = extract_entries(content_plumber)
df = pd.DataFrame(data)

# Imprimir el DataFrame
print(df.shape)

(1519, 4)


In [8]:
df_verbs = df.copy()

# Filtrar filas donde 'category' contiene elementos que coinciden con el patrón
df_verbs = df_verbs[df_verbs['category'].apply(lambda categories: any(re.search(r'^\(?\d*\)?\s*v\.$', category) for category in categories))]

# Elimnar spanish_meaning y english_meaning si category no cumple con el patrón r'^\(?\d*\)?\s*v\.$'
def filter_meanings(categories, spanish_meanings, english_meanings):
    new_categories = []
    new_spanish_meanings = []
    new_english_meanings = []
    for category, spanish_meaning, english_meaning in zip(categories, spanish_meanings, english_meanings):
        if re.search(r'^\(?\d*\)?\s*v\.$', category):
            new_categories.append(category)
            new_spanish_meanings.append(spanish_meaning)
            new_english_meanings.append(english_meaning)
    return new_categories, new_spanish_meanings, new_english_meanings


# Aplicar la función filter_meanings
df_verbs[['category', 'spanish_meaning', 'english_meaning']] = df_verbs.apply(lambda row: filter_meanings(row['category'], row['spanish_meaning'], row['english_meaning']), axis=1, result_type='expand')

# Eliminar la columna 'category'
df_verbs = df_verbs.drop(columns=['category'])

print(df_verbs.shape)

(480, 3)


In [9]:
# Filtrar filas con english_meaning vacío
df_with_empty_english_meaning = df_verbs[df_verbs['english_meaning'].apply(lambda meanings: all(meaning == '' for meaning in meanings))]
print(df_with_empty_english_meaning.shape)

# Filtrar filas con spanish_meaning vacío
df_with_empty_spanish_meaning = df_verbs[df_verbs['spanish_meaning'].apply(lambda meanings: all(meaning == '' for meaning in meanings))]
print(df_with_empty_spanish_meaning.shape)

# Concatenar los DataFrames con significados vacíos
df_with_empties = pd.concat([df_with_empty_english_meaning, df_with_empty_spanish_meaning])

df_with_empties

(15, 3)
(0, 3)


Unnamed: 0,verb,spanish_meaning,english_meaning
231,bohka iki,[cortarse uno mismo el cerquillo. V. To cut one’s own bangs.],[]
298,chihbini,[ventana. N. Window.],[]
378,habashkipakei,[tornarse un poco oscuro. To become a little dark],[]
399,hanka hanka iki,[fatigarse o agitarse debido a un esfuerzo físico muy prolongado. N. To become fatigued or agitated due to a very prolonged physical effort.],[]
547,iskan okoin,[sudar. V. To sweat.],[]
605,kapói,"[preparar una sopa espesa, conocida como mazamorra en el castellano regional. N. To prepare a thick soup, known as mazamorra (milky maize pudding) in the regional Castilian.]",[]
647,kesai,[mentir. N. To lie.],[]
762,maspoi,"[cubrirse con algo, por ejemplo, con una colcha. N. To cover oneself with something, for example, with a quilt.]",[]
829,mewin,[mezclar líquidos. N. To mix liquids.],[]
970,nokoi,[llegar a un lugar. N. To arrive at a place.],[]


In [10]:
# Eliminar las filas que no tienen verbos válidos
df_verbs = df_verbs[~df_verbs['verb'].isin(['chihbini', 'tsobe'])]

# Crear una máscara para seleccionar solo las filas donde 'english_meaning' está vacío y 'spanish_meaning' contiene ' N. ', ' V. ' o ' V, '
mask = df_verbs.apply(lambda row: all(meaning == '' for meaning in row['english_meaning']) and any(any(substring in meaning for substring in [' N. ', ' V. ', ' V, ']) for meaning in row['spanish_meaning']), axis=1)

# Mostrar las filas seleccionadas
df_verbs.loc[mask, ['verb', 'spanish_meaning', 'english_meaning']]

# Dividir 'spanish_meaning' en dos columnas 'spanish_meaning' y 'english_meaning' solo para las filas seleccionadas
split_columns = df_verbs.loc[mask, 'spanish_meaning'].apply(lambda meanings: [re.split(r' N\. | V\. | V, ', meaning) for meaning in meanings])

# Asignar las columnas divididas de vuelta a las filas seleccionadas en el DataFrame original
df_verbs.loc[mask, 'spanish_meaning'] = split_columns.apply(lambda x: [item[0] for item in x])
df_verbs.loc[mask, 'english_meaning'] = split_columns.apply(lambda x: [item[1] if len(item) > 1 else '' for item in x])
print(df_verbs.shape)

(478, 3)


In [11]:
# Filtrar filas con english_meaning o spanish_meaning vacío
df_with_empty_english_meaning = df_verbs[df_verbs['english_meaning'].apply(lambda meanings: all(meaning == '' for meaning in meanings))]
df_with_empty_spanish_meaning = df_verbs[df_verbs['spanish_meaning'].apply(lambda meanings: all(meaning == '' for meaning in meanings))]
df_with_empties = pd.concat([df_with_empty_english_meaning, df_with_empty_spanish_meaning])

df_with_empties

Unnamed: 0,verb,spanish_meaning,english_meaning
378,habashkipakei,[tornarse un poco oscuro. To become a little dark],[]
1131,defecar,[to defecate.],[]


In [12]:
# Eliminar las filas con verb = 'habashkipakei' o 'defecar'
verbs_to_remove = ['habashkipakei', 'defecar', 'poi', 'toai', "tewe tewe ik"]

df_verbs = df_verbs[~df_verbs['verb'].isin(verbs_to_remove)]

# Add new entries
new_entries = [
    {
        "verb": "poi",
        "spanish_meaning": ["defecar."],
        "english_meaning": ["to defecate."]
    },
    {
        "verb": "habashkipakei",
        "spanish_meaning": ["tornarse un poco oscuro."],
        "english_meaning": ["To become a little dark."]
    },
    {
        "verb": "toai",
        "spanish_meaning": [
            "cargar (a un niño o a un animal); en el castellano regional se dice amarcar.",
            "cernir.",
            "estar embarazada una mujer."
        ],
        "english_meaning": [
            "To carry (a child or an animal); in regional Castilian it is called amarcar.",
            "to sift.",
            "for a woman to become pregnant. Toa aki"
        ]
    },
    {
        "verb": "tewe tewe iki",
        "spanish_meaning": [
            "excitarse, despertarse el deseo sexual en una persona."
        ],
        "english_meaning": [
            "to become aroused, to wake up the sexual desire in a person."
        ],
    },
]

new_entries_df = pd.DataFrame(new_entries)
df_verbs = pd.concat([df_verbs, new_entries_df], ignore_index=True)

In [13]:
# Modificar las oraciones que contienen una palabra junto a un número entre paréntesis al final del punto en 'spanish_meaning'
df_verbs['english_meaning'] = df_verbs['english_meaning'].apply(lambda meanings: [re.sub(r'\.\s\w+(\s\w+)?+(\s\w+)?\s*(\(\d+\))?$', '', meaning) for meaning in meanings])

# Modify some entries
df_verbs.loc[df_verbs['verb'] == 'chibain', 'english_meaning'] = [["to chase or to follow the trail of a person or an animal."]]
df_verbs.loc[df_verbs['verb'] == 'hená matsi', 'english_meaning'] = [["to put out (the fire)"]]
df_verbs.loc[df_verbs['verb'] == 'nawin', 'english_meaning'] = [["to eat meat accompanied by something like yucca, potato, banana, or rice."]]
df_verbs.loc[df_verbs['verb'] == 'noroh aki', 'english_meaning'] = [["to slurp, producing a noise with the throat."]]
df_verbs.loc[df_verbs['verb'] == 'pen pen aki', 'english_meaning'] = [["for the floor to make noise upon a stick or similar object hitting it, such as those traditionally made for some iskonawa dances."]]
df_verbs.loc[df_verbs['verb'] == 'rakai', 'english_meaning'] = [["to lie on one’s back."]]
df_verbs.loc[df_verbs['verb'] == 'tahka tahka iki', 'english_meaning'] = [["to shake off; for a serpent to jingle."]]
df_verbs.loc[df_verbs['verb'] == 'ten ten iki', 'english_meaning'] = [["for an injury to hurt, producing stabbing pains."]]
df_verbs.loc[df_verbs['verb'] == 'toponki', 'english_meaning'] = [["to count, to enumerate."]]
df_verbs.loc[df_verbs['verb'] == 'warepakei', 'english_meaning'] = [["to rip articles of clothing or paper various times."]]

In [14]:
# Sort DataFrame by 'verb' column
df_verbs = df_verbs.sort_values('verb').reset_index(drop=True)

# Add refenrece column
df_verbs['reference'] = '17375911/ZFR5PFEG'

print(df_verbs.shape)
# Guardar el DataFrame en un archivo CSV
df_verbs.to_json('../json/isc_vocabulary_verbs.json', orient='records', force_ascii=False)
df.to_json('../json/isc_vocabulary.json', orient='records', force_ascii=False)

(477, 4)


# Extraer Ejemplos

In [15]:
# Open the PDF file
content_bosquejo_gramatical = open_pdf('../pdf/Bosquejo-gramatical.pdf')
content_grammatical_relations_iskonawa = open_pdf('../pdf/2015_grammatical_relations_iskonawa.pdf')
content_prefijos_iskonawa = open_pdf('../pdf/2015_prefijos_iskonawa.pdf')
content_canciones_iskonawa = open_pdf('../pdf/2019_canciones_iskonawa.pdf')

In [16]:
def extract_verbs(content):
    # Definir el máximo número de caracteres permitido por columna
    max_length = 500

    # Parte 1: Inicio de la línea y captura del número entre paréntesis
    inicio_linea_y_numero = r'^\s*\((\d+)\)\s+'
    # Parte 2: Captura de la oración en Iskonawa hasta el primer salto de línea
    iskonawa_sentence = rf'(.{{1,{max_length}}}?)\s*\n'
    # Parte 3: Captura de la primera línea de sufijos
    primera_linea_sufijos = rf'\s*(.{{1,{max_length}}}?)\s*\n'
    # Parte 4: Captura de la segunda línea de significados
    segunda_linea_significados = rf'\s*(.{{1,{max_length}}}?)\s*\n'
    # Parte 5: Captura del significado en español entre comillas
    significado_espanol = rf'\s*[‘\']\s*(.{{1,{max_length}}}?)\s*[’\']'

    # Combinar todas las partes en el regex final
    regex = (
        inicio_linea_y_numero +
        iskonawa_sentence +
        primera_linea_sufijos +
        segunda_linea_significados +
        significado_espanol
    )
    # Diccionario para almacenar las entradas numeradas
    entradas_dict = {}

    # Encontrar todas las coincidencias con sus posiciones
    matches = re.finditer(regex, content, re.MULTILINE | re.DOTALL)

    # Guardar cada coincidencia en el diccionario junto con la posición inicial
    for match in matches:
        numero = match.group(1)
        iskonawa_sentence = match.group(2).strip()
        suffix_sentence = match.group(3).strip()
        annotated_sentence = match.group(4).strip()
        spanish_sentence = match.group(5).strip()
        start_position = match.start(2)  # Posición donde comienza iskonawa_sentence
        num_lineas_sufijos = suffix_sentence.count('\n') + 1
        num_lineas_significado = annotated_sentence.count('\n') + 1
        num_lineas_spanish_sentence = spanish_sentence.count('\n') + 1

        # Si annotated_sentence tiene 3 líneas, mover la segunda línea a suffix_sentence
        if num_lineas_significado == 3:
            annotated_sentence_split = annotated_sentence.split('\n')
            suffix_sentence += ' ' + annotated_sentence_split[1].strip()
            annotated_sentence = f'{annotated_sentence_split[0]} {annotated_sentence_split[2]}'
            num_lineas_significado = 1  # Ajustar el número de líneas

        # En suffix_sentence si hay un espacio seguido de un guion, se debe eliminar el espacio
        # Ejemplo: sachavaca flaco Germán -ERG matar -PERF -> sachavaca flaco Germán-ERG matar-PERF
        suffix_sentence = re.sub(r'\s+-', '-', suffix_sentence)
        annotated_sentence = re.sub(r'\s+-', '-', annotated_sentence)


        entradas_dict[numero] = {
            'index': numero,
            'iskonawa_sentence': iskonawa_sentence,
            'suffix_sentence': suffix_sentence,
            'annotated_sentence': annotated_sentence,
            'spanish_sentence': spanish_sentence,
            'start_position': start_position,
            'num_lineas_sufijos': num_lineas_sufijos,
            'num_lineas_significado': num_lineas_significado,
            'num_lineas_spanish_sentence': num_lineas_spanish_sentence        
        }

        # Contar numero de lineas en annotated_sentence
    return entradas_dict

In [17]:
def extract_verbs_v2(content):

    # reemplazar „ por ‘ y ‟ por ’
    content = content.replace('„', '‘').replace('‟', '’')

    # Definir el máximo número de caracteres permitido por columna
    max_length = 500

    # Parte 1: Inicio de la línea y captura del número entre paréntesis
    inicio_linea_y_numero = r'^\s*\((\d+[a-zA-Z]?)\)\s+'
    # Parte 2: Captura de la oración en Iskonawa hasta el primer salto de línea
    primera_linea_sufijos = rf'(.{{1,{max_length}}}?)\s*\n'
    # Parte 4: Captura de la segunda línea de significados
    segunda_linea_significados = rf'\s*(.{{1,{max_length}}}?)\s*\n'
    # Parte 5: Captura del significado en español entre comillas
    significado_espanol = rf'\s*[‘\']\s*(.{{1,{max_length}}}?)\s*[’\']'

    # Combinar todas las partes en el regex final
    regex = (
        inicio_linea_y_numero +
        primera_linea_sufijos +
        segunda_linea_significados +
        significado_espanol
    )
    # Diccionario para almacenar las entradas numeradas
    entradas_dict = {}

    # Encontrar todas las coincidencias con sus posiciones
    matches = re.finditer(regex, content, re.MULTILINE | re.DOTALL)

    # Guardar cada coincidencia en el diccionario junto con la posición inicial
    for match in matches:
        numero = match.group(1)
        iskonawa_sentence = match.group(2).strip()
        suffix_sentence = match.group(2).strip()
        annotated_sentence = match.group(3).strip()
        spanish_sentence = match.group(4).strip()
        start_position = match.start(1)  # Posición donde comienza iskonawa_sentence
        num_lineas_sufijos = suffix_sentence.count('\n') + 1
        num_lineas_significado = annotated_sentence.count('\n') + 1
        num_lineas_spanish_sentence = spanish_sentence.count('\n') + 1

        # Si annotated_sentence tiene 3 líneas, mover la segunda línea a suffix_sentence
        if num_lineas_significado == 3:
            annotated_sentence_split = annotated_sentence.split('\n')
            suffix_sentence += ' ' + annotated_sentence_split[1].strip()
            annotated_sentence = f'{annotated_sentence_split[0]} {annotated_sentence_split[2]}'
            num_lineas_significado = 1  # Ajustar el número de líneas

        # En suffix_sentence si hay un espacio seguido de un guion, se debe eliminar el espacio
        # Ejemplo: sachavaca flaco Germán -ERG matar -PERF -> sachavaca flaco Germán-ERG matar-PERF
        suffix_sentence = re.sub(r'\s+-', '-', suffix_sentence)
        annotated_sentence = re.sub(r'\s+-', '-', annotated_sentence)

        # iskonawa_sentence sera suffix_sentence sin los guiones
        iskonawa_sentence = re.sub(r'-', '', suffix_sentence)
        iskonawa_sentence = re.sub(r'–', '', iskonawa_sentence)
        iskonawa_sentence = re.sub(r'ø', '', iskonawa_sentence)
        spanish_sentence = re.sub(r'Y o', 'Yo', spanish_sentence)


        entradas_dict[numero] = {
            'index': numero,
            'iskonawa_sentence': iskonawa_sentence,
            'suffix_sentence': suffix_sentence,
            'annotated_sentence': annotated_sentence,
            'spanish_sentence': spanish_sentence,
            'start_position': start_position,
            'num_lineas_sufijos': num_lineas_sufijos,
            'num_lineas_significado': num_lineas_significado,
            'num_lineas_spanish_sentence': num_lineas_spanish_sentence        
        }

        # Contar numero de lineas en annotated_sentence
    return entradas_dict




In [18]:
entries_bosquejo_gramatical = extract_verbs(content_bosquejo_gramatical)
df_bgi = pd.DataFrame.from_dict(entries_bosquejo_gramatical, orient='index')
df_bgi['reference'] = '17375911/7XWQXF6G'
df_bgi = df_bgi.drop_duplicates(subset=['iskonawa_sentence'])
print(len(df_bgi))

244


In [19]:
entries_grammatical_relations_iskonawa = extract_verbs_v2(content_grammatical_relations_iskonawa)
df_gri = pd.DataFrame.from_dict(entries_grammatical_relations_iskonawa, orient='index')
df_gri['reference'] = '17375911/GSDUJ7ZH'
df_gri = df_gri.drop_duplicates(subset=['iskonawa_sentence'])
print(len(df_gri))

17


In [20]:
entries_prefijos_iskonawa = extract_verbs_v2(content_prefijos_iskonawa)
df_pi = pd.DataFrame.from_dict(entries_prefijos_iskonawa, orient='index')
df_pi['reference'] = '17375911/PJJRIJT4'
df_pi = df_pi.drop_duplicates(subset=['iskonawa_sentence'])
print(len(df_pi))

105


In [21]:
def clean_text(text):
    # Reemplazar caracteres no deseados
    lines = text.split('\n')
    lines = [line for line in lines if "Ah eh eh" not in line]
    text = '\n'.join(lines)
    text = re.sub(r'\((\d+[a-zA-Z]?)\)', r'\n(\1) ', text)
    text = re.sub(r'අඈർ', 'LOC ', text)
    text = re.sub(r'අඈർ', 'LOC ', text)
    text = re.sub(r'ർඈඉ', 'COP ', text)
    text = re.sub(r'ඌංൻ', 'SIB', text)
    text = re.sub(r'ඉඈඌඌ', 'POSS', text)
    text = re.sub(r'ർൺඎඌ-ංඉൿඏ', 'CAUS-IPFV', text)
    text = re.sub(r'ർൺඎඌ-ංආඉ', 'CAUS-IMP', text)
    text = re.sub(r' ංඉൿඏ', 'IPFV', text)
    text = re.sub(r'ඉൿඏ', 'PFV ', text)
    text = re.sub(r'ൾඏංൽ', 'EVID', text)
    text = re.sub(r'ඉඈඌඉ', 'POSP', text)
    text = re.sub(r'ඉඋඈඉ', 'PROP', text)
    text = re.sub(r'ඌ඀', 'SG', text)
    text = re.sub(r'ඈ', 'O ', text)
    text = re.sub(r'ඌ', 'S', text)
    text = re.sub(r'’', '’', text)
    return text

entries_canciones_iskonawa = clean_text(content_canciones_iskonawa)
entries_canciones_iskonawa = extract_verbs(entries_canciones_iskonawa)
df_ci = pd.DataFrame.from_dict(entries_canciones_iskonawa, orient='index')
df_ci['reference'] = '17375911/JYUAVIM5'
df_ci = df_ci.drop_duplicates(subset=['iskonawa_sentence'])
print(len(df_ci))


23


In [22]:
df_temp = pd.concat([df_bgi, df_gri, df_pi, df_ci])
print(df_temp.shape)

(389, 10)


### Clean data

In [23]:
df_errors = df_temp[(df_temp['num_lineas_significado'] > 1) | (df_temp['num_lineas_sufijos'] > 1) | (df_temp['num_lineas_spanish_sentence'] > 1)]
df_errors['index'] = df_errors.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_errors['index'] = df_errors.index


In [24]:
# Print df_errors nicely
def print_dataframe(df):
    for row in df.itertuples():
        print(f"Reference: {row.reference}")
        print(f"Index: {row.index}")
        print(f"Oración Iskonawa: {row.iskonawa_sentence}")
        print(f"Oración Sufijos: {row.suffix_sentence}")
        print(f"Oración Significado: {row.annotated_sentence}")
        print(f"Oración Español: {row.spanish_sentence}")
        print("-" * 50)

# print_dataframe(df_errors)

In [25]:
# rows to drop
rows_to_drop = ['48', '77', '78', '79', '87', '89', '106', '114', '165', '215', '249', '285','287']
new_df_bgi = df_bgi[~df_bgi.index.isin(rows_to_drop)]


rows_to_drop = ['1a']
new_df_gri = df_gri[~df_gri.index.isin(rows_to_drop)]
rows_to_drop = ['1a', '1b', '1c', '1d', '2a', '2b', '2c', '2d',
                '8a', '8b',
                '9a', '9b', '9c',
                '10a', '10b', '10c', '11a', '11b', '11c', '11d', '11e',
                '12a', '12b', '12c', '13', '14a', '14b', '14c',
                '16a', '16b', '16c', '20',
                '21a', '21b', '21c', '21d', '22', '23', '24', '28a', '28b', '28c', '32a']
new_df_pi = df_pi[~df_pi.index.isin(rows_to_drop)]


new_df = pd.concat([new_df_bgi, new_df_gri, new_df_pi, df_ci])

# Eliminar las columnas 'start_position', 'num_lineas_sufijos' y 'num_lineas_significado'
new_df = new_df.drop(columns=['start_position', 'num_lineas_sufijos', 'num_lineas_significado', 'num_lineas_spanish_sentence'])

#Eliminar cambio de linea en spanish_sentence
new_df['spanish_sentence'] = new_df['spanish_sentence'].str.replace('\n', ' ')

print(new_df.shape)

(342, 6)


In [26]:
data_file_path = 'manually_corrected_data.json'
data_df = pd.read_json(data_file_path)


for index, item in data_df.iterrows():
    new_df.loc[index, 'iskonawa_sentence'] = item["iskonawa_sentence"]
    new_df.loc[index, 'index'] = item["index"]
    new_df.loc[index, 'suffix_sentence'] = item["suffix_sentence"]
    new_df.loc[index, 'annotated_sentence'] = item["annotated_sentence"]
    new_df.loc[index, 'spanish_sentence'] = item["spanish_sentence"]
    new_df.loc[index, 'reference'] = item["reference"]


In [27]:
# Guardar el DataFrame en un archivo JSON
new_df.to_json('../json/isc_raw_extraction.json', orient='records', force_ascii=False)

## Correciones manuales

Se realizan correciones para solucionar:
- La mayoria de casos en los `iskonawa_sentence` y `suffix_sentence` no coinciden.
- Traducciones

In [28]:
new_df = pd.read_json('../json/isc_raw_extraction.json')

In [29]:
spanish_sentence_replacements = [
    (r"\(.*?\)", ""),
    (r"Germán se está volviendo flaco", "Germán está adelgazando"),
    (r"lavar se ", "lavarse "),
    (r"Yo voy duermo dentro del monte", "Yo duermo dentro del monte"),
    (r" o rinar", " orinar"),
    (r" s iempre", "  siempre"),
    (r"Germá n", "Germán"),

]


iskonawa_sentence_replacements = [
    (r"\(.*?\)", ""),
    (r"\. ", " "),
    (r" \.", ""),
    (r"\.", ""),
    (r"\*", ""),
    (r"\(.*?\)", ""),
    (r" o aka ", " oaka "),
    (r" piko ta ", " piko ta "),
    (r"ta nkara", "tankara"),
    (r"tankar a", "tankara"),
    (r"meach ia", "meachia"),
    (r"Meritani n", "Meritanin"),
    (r"Germa nin", "Germanin"),
    (r"piko ta", "pikota"),
    (r"aho n", "ahon"),
    (r"cho kikerana", "chokikerana"),
    (r"Po caripani", "Pocaripani"),
    (r"kah eranabi", "kaheranabi"),
    (r"abe rana", "aberana"),
    (r"mer ita", "merita"),
    (r"c arorinabetan", "carorinabetan"),
    (r"b akenpa", "bakenpa"),
    (r"rateheko i", "ratehekoi"),
    (r"Emenki ri", "Emenkiri"),
    (r"o nin", "onin"),
    (r"bo ankatsi", "boankatsi"),
    (r"yo ishika", "yoishika"),
    (r"rateheko ni", "ratehekoni"),
    (r"cho ki", "choki"),
    (r"o hakerana", "ohakerana"),
    (r"o hakaina", " ohakaina"),
    (r"ch oro", "choro"),
    (r"choro npakea", "choronpakea"),
    (r"ayayo hma", "ayayohma"),
    (r"ayo hma", "ayohma"),
    (r"bo anrana", "boanrana"),
    (r"mahkako ana", "mahkakoana"),
    (r"Migueln in", "Miguelnin"),
    (r"pipak ea", "pipakea"),
    (r"o hmai", "ohmai"),
    (r"inso n", "inson"),
    (r"p otapakea", "potapakea"),
    (r"hecho makoin", "hechomakoin"),
    (r"bikeho n", "bikehon"),
    (r"carorinabetanho n", "carorinabetanhon"),
    (r"ho ntinti", "hontinti"),
    (r"German k a mena iki", "German ka mena iki"),
    (r"nono nkesa", "nononkesa"),
    (r"Troh ishonko", "Trohishonko"),
    (r"piboko na", "pibokona"),
    (r"maba in", "mabain"),
    (r"Edelvi nanin", "Edelvinanin"),
    (r"chipa si", "chipasi"),
    (r"rohko37", "rohko"),
    (r"E ah ", "Eah "),
    (r"peshte i", "peshtei"),
    (r"oin makin", "oinmakin"),
]

suffix_sentence_replacements = [
    (r"\(.*?\)", ""),
    (r"\[", ""),
    (r"\]", ""),
    (r"Anton io", "Antonio"),
    (r"Germa n-nin", "German-nin"),
    (r"- ", "-"),
    (r" -", "-"),
    (r"nono n-kesa", "nonon-kesa"),
    (r"Troh isho-nko", "Trohisho-nko"),
    (r"pi-boko n-a", "pi-bokon-a"),
    (r"Edelvi na-nin", "Edelvina-nin"),
    (r"tsi- ", "tsi-"),
    (r"chi-pa si", "chi-pasi"),
    (r"rohko37", "rohko"),
    (r"E ah ", "Eah "),
]

annotated_sentence_replacements = [
    (r"- ", "-"),
    (r" -", "-"),
    (r"1.SG.S Chachi.Bai-LOC ir-IMPF", "1.SG.S Chachi Bai-LOC ir-IMPF"),
    (r"1.SG.ABS Chachi.Bai-LOC ir-IMPF", "1.SG.ABS Chachi Bai-LOC ir-IMPF"),
    (r"1SG.S río adentro bañars e-viniendo-PERF", "1SG.S río adentro bañarse-viniendo-PERF"),
    (r"eah Troh isho-nko ka-i", "eah Trohisho-nko ka-i"),
    (r"golpear .con.palo-MAL-VEN.TRAN-PERF", "golpear.con.palo-MAL-VEN.TRAN-PERF"),
    (r"Jeshuc o", "Jeshuco"),
    (r"golpear .con.palo", "golpear.con.palo"),
    (r"agarrar-PERF \(<NOMLZ\?\)-PLU", "agarrar-PERF(<NOMLZ?)-PLU"),
    (r"Marlonbran do", "Marlonbrando"),
    (r"persona .ABS", "persona.ABS"),
    (r" \d ", " "),
    (r"P ERF", "PERF"),
    (r"2sg. ACUS", "2sg.ACUS"),
    (r"pintar. con", "pintar.con"),
    (r"lavar se", "lavarse"),
    (r"bañars e", "bañarse"),
    (r"v\s+agina", "vagina"),
    (r"pec\s+ho", "pecho"),
    (r"1sg .A", "1sg.A"),
    (r"espalda/omóplato/ ala, pluma/caparazón", "ala"),
    (r"llev ar-PL-PERF", "llevar-PL-PERF"),
]

def clean_sentence(sentence, replacements):
    for pattern, replacement in replacements:
        sentence = re.sub(pattern, replacement, sentence)
    return sentence


new_df['spanish_sentence'] = new_df['spanish_sentence'].apply(lambda x: clean_sentence(x, spanish_sentence_replacements))
new_df['iskonawa_sentence'] = new_df['iskonawa_sentence'].apply(lambda x: clean_sentence(x, iskonawa_sentence_replacements))
new_df['suffix_sentence'] = new_df['suffix_sentence'].apply(lambda x: clean_sentence(x, suffix_sentence_replacements))
new_df['annotated_sentence'] = new_df['annotated_sentence'].apply(lambda x: clean_sentence(x, annotated_sentence_replacements))


# Todo elimiar De Bosquejo Gramatical las filas con la columna index igual a 96 y 97, pues no son de Iskonawa
print("Remove non Iskonawa sentences:")
print("Before", new_df.shape)
new_df = new_df[~(((new_df['index'] == '159') | (new_df['index'] == '160')) & new_df['reference'].str.contains("17375911/7XWQXF6G"))]
print("After", new_df.shape)

Remove non Iskonawa sentences:
Before (355, 6)
After (353, 6)


En `df_filtered` se listan aquellos casos cuya diferencia entre `iskonawa_sentence` y `suffix_sentence` no es debido a erroes de extracción, sino a diferencias en el propio libro.

In [30]:
df_filtered = new_df.copy()
df_filtered = df_filtered[df_filtered.apply(lambda x: len(x['iskonawa_sentence'].split()) != len(x['suffix_sentence'].split()), axis=1)]

print(df_filtered.shape)
df_filtered[['iskonawa_sentence', 'suffix_sentence', 'annotated_sentence', 'reference', 'index']]

(12, 6)


Unnamed: 0,iskonawa_sentence,suffix_sentence,annotated_sentence,reference,index
5,Oni chahu ka ikon iki,oni chahu ki iki,gente venado EVI.2 COP,17375911/7XWQXF6G,44
11,Chaho retea ena oina,chaho rete-a oin-a,venado.ABS matar-NOMLZ ver-PERF,17375911/7XWQXF6G,52
62,Koni ewan ka icha waka iki,koni-n ewa ka icha waka mena iki,anguila-GEN madre EVI.1 mucho río dentro COP,17375911/7XWQXF6G,113
129,Edelvina oinsi,Edelvina mia oin-is-i,Edelvina 2SG.ACU ver-PROG-IMPF,17375911/7XWQXF6G,189
220,Oinkin! ~ Oinwe!,oin-kin oin-we,ver-IMP ver-IMP,17375911/7XWQXF6G,286
261,tsi soi,tsi-soi,trasero-limpiar,17375911/PJJRIJT4,17a
262,tsi siki,tsi-siki,trasero-mirar.REFL,17375911/PJJRIJT4,17b
264,tsi tsomi,tsi-tsomi,trasero-pellizcar,17375911/PJJRIJT4,17d
300,pa choki,pa-choki,oreja-lavarse,17375911/PJJRIJT4,30d
309,Isabel bepara hako,Isabel be-para-hako,Isabel frente-aplanado-DIM,17375911/PJJRIJT4,33a


In [31]:
dict_acronyms = {
    '17375911/L2ZRAXYS': 'TOIC',
    '17375911/ZFR5PFEG': 'VICI',
    '17375911/7XWQXF6G': 'BGLI',
    '17375911/JYUAVIM5': 'IRCF',
    '17375911/PJJRIJT4': 'PPCI',
    '17375911/GSDUJ7ZH': 'RGLI',
}

# Crear una función para mapear los valores de 'reference' a sus acrónimos
def map_reference_to_acronym(reference):
    return dict_acronyms.get(reference, '')


# Aplicar la función a la columna 'reference' y concatenar con el índice
new_df['key'] = new_df['reference'].apply(map_reference_to_acronym) + new_df['index'].astype(str)


In [32]:
# En las columnas 'suffix_sentence' y 'annotated_sentence', los espacios deben ser maximo 1
# Eliminar espacios adicionales
new_df['iskonawa_sentence'] = new_df['iskonawa_sentence'].str.replace(r'\s+', ' ', regex=True)
new_df['suffix_sentence'] = new_df['suffix_sentence'].str.replace(r'\s+', ' ', regex=True)
new_df['annotated_sentence'] = new_df['annotated_sentence'].str.replace(r'\s+', ' ', regex=True)
new_df['spanish_sentence'] = new_df['spanish_sentence'].str.replace(r'\s+', ' ', regex=True)


In [33]:
print(new_df.shape)
new_df.to_json('../json/isc_sentences.json', orient='records', force_ascii=False)

(353, 7)
