# PyPDF

In [29]:
import os
import re
import sys
import pypdf
import unicodedata
from langdetect import detect, DetectorFactory
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

In [30]:
utils_dir = os.path.abspath('../src/utils')
print(f"Directorio de utils: {utils_dir}")
sys.path.append(utils_dir)

Directorio de utils: d:\PUCP\2024-2\Tesis II\chana-mt\src\utils


In [31]:
class DictionaryEntry:
    def __init__(self, word, grammatical_category, meaning):
        self.word = word
        self.grammatical_category = grammatical_category
        self.meaning = meaning

    def __repr__(self):
        return f"{self.word} ({self.grammatical_category}): {self.meaning}"

In [32]:
def extract_pages(reader, init, end):
    pages = []
    for i in range(init, end):
        page = reader.pages[i]
        pages.append(page.extract_text())
    return pages

In [33]:
def pages_to_string(pdf_pages):
    text = ''
    for page in pdf_pages:
        text += page
    return text

In [34]:
def remove_prefix_phrases(text_list):
    pattern = r"^(FORMABIAP\s*-\s*AIDESEP\s*-\s*ISEPL\d*|Diccionario\s*Awajún\d*)"

    cleaned_list = []

    for text in text_list:
        cleaned_text = re.sub(pattern, "", text).strip()
        cleaned_list.append(cleaned_text)

    return cleaned_list

In [35]:
def remove_last_line(text_list):
    pattern = r"\n\w+$"

    cleaned_list = []

    for text in text_list:
        cleaned_text = re.sub(pattern, "", text)
        cleaned_list.append(cleaned_text)

    return cleaned_list

In [36]:
def search_dictionary_entries(text_list):
    full_text = "\n".join(text_list)
    full_text = full_text.replace("\n", " ")
    full_text = ' '.join(full_text.split())

    pattern = r'(\w+ (s\.|adj\.|adv\.|conj\.|interj\.|onom\.|interr\.|pron\.|v\.))'
    entries = re.findall(pattern, full_text)

    result = [entry[0] for entry in entries]

    return result

In [37]:
def process_dictionary_entries(text_list):
    # Combine the list of text entries into a single string
    full_text = "\n".join(text_list)
    full_text = full_text.replace("\n", " ")
    full_text = ' '.join(full_text.split())

    # Regular expression pattern to capture word and meaning
    pattern = r'(\w+ (s\.|adj\.|adv\.|conj\.|interj\.|onom\.|interr\.|pron\.|v\.))(.*?)(?=\w+ (s\.|adj\.|adv\.|conj\.|interj\.|onom\.|interr\.|pron\.|v\.))'
    entries_with_text = re.findall(pattern, full_text, re.DOTALL)

    # Initialize the dictionary to store word-meaning pairs

    dictionary = []
    
    for entry in entries_with_text:
        word = entry[0].strip().split()[0]
        grammatical_category = entry[0].strip().split()[1]
        meaning = entry[2].strip()
        
        # Add the cleaned word and its meaning to the dictionary
        
        dictionary.append(DictionaryEntry(word, grammatical_category, meaning))

    return dictionary
# Problema con la letra al costado

# Lo que pasa es que a veces dicha letra de al costado, se va al inicio o al final.
# No puedo controlarlar y a veces se añade a a la primera palabra o sino al final de la oracion.

In [38]:
def remove_accents(input_str):
    """Remove accents from a string"""
    input_str = input_str.lower()
    return ''.join(c for c in unicodedata.normalize('NFD', input_str) if unicodedata.category(c) != 'Mn')

In [39]:
def process_meaning(word, meaning, grammatical_category):


    meaning = meaning.replace("||", "").strip()

    meaning = re.sub(r'[¡!¿?]', '.', meaning)

    sentences = re.split(r'\.\s*', meaning)
    sentences = [s.strip() for s in sentences if s.strip()]

    mono_agr = []
    if len(sentences) >= 2:
        first_two_sentences = sentences[:2]
        mono_agr = [first_two_sentences[0], first_two_sentences[1]]
    
    agr_sentences = []
    es_sentences = []

    search_word = word[:4] if grammatical_category == 'v.' and len(word) >= 4 else word
    search_word = remove_accents(search_word)

    i = 2
    while i < len(sentences):
        sentence = remove_accents(sentences[i])

        if re.search(rf'{re.escape(search_word)}\w*', sentence):
            if(i != len(sentences)-1):
                agr_sentences.append(sentences[i])
            
            if i + 1 <len(sentences):
                es_sentences.append(sentences[i + 1])
            i += 2
        else:
            i += 1

    return mono_agr, agr_sentences, es_sentences

In [40]:
def process_multiple_entries(entries):
    mono_agr_all = []
    agr_sentences_all = []
    es_sentences_all = []
    size = len(entries)
    i = 0
    for entry in entries:
        mono_agr, agr_sentences, es_sentences = process_meaning(entry.word, entry.meaning, entry.grammatical_category)

        mono_agr_all.extend(mono_agr)
        agr_sentences_all.extend(agr_sentences)
        es_sentences_all.extend(es_sentences)
        if (len(agr_sentences) != len(es_sentences)):
            print(f'{i}/{size} - {entry.word}')
        i += 1

    return mono_agr_all, agr_sentences_all, es_sentences_all

In [41]:
DetectorFactory.seed = 0

nlp_es = spacy.load("es_core_news_sm")

@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector()

nlp_es.add_pipe("language_detector", last=True)

def is_spanish(sentence):
    try:
        return detect(sentence) == 'es'
    except Exception:
        return False # In case it doesn't recognize the language
    
def has_spanish_words(sentence):
    doc = nlp_es(sentence)
    detect_language = doc._.language
    return detect_language['language'] == 'es'

def filter_sentences(text):
    text = text.replace("\n", "")
    sentences = re.split(r'(?<=[.!?])', text)
    filtered_sentences = [
        sentence for sentence in sentences
        if not (is_spanish(sentence) or has_spanish_words(sentence))
    ]

    result = "\n".join(filtered_sentences)
    return result

In [42]:
def add_to_file(text, output_file):
    with open(output_file, 'a') as file:
        for sentence in text:
            file.write(sentence + '\n')

In [43]:
pdf_path=('../data/pdf/Diccionario-Awajun-Castellano.pdf')
file_mono_agr_path = "../data/raw/1_Diccionario Awajun-es.agr"
file_agr_path = "../data/raw/1_Diccionario Awajun-Castellano.agr"
file_es_path = "../data/raw/1_Diccionario Awajun-Castellano.es"
reader = pypdf.PdfReader(pdf_path)

In [44]:
page = reader.pages[8]
page.extract_text()

'FORMABIAP - AIDESEP - ISEPL8áabai  adj. Etsa yumigkagtug inagnak “Kusui \naatus áabai” yumignum chichatai. Ha \nde ser. Es una expresión de conjuro, sea bendiciendo o maldiciendo. Término que utilizaron los personajes de la cosmología awajún, Etsa, Nugkui, Mamuk durante la evolución humana para la creación de las cosas, relacionado a hechos y creaciones. Aátus áabai. Que así sea.\naágkamu s. Nugka atak takasmig tusa takatsuk \nkuitamtai, atak uchi tsakainak takastinme tusa anettsamu. Reserva. Nugka aágkamu. \nTerreno en reserva. || Espaciado, ralo. \nTanish aágkamu. El cercado de la casa está espaciado. || adj. Marginado. Shawitak \naágkamu juwakme. Shawit ha quedado marginado. \náagkeas s. Shigki etsaka najanamu, yaunchuk \nmuun maaninak takajaku ainawai. Etsa Ajaimpijai yakí uyai tukut nunú japa wampun kautma nuna dakak máati tusa áagkeasan najanawag dekapdaisajui. Lanza de pona. Arma ofensiva labrada, hecha de pona, con punta fina con lados adentados y encorvados, utilizada en la g

In [45]:
page = reader.pages[45]
page.extract_text()

'Diccionario Awajún45ayú adv. Sujuinakui, ipatainakui aíbau. \nSegatainakui  sujimtsuk aíbau.  Bien. \n¡Bueno!, ¡Ya!, ¡Está bien! expresión de aceptación.  Ayú, yamai minash ujatkata. Bien, ahora tu cuéntame. \nayúi  s. Jacha numiji. Aéntsu daaji. Mango de \nhacha. Jacha ayúiji kupinkame. El mango \ndel hacha se ha quebrado. Diich Ayúi shiig nantsemnuwe. El tío Ayúi danza muy bien. \nayúgkamat v. Utugchata epegtsuk ayatak tutitag \netumat. Complicar. Diich nawanji utug-chatjin epegtuatakama ayatak ayúgkama juwakme. Mi tío en vez de solucionar el problema de su hija lo ha complicado.   \nayújut  v. Uchi, kuntin tagkumamu, aénts jau \nayúgmau. Dar de comer, alimentar, nutrir. Mamai kijus tagkumamun dautua ayujui. Mamai le dá de comer al pihuicho. \nayúknat v. Uchi tsakat wais ayúknamu. Uchi \nyunchitan achimaku páampa puwajin ayúknamu. Hacer lavarse la boca con agua. Nugkui umajin ayúknawai. Nugkui le hace lavar la boca a su hermano. \náyum s. Atash, paapu, patu ayum. Aves del corral \nm

In [46]:
pages_a = extract_pages(reader,8,46)
pages_b = extract_pages(reader,47,57)
pages_ch = extract_pages(reader,59,65)
pages_d = extract_pages(reader,67,77)
pages_e = extract_pages(reader,79,86)
pages_i = extract_pages(reader,88,100)
pages_j = extract_pages(reader,102,109)
pages_k = extract_pages(reader,111,126)
pages_m = extract_pages(reader,128,132)
pages_n = extract_pages(reader,134,140)
pages_p = extract_pages(reader,142,152)
pages_s = extract_pages(reader,154,159)
pages_sh = extract_pages(reader,161,164)
pages_t = extract_pages(reader,166,177)
pages_ts = extract_pages(reader,179,185)
pages_u = extract_pages(reader,187,196)
pages_w = extract_pages(reader,198,208)
pages_y = extract_pages(reader,210,218)

In [47]:
print(pages_a[37])

Diccionario Awajún45ayú adv. Sujuinakui, ipatainakui aíbau. 
Segatainakui  sujimtsuk aíbau.  Bien. 
¡Bueno!, ¡Ya!, ¡Está bien! expresión de aceptación.  Ayú, yamai minash ujatkata. Bien, ahora tu cuéntame. 
ayúi  s. Jacha numiji. Aéntsu daaji. Mango de 
hacha. Jacha ayúiji kupinkame. El mango 
del hacha se ha quebrado. Diich Ayúi shiig nantsemnuwe. El tío Ayúi danza muy bien. 
ayúgkamat v. Utugchata epegtsuk ayatak tutitag 
etumat. Complicar. Diich nawanji utug-chatjin epegtuatakama ayatak ayúgkama juwakme. Mi tío en vez de solucionar el problema de su hija lo ha complicado.   
ayújut  v. Uchi, kuntin tagkumamu, aénts jau 
ayúgmau. Dar de comer, alimentar, nutrir. Mamai kijus tagkumamun dautua ayujui. Mamai le dá de comer al pihuicho. 
ayúknat v. Uchi tsakat wais ayúknamu. Uchi 
yunchitan achimaku páampa puwajin ayúknamu. Hacer lavarse la boca con agua. Nugkui umajin ayúknawai. Nugkui le hace lavar la boca a su hermano. 
áyum s. Atash, paapu, patu ayum. Aves del corral 
macho. || Uchi 

In [48]:
pages_a = remove_prefix_phrases(pages_a)
pages_b = remove_prefix_phrases(pages_b)
pages_ch = remove_prefix_phrases(pages_ch)
pages_d = remove_prefix_phrases(pages_d)
pages_e = remove_prefix_phrases(pages_e)
pages_i = remove_prefix_phrases(pages_i)
pages_j = remove_prefix_phrases(pages_j)
pages_k = remove_prefix_phrases(pages_k)
pages_m = remove_prefix_phrases(pages_m)
pages_n = remove_prefix_phrases(pages_n)
pages_p = remove_prefix_phrases(pages_p)
pages_s = remove_prefix_phrases(pages_s)
pages_sh = remove_prefix_phrases(pages_sh)
pages_t = remove_prefix_phrases(pages_t)
pages_ts = remove_prefix_phrases(pages_ts)
pages_u = remove_prefix_phrases(pages_u)
pages_w = remove_prefix_phrases(pages_w)
pages_y = remove_prefix_phrases(pages_y)

In [49]:
pages_a = remove_last_line(pages_a)
pages_b = remove_last_line(pages_b)
pages_ch = remove_last_line(pages_ch)
pages_d = remove_last_line(pages_d)
pages_e = remove_last_line(pages_e)
pages_i = remove_last_line(pages_i)
pages_j = remove_last_line(pages_j)
pages_k = remove_last_line(pages_k)
pages_m = remove_last_line(pages_m)
pages_n = remove_last_line(pages_n)
pages_p = remove_last_line(pages_p)
pages_s = remove_last_line(pages_s)
pages_sh = remove_last_line(pages_sh)
pages_t = remove_last_line(pages_t)
pages_ts = remove_last_line(pages_ts)
pages_u = remove_last_line(pages_u)
pages_w = remove_last_line(pages_w)
pages_y = remove_last_line(pages_y)

In [50]:
print(len(pages_a))
print(pages_a[0])

38
áabai  adj. Etsa yumigkagtug inagnak “Kusui 
aatus áabai” yumignum chichatai. Ha 
de ser. Es una expresión de conjuro, sea bendiciendo o maldiciendo. Término que utilizaron los personajes de la cosmología awajún, Etsa, Nugkui, Mamuk durante la evolución humana para la creación de las cosas, relacionado a hechos y creaciones. Aátus áabai. Que así sea.
aágkamu s. Nugka atak takasmig tusa takatsuk 
kuitamtai, atak uchi tsakainak takastinme tusa anettsamu. Reserva. Nugka aágkamu. 
Terreno en reserva. || Espaciado, ralo. 
Tanish aágkamu. El cercado de la casa está espaciado. || adj. Marginado. Shawitak 
aágkamu juwakme. Shawit ha quedado marginado. 
áagkeas s. Shigki etsaka najanamu, yaunchuk 
muun maaninak takajaku ainawai. Etsa Ajaimpijai yakí uyai tukut nunú japa wampun kautma nuna dakak máati tusa áagkeasan najanawag dekapdaisajui. Lanza de pona. Arma ofensiva labrada, hecha de pona, con punta fina con lados adentados y encorvados, utilizada en la guerra y en la caza.
aágket    v.   

In [51]:
dict_a = process_dictionary_entries(pages_a)
dict_b = process_dictionary_entries(pages_b)
dict_ch = process_dictionary_entries(pages_ch)
dict_d = process_dictionary_entries(pages_d)
dict_e = process_dictionary_entries(pages_e)
dict_i = process_dictionary_entries(pages_i)
dict_j = process_dictionary_entries(pages_j)
dict_k = process_dictionary_entries(pages_k)
dict_m = process_dictionary_entries(pages_m)
dict_n = process_dictionary_entries(pages_n)
dict_p = process_dictionary_entries(pages_p)
dict_s = process_dictionary_entries(pages_s)
dict_sh = process_dictionary_entries(pages_sh)
dict_t = process_dictionary_entries(pages_t)
dict_ts = process_dictionary_entries(pages_ts)
dict_u = process_dictionary_entries(pages_u)
dict_w = process_dictionary_entries(pages_w)
dict_y = process_dictionary_entries(pages_y)

In [52]:
for entry in dict_a:
    print(entry)

áabai (adj.): Etsa yumigkagtug inagnak “Kusui aatus áabai” yumignum chichatai. Ha de ser. Es una expresión de conjuro, sea bendiciendo o maldiciendo. Término que utilizaron los personajes de la cosmología awajún, Etsa, Nugkui, Mamuk durante la evolución humana para la creación de las cosas, relacionado a hechos y creaciones. Aátus áabai. Que así sea.
aágkamu (s.): Nugka atak takasmig tusa takatsuk kuitamtai, atak uchi tsakainak takastinme tusa anettsamu. Reserva. Nugka aágkamu. Terreno en reserva. || Espaciado, ralo. Tanish aágkamu. El cercado de la casa está espaciado. || adj. Marginado. Shawitak aágkamu juwakme. Shawit ha quedado marginado.
áagkeas (s.): Shigki etsaka najanamu, yaunchuk muun maaninak takajaku ainawai. Etsa Ajaimpijai yakí uyai tukut nunú japa wampun kautma nuna dakak máati tusa áagkeasan najanawag dekapdaisajui. Lanza de pona. Arma ofensiva labrada, hecha de pona, con punta fina con lados adentados y encorvados, utilizada en la guerra y en la caza.
aágket (v.): Juka 

In [24]:
mono_agr_a, agr_sentences_a, es_sentences_a= process_multiple_entries(dict_a)
mono_agr_b, agr_sentences_b, es_sentences_b= process_multiple_entries(dict_b)
mono_agr_ch, agr_sentences_ch, es_sentences_ch= process_multiple_entries(dict_ch)
mono_agr_d, agr_sentences_d, es_sentences_d= process_multiple_entries(dict_d)
mono_agr_e, agr_sentences_e, es_sentences_e= process_multiple_entries(dict_e)
mono_agr_i, agr_sentences_i, es_sentences_i= process_multiple_entries(dict_i)
mono_agr_j, agr_sentences_j, es_sentences_j= process_multiple_entries(dict_j)
mono_agr_k, agr_sentences_k, es_sentences_k= process_multiple_entries(dict_k)
mono_agr_m, agr_sentences_m, es_sentences_m= process_multiple_entries(dict_m)
mono_agr_n, agr_sentences_n, es_sentences_n= process_multiple_entries(dict_n)
mono_agr_p, agr_sentences_p, es_sentences_p= process_multiple_entries(dict_p)
mono_agr_s, agr_sentences_s, es_sentences_s= process_multiple_entries(dict_s)
mono_agr_sh, agr_sentences_sh, es_sentences_sh= process_multiple_entries(dict_sh)
mono_agr_t, agr_sentences_t, es_sentences_t= process_multiple_entries(dict_t)
mono_agr_ts, agr_sentences_ts, es_sentences_ts= process_multiple_entries(dict_ts)
mono_agr_u, agr_sentences_u, es_sentences_u= process_multiple_entries(dict_u)
mono_agr_w, agr_sentences_w, es_sentences_w= process_multiple_entries(dict_w)
mono_agr_y, agr_sentences_y, es_sentences_y= process_multiple_entries(dict_y)

In [25]:
print(len(dict_a))
print(len(dict_b))
print(len(dict_ch))
print(len(dict_d))
print(len(dict_e))
print(len(dict_i))
print(len(dict_j))
print(len(dict_k))
print(len(dict_m))
print(len(dict_n))
print(len(dict_p))
print(len(dict_s))
print(len(dict_sh))
print(len(dict_t))
print(len(dict_ts))
print(len(dict_u))
print(len(dict_w))
print(len(dict_y))


680
155
130
223
172
322
227
474
117
198
316
163
80
349
190
270
288
210


In [26]:
add_to_file(mono_agr_a,file_mono_agr_path)
add_to_file(mono_agr_b,file_mono_agr_path)
add_to_file(mono_agr_ch,file_mono_agr_path)
add_to_file(mono_agr_d,file_mono_agr_path)
add_to_file(mono_agr_e,file_mono_agr_path)
add_to_file(mono_agr_i,file_mono_agr_path)
add_to_file(mono_agr_j,file_mono_agr_path)
add_to_file(mono_agr_k,file_mono_agr_path)
add_to_file(mono_agr_m,file_mono_agr_path)
add_to_file(mono_agr_n,file_mono_agr_path)
add_to_file(mono_agr_p,file_mono_agr_path)
add_to_file(mono_agr_s,file_mono_agr_path)
add_to_file(mono_agr_sh,file_mono_agr_path)
add_to_file(mono_agr_t,file_mono_agr_path)
add_to_file(mono_agr_ts,file_mono_agr_path)
add_to_file(mono_agr_u,file_mono_agr_path)
add_to_file(mono_agr_w,file_mono_agr_path)
add_to_file(mono_agr_y,file_mono_agr_path)

In [27]:
add_to_file(agr_sentences_a,file_agr_path)
add_to_file(agr_sentences_b,file_agr_path)
add_to_file(agr_sentences_ch,file_agr_path)
add_to_file(agr_sentences_d,file_agr_path)
add_to_file(agr_sentences_e,file_agr_path)
add_to_file(agr_sentences_i,file_agr_path)
add_to_file(agr_sentences_j,file_agr_path)
add_to_file(agr_sentences_k,file_agr_path)
add_to_file(agr_sentences_m,file_agr_path)
add_to_file(agr_sentences_n,file_agr_path)
add_to_file(agr_sentences_p,file_agr_path)
add_to_file(agr_sentences_s,file_agr_path)
add_to_file(agr_sentences_sh,file_agr_path)
add_to_file(agr_sentences_t,file_agr_path)
add_to_file(agr_sentences_ts,file_agr_path)
add_to_file(agr_sentences_u,file_agr_path)
add_to_file(agr_sentences_w,file_agr_path)
add_to_file(agr_sentences_y,file_agr_path)

In [28]:
add_to_file(es_sentences_a,file_es_path)
add_to_file(es_sentences_b,file_es_path)
add_to_file(es_sentences_ch,file_es_path)
add_to_file(es_sentences_d,file_es_path)
add_to_file(es_sentences_e,file_es_path)
add_to_file(es_sentences_i,file_es_path)
add_to_file(es_sentences_j,file_es_path)
add_to_file(es_sentences_k,file_es_path)
add_to_file(es_sentences_m,file_es_path)
add_to_file(es_sentences_n,file_es_path)
add_to_file(es_sentences_p,file_es_path)
add_to_file(es_sentences_s,file_es_path)
add_to_file(es_sentences_sh,file_es_path)
add_to_file(es_sentences_t,file_es_path)
add_to_file(es_sentences_ts,file_es_path)
add_to_file(es_sentences_u,file_es_path)
add_to_file(es_sentences_w,file_es_path)
add_to_file(es_sentences_y,file_es_path)