## Scrape & Preprocess Text

Import Libraries

In [10]:
import os
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd
import time, random
import numpy as np
import geckodriver_autoinstaller

#### Scrape Dinamically Generated Pages from chileconvencion.cl

In [31]:
geckodriver_autoinstaller.install()
driver = webdriver.Firefox()
driver.implicitly_wait(20)

In [9]:
#chromedriver = '/Users/lw/Desktop/chromedriver'
#os.environ['webdriver.chrome.driver'] = \
#        chromedriver
#driver = webdriver.Chrome(chromedriver)
#driver.implicitly_wait(20)

In [32]:
driver.get('https://www.chileconvencion.cl/documentos/')
driver.find_element_by_id('nav-iniciativas-tab').click() #Navigate to page containing documents of interest.

  driver.find_element_by_id('nav-iniciativas-tab').click() #Navigate to page containing documents of interest.


In [34]:
driver.implicitly_wait(60)

Scrape html of documents table in each page.

In [35]:
docs_table = []
for i in range(96):
    soup = bs(driver.page_source)
    [docs_table.append(doc) for doc in soup.find(id='tableIniciativas').find_all(class_='even')]
    [docs_table.append(doc) for doc in soup.find(id='tableIniciativas').find_all(class_='odd')]
    if i != 95:
        driver.find_element_by_id('tableIniciativas_next').click()
    time.sleep(np.random.uniform(0,3))
    

  driver.find_element_by_id('tableIniciativas_next').click()


Retrieve url of norm proposals and associated info data

In [36]:
urls = [doc.find_all('td')[5].find('a')['href'] for doc in docs_table]
nr = [doc.find_all('td')[0].text for doc in docs_table]

df = pd.DataFrame(urls, index=nr, columns=['doc_url'])
df['name'] = [doc.find_all('td')[1].text for doc in docs_table]
df['bulletin'] = [doc.find_all('td')[2].text for doc in docs_table]
df['topic'] = [doc.find_all('td')[3].text for doc in docs_table]
df['commission'] = [doc.find_all('td')[4].text for doc in docs_table]

In [38]:
print(df[~df.doc_url.str.contains('.pdf')].shape)
missing_url_index = df[~df.doc_url.str.contains('.pdf')].index
#print(docs_table[nr.index(missing_url_index)].find_all('td')[-1].find('a')) #Problem from the site
df.drop(index=missing_url_index, inplace=True, errors='ignore') #Drop initiative w missing url from df

(3, 5)


In [59]:
df.shape

(950, 5)

#### Separate Proposals for text extraction.

There are special Norm Proposals with different formatting:
- Indigenous Proposals
- Popular Proposals
- Pdfs containing multiple proposals

In [39]:
import re

In [40]:
df['name'] = df.name.apply(lambda x: re.sub(f'\xa0', ' ', x)) # invisible formatting that hinders search
df['name'] = df.name.apply(lambda x: re.sub(f'  ', ' ', x))

ind_df = df[df.name.str.contains('ativa Popular Indígena')]
pop_df = df[df.name.str.contains('Iniciativa Popular Constituyente')]
multipl_df = df[df.name.str.contains('Iniciativas')] #Special PDFs that contain multiple proposals
conv_df = df[df.name.str.contains('iva Conv')|df.name.str.contains('iva Ind')]

In [43]:
df['name'] = df.name.apply(lambda x: re.sub(f'\xa0', ' ', x)) # invisible formatting that hinders search
df['name'] = df.name.apply(lambda x: re.sub(f'  ', ' ', x))

ind_df = df[df.name.str.lower().str.contains(r'ativa popular indígena')]
pop_df = df[df.name.str.lower().str.contains('iniciativa popular constituyente')]
multipl_df = df[df.name.str.lower().str.contains('iniciativas')] #Special PDFs that contain multiple proposals
conv_df = df[df.name.str.lower().str.contains('iva conv')|df.name.str.contains('iva Ind')]

In [62]:
info_dfs = [ind_df,pop_df,multipl_df,conv_df]
unassigned = df[~df.index.isin(pd.concat(info_dfs).index)]
nr_docs = sum([df.shape[0] for df in info_dfs])
nr_docs, nr_docs==df.shape[0] # Sanity Check

(950, True)

In [53]:
conv_df = conv_df.append(df.loc[[ind for ind in unassigned.index]])

  conv_df = conv_df.append(df.loc[[ind for ind in unassigned.index]])


In [61]:
ind_df.index = ind_df.index + '_ind'
pop_df.index = pop_df.index + '_pop'
multipl_df.index = multipl_df.index + '_mult'
conv_df.index = conv_df.index + '_conv'

In [63]:
df_file_labels = ['ind_info', 'pop_info', 'mult_info', 'conv_info']
for i in range(4):
    info_dfs[i].to_csv(f'Data/{df_file_labels[i]}.csv')

#### Get PDFs from urls

In [4]:
import requests
from fake_useragent import UserAgent

In [13]:
import pandas as pd
import time, random
import numpy as np

In [80]:
ua = UserAgent()
header1 = {'User-Agent':str(ua.msie)}
header2 = {'User-Agent':str(ua.chrome)}
def get_PDFs(df, folder):
    for i in df.index:
        url = df.loc[i,'doc_url']
        try:
            response = requests.get(url, headers=header1)
        except BaseException as error:
            try:
                time.sleep(np.random.uniform(0,5))
                response = requests.get(url, headers=header2)
            except BaseException as error:
                print(error)
                print(i)
                break
        with open(f'PDFs/{folder}/{i}.pdf', 'wb') as f:
            f.write(response.content)
        time.sleep(np.random.uniform(0,5))

In [74]:
def get_PDFs2(df, folder):
    for i in df.index:
        url = df.loc[i,'doc_url']
        try:
            response = requests.get(url, headers={'User-Agent':str(ua.random)})
        except BaseException as error:
            try:
                time.sleep(np.random.uniform(0,4))
                response = requests.get(url, headers={'User-Agent':str(ua.random)})
            except BaseException as error:
                print(error)
                print(i)
                break
        with open(f'PDFs/{folder}/{i}.pdf', 'wb') as f:
            f.write(response.content)
        time.sleep(np.random.uniform(0,4))
    os.system("printf '\a'")

In [11]:
ind_df = pd.read_csv('Data/ind_info.csv', index_col=0)
conv_df = pd.read_csv('Data/conv_info.csv', index_col=0)

In [92]:
pop_df = pd.read_csv('Data/pop_info.csv', index_col=0)

In [15]:
get_PDFs(ind_df, 'Indigenas'), get_PDFs(conv_df, 'Convencionales')
get_PDFs(multipl_df, 'Multiples'), get_PDFs(pop_df, 'Populares')

HTTPSConnectionPool(host='www.chileconvencion.cl', port=443): Max retries exceeded with url: /wp-content/uploads/2022/02/741-Iniciativa-Convencional-Constituyente-del-cc-Ignacio-Achurra-sobre-Espectro-Radioelectrico-01-02.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x14143d630>, 'Connection to www.chileconvencion.cl timed out. (connect timeout=None)'))
13_conv_conv


(None, None)

### Get text from PDFs
Different proposal types have different formating (some are scanned and need ocr)

In [94]:
import textract
import re

In [95]:
utf_chars = ({r'\\xc3\\xa1':r'á',r'\\xc3\\xa9':r'é',r'\\xc3\\xad':r'í',r'\\xc3\\xb3':r'ó',
             r'\\xc3\\xba':r'ú',r'\\xc3\\xb1':r'ñ',r'\\xef\\xac\\x81':r'fi',
             r'\\xe2\\x80\\x9c':r'"',r'\\xe2\\x80\\x9d':r'"',r'\\xef\\xac\\x82':r'fl',
             r'\\xc2\\xb0':r'',r'\\xc3\\x93':r'Ó',r'\\xc2\\xba':r'',r'\\xc3\\x91':r'Ñ',
             r'\\xe2\\x80\\x93':r'-',r'\\xc3\\x81':r'Á',r'\\xc3\\x8d':r'Í',r'\\xc2\\xa1':r'',
             r'\\xef\\x82\\xb7':r'','\xc2\xab':r'','\xc2\xbb':r'',r'\\x0c':r'',r'(?<=\w)([\\]n)((?=[a-z]))':r' '})
def conv_chars(string, chars=utf_chars):
    for utf, char in chars.items():
        string=re.sub(utf, char, string)
    return string

In [None]:
text_str = str(text)
for i in (re.findall(r'https(.*?)\s', text_str)):
    text_str = re.sub(f'r"{i}"', r'', text_str)

In [96]:
def extract_text_pdf(directory):
    docs, names, errors = [], [], [] #dict()
    for file in os.listdir(directory):
        try:
            text = textract.process(f'{directory}/{file}', method='pdfminer')
            docs.append(text), names.append(re.sub(r'.pdf', r'', file))
        except UnicodeDecodeError: #BaseException as error:
            errors.append(file) #errors[file:error]
            continue
    return docs, names, errors

###### Indigenous Proposals

In [102]:
#ind_docs, ind_docs_ind, ind_errors = extract_text_pdf('PDFs/Indigenas')
pop_docs, pop_docs_ind, pop_errors = extract_text_pdf('PDFs/Populares')

In [99]:
# Preprocess indigenous initiatives
ind_docs_clean, ind_docs_error = [], []
for text in ind_docs:
    text = conv_chars(str(text))
    text = re.findall(r'n de la norma(.*?)Archivos Adjuntos', text)
    if len(text)==1:
        ind_docs_clean.append(text[0]),
    else:
        ind_docs_error.append(ind_docs.index(text))
len(ind_docs_error)

0

In [100]:
# Due to the formatting and extraction, sometimes the proposal topic is printed
# after the argumentation begins.
topic_regex = r'[\\]{1,2}n[\\]{1,2}n.+?[\\]{1,2}n\d\s-\s.+?[\\]{1,2}n[\\]n'
ind_docs_clean = [re.sub(topic_regex, r'', doc) for doc in ind_docs_clean]

In [101]:
ind_doc = pd.DataFrame(ind_docs_clean, index=ind_docs_ind, columns=['text'])
ind_doc.to_csv('Data/ind_docs.csv')
ind_doc.head(3)

Unnamed: 0,text
270_ind_ind,\n\nSomos una comunidad muy participativa y de...
312_ind_ind,"\n\nPara la presentación de esta propuesta, el..."
362_ind_ind,\n\nI) GENERALIDADES. \nLa presente propuesta...


Indigenous proposals in PDF with multiple proposals

In [None]:
text = conv_chars(str(textract.process('PDFs/Multiples/1_mult.pdf', method='pdfminer')))
texts = re.findall(r'Construcción de la norma(.*?)Archivos Adjuntos', text)
texts = [re.sub(topic_regex, r'', text) for doc in ind_docs_clean]
ind_mult_ind = [f'{i}_ind_m' for i in range(len(texts))]
ind_mult_doc = pd.DataFrame(texts, index=ind_mult_ind, columns=['text'])
ind_mult_doc.to_csv('Data/ind_mult_docs.csv')

###### Popular Proposals

In [151]:
# Preprocess popular initiatives
pop_docs_clean, pop_docs_error = [], []
for doc in pop_docs:
    text = conv_chars(str(doc))
    text_sm = re.findall(r'Problema a Solucionar(.*?)Archivos Adjuntos', text)
    if len(text_sm) != 1:
        #pop_docs_error.append(pop_docs.index(doc))
        text_sm = re.findall(r'Problema a Solucionar(.+)', text)
        if len(text_sm) != 1:
            text_sm = re.findall(r'PROBLEMA A SOLUCIONAR(.+)', text)
            if len(text_sm) !=1:
                text_sm = re.findall(r'Problema Por Solucionar(.+)', text)
                if len(text_sm) != 1:
                    pop_docs_error.append(pop_docs.index(doc))
    pop_docs_clean.append(text_sm[0]) #pop_docs_clean.append(text_sm[0])
len(pop_docs_error)

0

In [153]:
# Due to the formatting and extraction, sometimes the proposal topic is printed
# after the argumentation begins.
sidebox_regex = r'ESTA\sPROPUESTA\sCUENTA\sCON.+DISPONIBLE\sDESDE'
pop_docs_clean = [re.sub(sidebox_regex, r'', doc) for doc in pop_docs_clean]

In [156]:
pop_doc = pd.DataFrame(pop_docs_clean, index=pop_docs_ind, columns=['text'])
pop_doc.to_csv('Data/pop_docs.csv')
pop_doc.head(3)

Unnamed: 0,text
333_pop_pop,\n\nFalta de perspectiva de género y enfoque i...
343_pop_pop,\n\nExisten múltiples expresiones de creencias...
538_pop_pop,\n\nSituación Ideal\n\n


Popular proposals in PDF with multiple proposals

In [None]:
#mult_pop_text = conv_chars(str(textract.process('PDFs/Multiples/2_mult.pdf', method='pdfminer')))
#pop_regex = r'Problema a Solucionar(.*?)Archivos Adjuntos'
#pop_texts = re.findall(pop_regex, mult_pop_text)
#pop_texts = [re.sub(topic_regex, '', text) for text in pop_texts]

pop_mult_ind = [f'{i}_pop_m' for i in range(len(pop_texts))]
pop_mult_doc = pd.DataFrame(pop_texts, columns=['text'], index=pop_mult_ind)
pop_mult_doc.to_csv('Data/pop_mult_docs.csv')
len(pop_texts)==78-53

Assembly Proposals

In [None]:
# Transform Assembly initiatives
test = textract.process('PDFs/Convencionales/80.pdf', method='tesseract', language='spa')

In [None]:
conv_docs = []
conv_ind = []

for pdf in os.listdir('PDFs/Convencionales')[:5]:
    text = textract.process(f'PDFs/Convencionales/{pdf}', method='tesseract', language='spa')
    text = conv_chars(str(text))
    if len(text)>10:
        conv_docs.append(spa_char(str(text)))
        conv_ind.append(re.sub(r'.pdf', '_conv', pdf))

#conv_doc = pd.DataFrame(conv_docs, index=conv_ind)
#conv_doc.to_csv('Data/conv_docs.csv', index=False)

In [None]:
len(conv_docs), len(conv_ind)

In [None]:
failed = [doc for doc in conv_docs if len(doc)<100]
failed_ind = [conv_docs.index(doc) for doc in conv_docs if len(doc)<1000]
#[conv_docs.pop(i) for i in failed_ind]
#[conv_ind.pop(i) for i in failed_ind] #FIX IF TIME #from pdf2image import convert_from_path
#len(conv_docs), len(conv_ind)

Popular proposals in PDF with multiple proposals

#### Create corpus

In [None]:
ind_df = pd.read_csv('Data/ind_docs.csv', names=['text'], skiprows=[0])
pop_df = pd.read_csv('Data/pop_docs.csv', names=['text'], skiprows=[0])
conv_df = pd.read_csv('Data/conv_docs.csv', names=['text'], skiprows=[0])
mult_ind_df = pd.read_csv('Data/mult_ind_docs.csv', names=['text'], skiprows=[0])
mult_pop_df = pd.read_csv('Data/pop_mult_docs.csv', names=['text'], skiprows=[0])

In [None]:
conv_doc = pd.DataFrame(conv_docs, index=conv_ind, columns=['text']).head(2)
conv_doc.to_csv('Data/conv_docs.csv', index=conv_ind)

In [None]:
dfs = [mult_pop_df, mult_ind_df, conv_df, pop_df, ind_df]

In [None]:
corpus = []
for df in dfs:
    [corpus.append(spa_char(doc).split('\\n\\n')) for doc in df.text]

In [None]:
corpus_no_s = []
for doc in corpus:
    corpus_no_s.append([re.sub(r'\\n', r' ', paragraph) for paragraph in doc])

In [None]:
len(corpus_no_s)

In [None]:
corpus_par = []
for doc in corpus_no_s:
    for par in doc:
        if par != '':
            corpus_par.append(par)

In [None]:
corpus[0]

In [None]:
print(len(corpus))
print(sum([len(d.split(' ')) for d in corpus]))

In [None]:
from 

In [None]:
len(os.listdir('PDFs/Convencionales'))

#### Tests

In [None]:
df.shape

In [None]:
df[df.topic.str.contains('Colegios profesionales')]

In [None]:
#sorted([re.sub(r'.pdf', '', pdf) for pdf in os.listdir('PDFs/Indigenas')])

In [None]:
#pd.read_csv('Data/ind_docs.csv')

In [None]:
sp_text = spa_char(str(text))

In [None]:
#big_ind_text = textract.process(f'PDFs/Multiples/1.pdf', method='pdfminer')
big_ind_text = spa_char(str(big_ind_text))
big_ind_iter = re.findall(r'n de la norma(.*?)Archivos Adjuntos', big_ind_text)

#### Old code

In [None]:
def get_PDF(df, folder):
    for i in df.index:
        url = df.loc[i,'doc_url']
        try:
            response = requests.get(url)
        except BaseException as error:
            print(error)
            print(i)
            continue
        with open(f'PDFs/{folder}/{i}.pdf', 'wb') as f:
            f.write(response.content)

In [None]:
nr, nombre, boletin, materia, comision, url = [], [], [], [], [], []

for doc in docs_table:
    elems = docs_table[-1].find_all('td')
    nr.append(elems[0].text)
    nombre.append(elems[1].text)
    boletin.append(elems[2].text)
    materia.append(elems[3].text)
    comision.append(elems[4].text)
    url.append(elems[5].find('a')['href'])

In [None]:
import PyPDF2
file = open('PDFs/1.pdf', 'rb')
fileReader = PyPDF2.PdfFileReader(file)
print(fileReader.numPages)
text = fileReader.getPage(0).extractText()

In [None]:
# Sanity check
m1 = df.name.str.contains('ativa Popular Indígena')
m2 = df.name.str.contains('Iniciativa Popular Constituyente')
m3 = df.name.str.contains('Iniciativa Convenci')
m4 = df.name.str.contains('Iniciativas')
all_df = df[m1|m2|m3|m4]
df[~df.index.isin(all_df.index)]

In [None]:
for doc_index in ind_df.index:
    url = ind_df.loc[doc_index,'doc_url']
    response = requests.get(url)
    with open(f'PDFs/Indigenas/{doc_index}.pdf', 'wb') as f:
        f.write(response.content)
for doc_index in conv_df.index:
    url = conv_df.loc[doc_index,'doc_url']
    response = requests.get(url)
    with open(f'PDFs/Convencionales/{doc_index}.pdf', 'wb') as f:
        f.write(response.content)
for doc_index in multipl_df.index:
    url = multipl_df.loc[doc_index,'doc_url']
    response = requests.get(url)
    with open(f'PDFs/Multiples/{doc_index}.pdf', 'wb') as f:
        f.write(response.content)
for doc_index in pop_df.index:
    url = pop_df.loc[doc_index,'doc_url']
    response = requests.get(url)
    with open(f'PDFs/Populares/{doc_index}.pdf', 'wb') as f:
        f.write(response.content)