# Fase 01 - Obtenció de dades

In [83]:
from downloadDiaryOfSessions.main import download
import json
from os.path import basename, isdir, isfile
from os import listdir, makedirs
import pandas as pd
from pandas import json_normalize
from pdf2image import convert_from_path
import requests
import subprocess

## 1.- Descarregar diaris de sessions

In [89]:
# Legislature numbers whose session diaries (and deputies) are fetched.
legislatures = [12, 13, 14]
# Number of consecutive failed downloads after which the download loops stop
# probing further diary numbers for a legislature.
max_errors = 50
# Destination folders for the downloaded Congress (dscd) and Senate (dss) PDFs.
dscd_path = './.data/diary_session_congress_deputies_pdfs'
dss_path = './.data/diary_session_senate_pdfs'

def diary_session_congress_deputies_url(legislature, plenary):
    """Return the congreso.es URL of a Congress session diary PDF.

    Both arguments are interpolated into the URL exactly as given (no
    zero-padding): ``legislature`` appears in the folder and in the file
    name, ``plenary`` only in the file name.
    """
    url = f"https://www.congreso.es/public_oficiales/L{legislature}/CONG/DS/PL/DSCD-{legislature}-PL-{plenary}.PDF"
    return url

def diary_session_senate_url(legislature, plenary):
    """Return the senado.es URL of a Senate session diary PDF.

    Both arguments are interpolated into the URL exactly as given (no
    zero-padding): ``legislature`` appears in the folder and in the file
    name, ``plenary`` only in the file name.
    """
    url = f"https://www.senado.es/legis{legislature}/publicaciones/pdf/senado/ds/DS_C_{legislature}_{plenary}.PDF"
    return url

### 1.1.- Diaris de sessions del Congrés dels Diputats

In [91]:
# Make sure the destination folder for the Congress PDFs is present.
makedirs(dscd_path, exist_ok=True)

for current_legislature in legislatures:
    # Diary numbers are probed upwards from 1. The loop gives up on a
    # legislature once `max_errors` downloads in a row have failed.
    current_plenary = 1
    consecutive_errors = 0
    while consecutive_errors < max_errors:
        file = f"dscd-{current_legislature}-{current_plenary:03d}"
        url = diary_session_congress_deputies_url(current_legislature, current_plenary)
        downloaded = download(url, dscd_path, file)
        # A truthy result resets the failure streak; a falsy one extends it.
        consecutive_errors = 0 if downloaded else consecutive_errors + 1
        current_plenary += 1

### 1.2.- Diaris de sessions del senat

In [92]:
# Make sure the destination folder for the Senate PDFs is present.
makedirs(dss_path, exist_ok=True)

for current_legislature in legislatures:
    # Diary numbers are probed upwards from 1. The loop gives up on a
    # legislature once `max_errors` downloads in a row have failed.
    current_plenary = 1
    consecutive_errors = 0
    while consecutive_errors < max_errors:
        file = f"dss-{current_legislature}-{current_plenary:03d}"
        url = diary_session_senate_url(current_legislature, current_plenary)
        downloaded = download(url, dss_path, file)
        # A truthy result resets the failure streak; a falsy one extends it.
        consecutive_errors = 0 if downloaded else consecutive_errors + 1
        current_plenary += 1

## 2.- Convertir pàgines de PDF a imatges

In [93]:
# Crop box edges handed to page.crop() as (left, top, right, bottom).
# The first page of every document is cropped with a larger top offset than
# the remaining pages (presumably to skip the title header -- confirm).
left_margin = 175
top_header_margin = 790
top_regular_margin = 296
right_margin = 1480
bottom_margin = 2160

# Every downloaded diary, Congress first and Senate afterwards.
dscd_files = [f'{dscd_path}/{file}' for file in listdir(dscd_path)]
dss_files = [f'{dss_path}/{file}' for file in listdir(dss_path)]
paths = dscd_files + dss_files

for path in paths:
    # One image folder per document, named after the file minus its last
    # four characters.
    folder = f'./.data/images/{basename(path)[:-4]}'
    if isdir(folder):
        # An existing folder means this document is skipped.
        continue
    makedirs(folder)

    for page_index, page in enumerate(convert_from_path(path)):
        top_margin = top_regular_margin if page_index else top_header_margin
        crop_box = (left_margin, top_margin, right_margin, bottom_margin)
        cropped_page = page.crop(crop_box)
        cropped_page.save(f'{folder}/{basename(path)[:-4]}-{page_index:03d}.jpeg', 'JPEG')

## 3.- Convertir imatges a text

In [None]:
# Build the `tfg-ocr` image from ./tesseractOcr, then run it with the local
# images and texts folders mounted into the container (presumably input and
# output of the OCR step -- confirm against tesseractOcr/main.py).
cmd_docker_build = 'docker build -t tfg-ocr $(pwd)/tesseractOcr'
cmd_docker_run = 'docker run -v $(pwd)/.data/images:/home/images -v $(pwd)/.data/texts:/home/texts tfg-ocr'

# shell=True is needed because both commands rely on `$(pwd)` being expanded
# by the shell; the command strings are constants, not user input.
# check=True raises subprocess.CalledProcessError on a non-zero exit status,
# so a failed build no longer goes on to run a missing or stale image and a
# failed OCR run is not silently ignored.
subprocess.run(cmd_docker_build, shell=True, check=True)
subprocess.run(cmd_docker_run, shell=True, check=True)

Sending build context to Docker daemon   5.12kB
Step 1/9 : FROM python:3.10.2-alpine
 ---> 69fba17b9bae
Step 2/9 : WORKDIR /usr/src/app
 ---> Using cache
 ---> 8e736f63bd8b
Step 3/9 : RUN apk update
 ---> Using cache
 ---> d08a53d80c74
Step 4/9 : RUN /usr/local/bin/python -m pip install --upgrade pip
 ---> Using cache
 ---> a4b66a3d1342
Step 5/9 : RUN apk add --update tesseract-ocr tesseract-ocr-data-cat tesseract-ocr-data-spa jpeg-dev zlib-dev libjpeg gcc musl-dev poppler poppler-utils
 ---> Using cache
 ---> c45b13520167
Step 6/9 : COPY requirements.txt ./
 ---> Using cache
 ---> 732a484d649b
Step 7/9 : RUN pip install --no-cache-dir -r requirements.txt
 ---> Using cache
 ---> b3469ec27e4e
Step 8/9 : COPY main.py ./
 ---> f1d983076fed
Step 9/9 : CMD [ "python", "-u", "main.py" ]
 ---> Running in b1dc9c296845
Removing intermediate container b1dc9c296845
 ---> e69863026f8e
Successfully built e69863026f8e
Successfully tagged tfg-ocr:latest
Convert /home/images/dss-12-401/dss-12-401-048

In [None]:
# Concatenate the per-page text files of every document folder into a single
# `full_<folder>.txt` file placed next to the folders.
all_texts_path = './.data/texts'

text_paths = listdir(all_texts_path)
for text_path in text_paths:
    # Plain files at this level (including previously generated full_*.txt
    # files) are not document folders.
    if isfile(f'{all_texts_path}/{text_path}'):
        continue
    full_text_path = f'{all_texts_path}/full_{text_path}.txt'
    # Skip documents whose merged file already exists.
    if isfile(full_text_path):
        continue

    # Sorting the page file names fixes the order in which pages are appended.
    text_pages_paths = sorted(listdir(f'{all_texts_path}/{text_path}'))
    # The encoding is explicit so accented Catalan/Spanish characters do not
    # depend on the platform's default locale encoding.
    # NOTE(review): assumes the OCR step writes UTF-8 -- confirm against
    # tesseractOcr/main.py.
    with open(full_text_path, 'w', encoding='utf-8') as full_text_file:
        for text_page_path in text_pages_paths:
            with open(f'{all_texts_path}/{text_path}/{text_page_path}', encoding='utf-8') as page_text_file:
                full_text_file.write(page_text_file.read())

## 4.- Obtenir diputats

In [None]:
def get_deputies(legislature, *, timeout=30):
    """Fetch the deputies of one legislature from the congreso.es search endpoint.

    Args:
        legislature: Legislature identifier, sent as the
            ``_diputadomodule_idLegislatura`` form field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The response body as text. The caller in this notebook parses it as
        JSON and reads its ``data`` key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url_deputies = "https://www.congreso.es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage"

    # Form fields of the search; only the legislature varies between calls.
    payload = {
        '_diputadomodule_idLegislatura': legislature,
        '_diputadomodule_genero': '0',
        '_diputadomodule_grupo': 'all',
        '_diputadomodule_tipo': '2',
        '_diputadomodule_nombre': '',
        '_diputadomodule_apellidos': '',
        '_diputadomodule_formacion': 'all',
        '_diputadomodule_filtroProvincias': '[]',
        '_diputadomodule_nombreCircunscripcion': '',
    }

    response = requests.post(url_deputies, data=payload, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to the
    # caller's JSON parser.
    response.raise_for_status()
    return response.text

In [None]:
def _merge_deputies(df, new_df):
    """Merge one legislature's deputies (``new_df``) into the accumulated ``df``.

    A deputy is identified by the pair (``apellidosNombre``, ``formacion``).
    Unknown deputies are appended as new rows. For known deputies ``fchBaja``
    is overwritten with the newer value and the legislature id is appended to
    ``idLegislatura`` as a comma-separated string.

    ``df`` may be modified in place; always use the returned frame.
    """
    for index, row in new_df.iterrows():
        same_deputy = (df.apellidosNombre == row['apellidosNombre']) & (df.formacion == row['formacion'])
        if not same_deputy.any():
            # Append a one-row DataFrame. Concatenating the bare Series would
            # add its fields as new index labels under a column `0` instead of
            # appending a row.
            df = pd.concat([df, new_df.loc[[index]]], ignore_index=True)
        else:
            previous_legislatures = df.loc[same_deputy, 'idLegislatura'].iloc[0]
            df.loc[same_deputy, 'fchBaja'] = row['fchBaja']
            # NOTE(review): this stores a string; if the API delivers
            # idLegislatura as a number the column ends up mixed-type -- confirm.
            df.loc[same_deputy, 'idLegislatura'] = f'{previous_legislatures},{row["idLegislatura"]}'
    return df


# Load the deputies table from the local pickle cache when it exists;
# otherwise download every legislature, merge them and write the cache.
pkl_path = './.data/deputies.pkl'
if isfile(pkl_path):
    print('from file')
    # Bind `df` here too so the table has the same name in both branches.
    # `dp` is kept as an alias for any code that still refers to it.
    df = dp = pd.read_pickle(pkl_path)
else:
    print('download')
    df = None
    for current_legislature in legislatures:
        data = get_deputies(current_legislature)
        data_json = json.loads(data)
        new_df = json_normalize(data_json['data'])
        df = new_df if df is None else _merge_deputies(df, new_df)

    df.to_pickle(pkl_path)