In [None]:
from downloadDiaryOfSessions.main import download
from rabbitmqSender.main import RabbitSender
from googleSync.main import sync_folder


import json
from os.path import isdir, isfile
from os import listdir, makedirs, mkdir
import pandas as pd
from pandas import json_normalize
import requests

In [None]:
# Run the Google sync helper imported from googleSync.main before any other step.
# NOTE(review): presumably this synchronises the shared data folder — confirm in googleSync.main.
sync_folder()

## 1.- Descarregar diaris de sessions

In [None]:
# Legislature numbers iterated over when downloading diaries and fetching deputies.
legislatures = [11, 12, 13, 14]
# Number of consecutive failed downloads after which a legislature is considered finished.
max_errors = 50
# Base folder under which the PDF and image folders live (note the trailing slash).
data_folder = './.data/'
# Destination folder for Congress of Deputies session-diary PDFs.
dscd_path = f'{data_folder}diary_session_congress_deputies_pdfs'
# Destination folder for Senate session-diary PDFs.
dss_path = f'{data_folder}diary_session_senate_pdfs'


def diary_session_congress_deputies_url(legislature, plenary):
    """Build the URL of a Congress of Deputies plenary session-diary PDF.

    Args:
        legislature: Legislature number, interpolated into the path and file name.
        plenary: Plenary session number, interpolated into the file name.

    Returns:
        The full URL as a string.
    """
    url = f"https://www.congreso.es/public_oficiales/L{legislature}/CONG/DS/PL/DSCD-{legislature}-PL-{plenary}.PDF"
    return url


def diary_session_senate_url(legislature, plenary):
    """Build the URL of a Senate session-diary PDF.

    Args:
        legislature: Legislature number, interpolated into the path and file name.
        plenary: Session number, interpolated into the file name.

    Returns:
        The full URL as a string.
    """
    url = f"https://www.senado.es/legis{legislature}/publicaciones/pdf/senado/ds/DS_C_{legislature}_{plenary}.PDF"
    return url


def download_diary_session(local_path, prefix, diary_session_url):
    """Download every session diary of the configured legislatures into a folder.

    For each legislature in ``legislatures`` the session number is counted up
    from 1 and each document is fetched with ``download``. A legislature is
    abandoned once ``max_errors`` downloads in a row have failed, since the
    highest session number is not known in advance.

    Args:
        local_path: Destination folder; created if it does not exist.
        prefix: File-name prefix; files are named ``{prefix}-{legislature}-{plenary:03d}``.
        diary_session_url: Callable ``(legislature, plenary) -> url``.
    """
    if not isdir(local_path):
        makedirs(local_path)

    for legislature in legislatures:
        plenary = 0
        failures_in_a_row = 0
        while failures_in_a_row < max_errors:
            plenary += 1
            source_url = diary_session_url(legislature, plenary)
            target_name = f"{prefix}-{legislature}-{plenary:03d}"
            # A truthy result means the document was fetched, which resets the failure streak.
            succeeded = download(source_url, local_path, target_name)
            failures_in_a_row = 0 if succeeded else failures_in_a_row + 1


### 1.1.- Diaris de sessions del Congrés dels Diputats

In [None]:
# Fetch the Congress of Deputies diaries into dscd_path, naming files with the 'dscd' prefix.
download_diary_session(dscd_path, 'dscd', diary_session_congress_deputies_url)

### 1.2.- Diaris de sessions del senat

In [None]:
# Fetch the Senate diaries into dss_path, naming files with the 'dss' prefix.
download_diary_session(dss_path, 'dss', diary_session_senate_url)

## 2.- Convertir pàgines de PDF a imatges

`docker-compose -f docker-compose-pdf-to-img.yml -p tfg-pdf-to-img up -d`

In [None]:
# Publish one message per downloaded PDF to the 'pdf-to-img' queue.
# NOTE(review): broker host and credentials are hard-coded; consider reading them
# from the environment before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'pdf-to-img')
try:
    for pdf_folder in (dscd_path, dss_path):
        for file in listdir(pdf_folder):
            # Strip the leading data_folder so the message is a path relative to it.
            sender.send(f'{pdf_folder}/{file}'[len(data_folder):])
finally:
    # Always release the connection, even if listing or sending fails midway.
    sender.close()

[RabbitMQ queues management](http://localhost:15672/#/queues)

## 3.- Convertir imatges a text

`docker-compose -f docker-compose-img-to-txt.yml -p tfg-img-to-txt up -d`

In [None]:
# Publish one message per image file name to the 'img-to-txt' queue.
# NOTE(review): broker host and credentials are hard-coded; consider reading them
# from the environment before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'img-to-txt')
try:
    for file in listdir(f'{data_folder}images'):
        sender.send(file)
finally:
    # Always release the connection, even if listing or sending fails midway.
    sender.close()

## 4.- Obtenir diputats

In [None]:
def get_deputies(legislature):
    """Query the congreso.es deputy search endpoint for one legislature.

    Args:
        legislature: Legislature identifier sent as ``_diputadomodule_idLegislatura``.

    Returns:
        The raw response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    url_deputies = "https://www.congreso.es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage"

    payload = {'_diputadomodule_idLegislatura': legislature,
               '_diputadomodule_genero': '0',
               '_diputadomodule_grupo': 'all',
               '_diputadomodule_tipo': '2',
               '_diputadomodule_nombre': '',
               '_diputadomodule_apellidos': '',
               '_diputadomodule_formacion': 'all',
               '_diputadomodule_filtroProvincias': '[]',
               '_diputadomodule_nombreCircunscripcion': ''}

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(url_deputies, data=payload, timeout=30)
    # Fail here on HTTP errors instead of handing an error page to the caller.
    response.raise_for_status()
    return response.text


# Build (or load from cache) one DataFrame with a row per deputy/formation pair,
# merged across all configured legislatures.
pkl_path = './.data/deputies.pkl'
if isfile(pkl_path):
    print('from file')
    # NOTE: unpickling can execute arbitrary code; only load a cache this notebook wrote.
    df_deputies = pd.read_pickle(pkl_path)
else:
    print('download')
    df_deputies = None
    for current_legislature in legislatures:
        data = get_deputies(current_legislature)
        data_json = json.loads(data)
        new_df = json_normalize(data_json['data'])
        if df_deputies is None:
            # The first legislature seeds the frame as-is.
            df_deputies = new_df
            continue

        for index, row in new_df.iterrows():
            # A deputy is identified by full name plus political formation.
            same_deputy = (df_deputies.apellidosNombre == row['apellidosNombre']) & (df_deputies.formacion == row['formacion'])
            if not same_deputy.any():
                # Append a one-row frame. Concatenating the bare Series would add its
                # fields as index labels under a new column instead of adding one row.
                df_deputies = pd.concat([df_deputies, new_df.loc[[index]]], ignore_index=True)
            else:
                # Known deputy: keep the latest leaving date and accumulate the
                # legislatures as a comma-separated string.
                previous_legislatures = df_deputies.loc[same_deputy, 'idLegislatura'].iloc[0]
                df_deputies.loc[same_deputy, 'fchBaja'] = row['fchBaja']
                df_deputies.loc[same_deputy, 'idLegislatura'] = f'{previous_legislatures},{row["idLegislatura"]}'

    df_deputies['apellidos'] = df_deputies['apellidos'].str.strip()
    df_deputies.to_pickle(pkl_path)

df_deputies

## 5.- Obtenir discursos

In [None]:
from diaryParser.main import  parse_diary

# Split every OCR text file into individual speeches, stored as one JSON file each.
texts_folder_path = './.data/texts'
speeches_folder_path = './.data/speeches'

if not isdir(speeches_folder_path):
    mkdir(speeches_folder_path)

text_paths = listdir(texts_folder_path)
for text_path in text_paths:
    # Drop the 4-character extension; the remaining name is split on '-' and the
    # first three pieces are passed to parse_diary.
    stem = text_path[:-4]
    parts = stem.split('-')

    with open(f'{texts_folder_path}/{text_path}', 'r', encoding="utf-8") as text_file:
        text = text_file.read()

    try:
        for speech in parse_diary(text, parts[0], int(parts[1]), int(parts[2])):
            speech_path = f'{speeches_folder_path}/{stem}-{speech["order"]:03d}.json'
            with open(speech_path, 'w', encoding='utf-8') as speech_file:
                speech_file.write(json.dumps(speech, indent=4, ensure_ascii=False))
    except IndexError:
        # Report the file that could not be parsed and move on to the next one.
        print('error at parse', text_path)

### 5.1.- Incloure informació personal

In [None]:
def find_personal_info(speech_info):
    """Look up the deputy whose surname matches the speaker of a speech.

    The comparison is case-insensitive on both sides, so a surname matches
    regardless of whether it is stored in upper or mixed case.

    Args:
        speech_info: Speech dict; only its ``'surname'`` key is read.

    Returns:
        The first matching row of ``df_deputies`` as a dict, or ``None`` when the
        speech has no surname or no deputy matches.
    """
    if 'surname' not in speech_info:
        return None
    # Upper-case the lookup key too; the frame side is already upper-cased below.
    name = speech_info['surname'].strip().upper()
    found = df_deputies.loc[df_deputies['apellidos'].str.upper() == name]
    if found.empty:
        return None
    return found.iloc[0].to_dict()


def parse_date(date):
    """Convert a ``dd/mm/yyyy`` date string into ``yyyy-mm-dd``.

    Args:
        date: Date string with ``/`` separators, or a missing value.

    Returns:
        The reordered date string, or ``None`` when ``date`` is empty or not a
        string (``None``, or the float NaN pandas uses for missing fields).

    Raises:
        IndexError: If a non-empty string has fewer than three ``/``-separated parts.
    """
    # Missing values can arrive as '', None or NaN; none of them can be split.
    if not isinstance(date, str) or date == '':
        return None
    date_parts = date.split('/')
    return f'{date_parts[2]}-{date_parts[1]}-{date_parts[0]}'


def parse_legislatures(legislatures):
    """Normalise a legislature field into a list of ints.

    Args:
        legislatures: Either a single integer or a comma-separated string of
            legislature numbers such as ``'11,12'``.

    Returns:
        A list of legislature numbers as ``int``.
    """
    if isinstance(legislatures, int):
        return [legislatures]

    # Parse the argument itself rather than a module-level variable. str() also
    # lets integer-like values that are not Python ints go through the same path.
    return [int(legislature) for legislature in str(legislatures).split(',')]


# Enrich every Congress speech file ('dscd' prefix) with the speaker's personal data
# and write the result back to the same file.
speeches_paths = listdir(speeches_folder_path)
for speech_path in speeches_paths:
    if not speech_path.startswith('dscd'):
        continue

    path = f'{speeches_folder_path}/{speech_path}'
    with open(path, 'r', encoding='utf8') as file:
        speech = json.loads(file.read())

    personal_info = find_personal_info(speech)
    if personal_info is None:
        # No matching deputy: the file is left untouched.
        continue

    speech.update({
        'name': personal_info['nombre'],
        'surname': personal_info['apellidos'],
        'group': personal_info['grupo'],
        'formation': personal_info['formacion'],
        'circumscription': personal_info['nombreCircunscripcion'],
        'entry_date': parse_date(personal_info['fchAlta']),
        'leaving_date': parse_date(personal_info['fchBaja']),
        'gender': 'male' if personal_info['genero'] == 1 else 'female',
        'legislatures': parse_legislatures(personal_info['idLegislatura']),
    })

    json_speech = json.dumps(speech, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as speech_file:
        speech_file.write(json_speech)