In [None]:
import re

from downloadDiaryOfSessions.main import download
from rabbitmqSender.main import RabbitSender
from diaryParser.main import  parse_diary

from json import loads as json_loads, dumps as json_dumps
from os.path import isdir
from bs4 import BeautifulSoup
from os import listdir, makedirs, mkdir
import pandas as pd
from pandas import json_normalize
import requests
from unidecode import unidecode

# Legislatures processed by every stage of this notebook.
legislatures = [11, 12, 13, 14]
# Number of consecutive failed downloads after which the download loop
# stops probing plenary numbers for a legislature.
max_errors = 50
# Root folder for every downloaded or generated artefact.
data_folder = './.data/'
# Folders holding the downloaded session-diary PDFs of each chamber.
dscd_path = f'{data_folder}diary_session_congress_deputies_pdfs'
dss_path = f'{data_folder}diary_session_senate_pdfs'
# Diary texts read by the speech-parsing step, and the speech JSON folders
# written by the parsing and enrichment steps.
texts_folder_path = f'{data_folder}texts'
speeches_folder_path = f'{data_folder}speeches'
speeches_to_index_folder_path = f'{data_folder}speeches_to_index'
# Senate open-data endpoints.
# NOTE(review): the groups URL is pinned to legis=13 while `legislatures`
# spans 11-14 — confirm that a single legislature's groups is intended.
open_data_senators_xml = 'https://www.senado.es/web/ficopendataservlet?tipoFich=10#'
open_data_groups_xml = 'https://www.senado.es/web/ficopendataservlet?tipoFich=4&legis=13#'
# Local copies of the Senate XML feeds.
senators_xml_path = f'{data_folder}opendata_senators.xml'
groups_xml_path = f'{data_folder}opendata_group.xml'
# CSV tables of people: one per chamber plus the merged table.
people_senate_csv_path = f'{data_folder}people_senate.csv'
people_congress_csv_path = f'{data_folder}people_congress.csv'
people_csv_path = f'{data_folder}people.csv'

## 1.- Descarregar diaris de sessions

In [None]:
def diary_session_congress_deputies_url(legislature, plenary):
    """Build the URL of a Congress of Deputies plenary-session diary PDF.

    Args:
        legislature: Legislature number, interpolated as-is into the URL.
        plenary: Plenary session number, interpolated as-is (no zero padding).

    Returns:
        The URL as a string.
    """
    folder = f"https://www.congreso.es/public_oficiales/L{legislature}/CONG/DS/PL"
    document = f"DSCD-{legislature}-PL-{plenary}.PDF"
    return f"{folder}/{document}"


def diary_session_senate_url(legislature, plenary):
    """Build the URL of a Senate session diary PDF.

    Args:
        legislature: Legislature number, interpolated as-is into the URL.
        plenary: Session number, interpolated as-is (no zero padding).

    Returns:
        The URL as a string.
    """
    folder = f"https://www.senado.es/legis{legislature}/publicaciones/pdf/senado/ds"
    document = f"DS_C_{legislature}_{plenary}.PDF"
    return f"{folder}/{document}"


def download_diary_session(local_path, prefix, diary_session_url):
    """Download the session diaries of every configured legislature.

    For each entry of the global `legislatures`, plenary numbers are probed
    starting at 1. Probing of a legislature stops once `max_errors`
    downloads in a row have failed; any success resets that counter.

    Args:
        local_path: Destination folder; created (with parents) if missing.
        prefix: File-name prefix; files are named
            "<prefix>-<legislature>-<plenary, zero-padded to 3 digits>".
        diary_session_url: Callable `(legislature, plenary) -> url`.
    """
    if not isdir(local_path):
        makedirs(local_path)

    for legislature in legislatures:
        failures_in_a_row = 0
        plenary = 1
        while failures_in_a_row < max_errors:
            source_url = diary_session_url(legislature, plenary)
            target_name = f"{prefix}-{legislature}-{plenary:03d}"
            # A truthy result from `download` is treated as success.
            succeeded = download(source_url, local_path, target_name)
            failures_in_a_row = 0 if succeeded else failures_in_a_row + 1
            plenary += 1


### 1.1.- Diaris de sessions del Congrés dels Diputats

In [None]:
# Fetch the Congress of Deputies diaries into dscd_path; files get the 'dscd' prefix.
download_diary_session(dscd_path, 'dscd', diary_session_congress_deputies_url)

### 1.2.- Diaris de sessions del senat

In [None]:
# Fetch the Senate diaries into dss_path; files get the 'dss' prefix.
download_diary_session(dss_path, 'dss', diary_session_senate_url)

## 2.- Convertir pàgines de PDF a imatges

`docker-compose -f docker-compose-pdf-to-img.yml -p tfg-pdf-to-img up -d`

In [None]:
# Publish one message per downloaded PDF on the 'pdf-to-img' queue.
# NOTE(review): broker host, port and credentials are hard-coded; presumably
# they match the local docker-compose setup — move them to configuration
# before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'pdf-to-img')

try:
    # Each message is the PDF path relative to data_folder, obtained by
    # slicing the data_folder prefix off the full path.
    for pdf_folder in (dscd_path, dss_path):
        for file in listdir(pdf_folder):
            sender.send(f'{pdf_folder}/{file}'[len(data_folder):])
finally:
    # Close the sender even if listing a folder or sending a message fails.
    sender.close()

[RabbitMQ queues management](http://localhost:15672/#/queues)

## 3.- Convertir imatges a text

`docker-compose -f docker-compose-img-to-txt.yml -p tfg-img-to-txt up -d`

In [None]:
# Publish one message per entry of the images folder on the 'img-to-txt' queue.
# NOTE(review): broker host, port and credentials are hard-coded; presumably
# they match the local docker-compose setup — move them to configuration
# before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'img-to-txt')

try:
    # The message is the bare entry name (not a path) inside `<data_folder>images`.
    for file in listdir(f'{data_folder}images'):
        sender.send(file)
finally:
    # Close the sender even if listing the folder or sending a message fails.
    sender.close()

## 4.- Obtenir diputats

In [None]:
def get_deputies(legislature, timeout=30):
    """Query the congreso.es deputy search endpoint for one legislature.

    Args:
        legislature: Value sent as the `_diputadomodule_idLegislatura` form field.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The raw response body as text (the caller parses it as JSON).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status,
            instead of handing an error page to the JSON parser.
        requests.Timeout: If no response arrives within `timeout` seconds.
    """
    url_deputies = "https://www.congreso.es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage"

    # Search form fields; apart from the legislature they are fixed values.
    # NOTE(review): presumably these mean "no filtering" — confirm against the site's form.
    payload = {
        '_diputadomodule_idLegislatura': legislature,
        '_diputadomodule_genero': '0',
        '_diputadomodule_grupo': 'all',
        '_diputadomodule_tipo': '2',
        '_diputadomodule_nombre': '',
        '_diputadomodule_apellidos': '',
        '_diputadomodule_formacion': 'all',
        '_diputadomodule_filtroProvincias': '[]',
        '_diputadomodule_nombreCircunscripcion': '',
    }

    response = requests.post(url_deputies, data=payload, timeout=timeout)
    response.raise_for_status()
    return response.text

# Collect the deputies of every configured legislature into one table.
deputy_frames = []
for current_legislature in legislatures:
    # Query the legislature being iterated (previously every iteration
    # fetched legislature 11, so the others were never downloaded).
    deputies_json = get_deputies(current_legislature)
    data = json_loads(deputies_json)
    deputy_frames.append(json_normalize(data['data']))

# Concatenate once instead of growing the frame inside the loop.
df_deputies = pd.concat(deputy_frames)

# A deputy can appear in several legislatures; keep one row per
# person/group/party. 'apellidos' is part of the key so that different
# people sharing first name, gender and party are not collapsed into one row.
df_deputies = df_deputies.sort_values('apellidos').drop_duplicates(
    subset=['nombre', 'apellidos', 'genero', 'grupo', 'formacion'])
df_deputies.to_csv(people_congress_csv_path, index=False)
df_deputies

## 5.- Obtenir senadors

In [None]:
# Download the Senate open-data feeds, keep local copies of the XML and
# build the senators table enriched with parliamentary-group data.
request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

senators_response = requests.get(open_data_senators_xml, timeout=request_timeout)
# Fail here on HTTP errors rather than saving an error page as if it were XML.
senators_response.raise_for_status()
with open(senators_xml_path, 'w', encoding='utf-8') as senators_file:
    senators_file.write(senators_response.text)

groups_response = requests.get(open_data_groups_xml, timeout=request_timeout)
groups_response.raise_for_status()
groups_data = BeautifulSoup(groups_response.text, 'xml')
data_headers = groups_data.find_all('datosCabecera')

# One record per <datosCabecera> element: group code, full name and acronym.
groups = [
    {
        'code': header.find('codigo').text,
        'fullname': header.find('nombre').text,
        'acronym': header.find('siglas').text,
    }
    for header in data_headers
]

with open(groups_xml_path, 'w', encoding='utf-8') as groups_file:
    groups_file.write(groups_response.text)

# Keep only senators of the legislatures handled by this notebook.
df_senators = pd.read_xml(senators_xml_path)
df_senators = df_senators[df_senators['legislatura'].isin(legislatures)]

df_groups = pd.DataFrame(groups)

# Left join: senators without a matching group row are kept with empty group columns.
# NOTE(review): the join matches the senators' 'grupoSiglas' against the group
# 'code' (not 'siglas') — confirm that this is the intended key.
df_senate = pd.merge(df_senators, df_groups, how='left', left_on=['grupoSiglas'], right_on=['code'])
df_senate.to_csv(people_senate_csv_path, index=False)
df_senate

## 6.- Obtenir discursos

In [None]:
# Split every diary text into speeches and store each speech as its own JSON file.
if not isdir(speeches_folder_path):
    mkdir(speeches_folder_path)

text_paths = listdir(texts_folder_path)
for text_path in text_paths:
    # File names look like "<chamber>-<legislature>-<session>" followed by a
    # 4-character extension, which is dropped here.
    diary_id = text_path[:-4]
    parts = diary_id.split('-')
    path = f'{texts_folder_path}/{text_path}'

    with open(path, 'r', encoding="utf-8") as file:
        text = file.read()

    try:
        speeches = parse_diary(text, parts[0], int(parts[1]), int(parts[2]))
        for speech in speeches:
            # Output name: diary id plus the speech order, zero-padded to 3 digits.
            speech_path = f'{speeches_folder_path}/{diary_id}-{speech["order"]:03d}.json'
            json_speech = json_dumps(speech, indent=4, ensure_ascii=False)
            with open(speech_path, 'w', encoding='utf-8') as speech_file:
                speech_file.write(json_speech)
    except IndexError:
        # Best effort: report the diary that could not be processed and move on.
        print('error at parse', text_path)

### 6.1.- Incloure informació personal

In [None]:
#https://www.lamoncloa.gob.es/gobierno/gobiernosporlegislaturas/Paginas/xiv_legislatura.aspx

def _ascii_upper_key(column):
    """Return `column` as a matching key: hyphens turned into spaces,
    upper-cased, NFKD-decomposed and stripped of every non-ASCII character."""
    spaced = column.str.replace("-", " ")
    decomposed = spaced.str.upper().str.normalize('NFKD')
    return decomposed.str.encode('ascii', errors='ignore').str.decode('utf-8')


people_columns = ['name', 'unidecode_name', 'surname', 'unidecode_surname', 'group', 'acronym']

df_senators = pd.read_csv(people_senate_csv_path)
df_deputies = pd.read_csv(people_congress_csv_path)

# Deputies: matching keys plus upper-cased display columns.
df_deputies['unidecode_name'] = _ascii_upper_key(df_deputies['nombre'])
df_deputies['unidecode_surname'] = _ascii_upper_key(df_deputies['apellidos'])
df_deputies['name'] = df_deputies['nombre'].str.upper()
df_deputies['surname'] = df_deputies['apellidos'].str.upper()
df_deputies['group'] = df_deputies['grupo'].str.upper()
df_deputies['acronym'] = df_deputies['formacion'].str.upper()

# Senators: same layout; the group comes from the merged 'fullname' column and
# the existing 'acronym' column is upper-cased in place.
df_senators['unidecode_name'] = _ascii_upper_key(df_senators['nombre'])
df_senators['unidecode_surname'] = _ascii_upper_key(df_senators['apellidos'])
df_senators['name'] = df_senators['nombre'].str.upper()
df_senators['surname'] = df_senators['apellidos'].str.upper()
df_senators['group'] = df_senators['fullname'].str.upper()
df_senators['acronym'] = df_senators['acronym'].str.upper()

# Deputies first, then senators, with a fresh 0..n-1 index.
df_people = pd.concat(
    [df_deputies[people_columns], df_senators[people_columns]],
    ignore_index=True)

df_people.to_csv(people_csv_path, index=False)

df_people

In [None]:
# Make sure the output folder for the enriched speeches exists.
makedirs(speeches_to_index_folder_path, exist_ok=True)

# Load the merged people table used by find_personal_info below.
# read_csv requires the file path; calling it with no argument raises TypeError.
df_people = pd.read_csv(people_csv_path)

def find_personal_info(speech_info):
    """Find the row of the global `df_people` table that matches a speech's author.

    Matching is done on normalised text (stripped, upper-cased, transliterated
    with `unidecode`, hyphens replaced by spaces):

    * If the speaker is labelled PRESIDENTE/PRESIDENTA, the `presidency`
      field is matched against "<unidecode_name> <unidecode_surname>".
    * If the speaker is labelled with one of the known role titles and an
      `explanatory` value is available, that value is used as the surname.
    * Otherwise the surname is matched against `unidecode_surname`, first
      as-is and then with leading "DE LOS"/"DE LAS"/"DE LA"/"DEL"/"DE"
      particles removed.

    Args:
        speech_info: Mapping describing one speech; the keys read are
            'surname', 'presidency' and 'explanatory'.

    Returns:
        The first matching `df_people` row as a dict, or None when the
        required field is missing/None or nobody matches.
    """
    def field(key):
        # A missing key and an explicit None are treated the same way.
        return speech_info[key] if key in speech_info else None

    def normalize(value):
        # Same form as the unidecode_* columns: upper case, ASCII, hyphens as spaces.
        return unidecode(value.strip().upper()).replace("-", " ")

    raw_surname = field('surname')
    if raw_surname is None:
        return None
    surname = normalize(raw_surname)

    if surname == 'PRESIDENTE' or surname == 'PRESIDENTA':
        presidency = field('presidency')
        if presidency is None:
            return None

        # Hyphens become spaces here too; the table columns were built that
        # way, so a hyphenated president name could otherwise never match.
        fullname = normalize(presidency)
        people_fullnames = df_people['unidecode_name'] + ' ' + df_people['unidecode_surname']
        found_president = df_people.loc[people_fullnames == fullname]
        if found_president.shape[0] == 0:
            return None
        return found_president.iloc[0].to_dict()

    # Role titles for which the real surname is carried by 'explanatory'.
    surname_with_explanatory = (
        'SECRETARIO',
        'SECRETARIA',
        'VICEPRESIDENTA',
        'VICEPRESIDENTE',
        'PRESIDENTE DEL GOBIERNO EN FUNCIONES',
        'PRESIDENTA DEL GOBIERNO EN FUNCIONES',
        'DEFENSORA DEL PUEBLO',
        'DEFENSOR DEL PUEBLO',
        'MINISTRO DEL INTERIOR EN FUNCIONES',
        'MINISTRA DEL INTERIOR EN FUNCIONES',
    )
    explanatory = field('explanatory')
    # An explicit None no longer crashes on .strip(); the role title is then
    # looked up as a surname, exactly as when the key is absent.
    if surname in surname_with_explanatory and explanatory is not None:
        surname = normalize(explanatory)

    found = df_people.loc[df_people['unidecode_surname'] == surname]
    if found.shape[0] == 0:
        # Retry without leading particles. They are removed one after another
        # (longest first), so stacked particles are all stripped.
        for particle in (r"^DE LOS ", r"^DE LAS ", r"^DE LA ", r"^DEL ", r"^DE "):
            surname = re.sub(particle, "", surname)
        found = df_people.loc[df_people['unidecode_surname'] == surname]
        if found.shape[0] == 0:
            return None
    return found.iloc[0].to_dict()


def parse_speech(speech_file_name):
    """Enrich one speech JSON file with its author's personal information.

    Reads `<speeches_folder_path>/<speech_file_name>`, looks the author up
    with `find_personal_info` and writes the speech, with 'name', 'surname',
    'group' and 'acronym' replaced by the matched person's values, to
    `<speeches_to_index_folder_path>/<speech_file_name>`.

    When no author is found, a message is printed and nothing is written.

    Args:
        speech_file_name: File name (not a path) of the speech JSON.
    """
    source = f'{speeches_folder_path}/{speech_file_name}'
    destination = f'{speeches_to_index_folder_path}/{speech_file_name}'

    with open(source, 'r', encoding='utf-8') as raw_speech_file:
        speech = json_loads(raw_speech_file.read())

    author = find_personal_info(speech)
    if author is None:
        reported_surname = 'NULL'
        if 'surname' in speech:
            reported_surname = speech['surname']
        print(source, reported_surname, 'Author not found')
        return

    # Overwrite the author fields with the values from the people table.
    for key in ('name', 'surname', 'group', 'acronym'):
        speech[key] = author[key]

    with open(destination, 'w', encoding='utf-8') as out_speech_file:
        out_speech_file.write(json_dumps(speech, indent=4, ensure_ascii=False))


# Debugging aid: uncomment to process a single speech file.
#parse_speech('dss-14-404-044.json')

# Enrich every speech file found in the speeches folder; files whose author
# cannot be matched are reported by parse_speech and skipped.
speeches_paths = listdir(speeches_folder_path)
for speech_path in speeches_paths:
    parse_speech(speech_path)