In [None]:
from downloadDiaryOfSessions.main import download
from rabbitmqSender.main import RabbitSender
from diaryParser.main import  parse_diary

import json
from os.path import isdir, isfile
from bs4 import BeautifulSoup
from os import listdir, makedirs, mkdir
import pandas as pd
from pandas import json_normalize
import requests
from unidecode import unidecode

# Legislature numbers whose session diaries and members are processed.
legislatures = [11, 12, 13, 14]
# Consecutive failed downloads after which the download loop gives up on a legislature.
max_errors = 50

# Root folder for everything the notebook downloads or generates. The trailing slash is
# significant: paths are built by plain concatenation and other cells strip this prefix
# with len(data_folder).
data_folder = './.data/'
dscd_path = f'{data_folder}diary_session_congress_deputies_pdfs'
dss_path = f'{data_folder}diary_session_senate_pdfs'
texts_folder_path = f'{data_folder}texts'
speeches_folder_path = f'{data_folder}speeches'

# Senate open-data endpoints.
# NOTE(review): the groups URL is pinned to legis=13 while `legislatures` covers 11-14;
# confirm that a single legislature's groups are enough.
open_data_senators_xml = 'https://www.senado.es/web/ficopendataservlet?tipoFich=10#'
open_data_groups_xml = 'https://www.senado.es/web/ficopendataservlet?tipoFich=4&legis=13#'

# Local copies and caches, derived from data_folder like every other path above.
senators_xml_path = f'{data_folder}opendata_senators.xml'
groups_xml_path = f'{data_folder}opendata_group.xml'
senate_pkl_path = f'{data_folder}senators.pkl'
deputies_pkl_path = f'{data_folder}deputies.pkl'

## 1.- Descarregar diaris de sessions

In [None]:
def diary_session_congress_deputies_url(legislature, plenary):
    """Build the URL of a Congress of Deputies session-diary PDF.

    Args:
        legislature: Legislature number, interpolated as-is into the URL.
        plenary: Plenary session number, interpolated as-is (no zero padding).

    Returns:
        The PDF URL on www.congreso.es as a string.
    """
    folder = f"https://www.congreso.es/public_oficiales/L{legislature}/CONG/DS/PL"
    document = f"DSCD-{legislature}-PL-{plenary}.PDF"
    return f"{folder}/{document}"


def diary_session_senate_url(legislature, plenary):
    """Build the URL of a Senate session-diary PDF.

    Args:
        legislature: Legislature number, interpolated as-is into the URL.
        plenary: Session number, interpolated as-is (no zero padding).

    Returns:
        The PDF URL on www.senado.es as a string.
    """
    folder = f"https://www.senado.es/legis{legislature}/publicaciones/pdf/senado/ds"
    document = f"DS_C_{legislature}_{plenary}.PDF"
    return f"{folder}/{document}"


def download_diary_session(local_path, prefix, diary_session_url):
    """Download session diaries for every configured legislature.

    Session numbers are probed upwards from 1. A legislature is abandoned once
    `max_errors` consecutive downloads have failed; any success resets the count.

    Args:
        local_path: Destination folder; created (with parents) when it does not exist.
        prefix: Prefix of each saved file name, followed by the legislature and the
            zero-padded (3 digits) session number.
        diary_session_url: Callable taking (legislature, plenary) and returning the URL.
    """
    if not isdir(local_path):
        makedirs(local_path)

    for legislature in legislatures:
        plenary = 0
        failed_in_a_row = 0
        while failed_in_a_row < max_errors:
            plenary += 1
            source_url = diary_session_url(legislature, plenary)
            target_name = f"{prefix}-{legislature}-{plenary:03d}"
            fetched = download(source_url, local_path, target_name)
            # A truthy result counts as success and resets the failure streak.
            failed_in_a_row = 0 if fetched else failed_in_a_row + 1


### 1.1.- Diaris de sessions del Congrés dels Diputats

In [None]:
# Fetch the Congress of Deputies diaries into dscd_path; files are named 'dscd-<legislature>-<plenary>'.
download_diary_session(dscd_path, 'dscd', diary_session_congress_deputies_url)

### 1.2.- Diaris de sessions del senat

In [None]:
# Fetch the Senate diaries into dss_path; files are named 'dss-<legislature>-<plenary>'.
download_diary_session(dss_path, 'dss', diary_session_senate_url)

## 2.- Convertir pàgines de PDF a imatges

`docker-compose -f docker-compose-pdf-to-img.yml -p tfg-pdf-to-img up -d`

In [None]:
# NOTE(review): broker host, port and credentials are hard-coded; consider reading them
# from the environment before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'pdf-to-img')

try:
    # Queue every downloaded PDF from both chambers. Each message is the file path with
    # the data_folder prefix stripped.
    for pdf_folder in (dscd_path, dss_path):
        for file in listdir(pdf_folder):
            sender.send(f'{pdf_folder}/{file}'[len(data_folder):])
finally:
    # Always close the sender, even when listing a folder or sending a message fails.
    sender.close()

[RabbitMQ queues management](http://localhost:15672/#/queues)

## 3.- Convertir imatges a text

`docker-compose -f docker-compose-img-to-txt.yml -p tfg-img-to-txt up -d`

In [None]:
# NOTE(review): broker host, port and credentials are hard-coded; consider reading them
# from the environment before sharing this notebook.
sender = RabbitSender('localhost', 5672, 'myuser', 'mypassword', 'img-to-txt')

try:
    # Queue every entry of the images folder; each message is the bare entry name.
    for file in listdir(f'{data_folder}images'):
        sender.send(file)
finally:
    # Always close the sender, even when listing the folder or sending a message fails.
    sender.close()

## 4.- Obtenir diputats

In [None]:
def get_deputies(legislature, timeout=30):
    """Query the congreso.es deputy search for one legislature.

    Args:
        legislature: Legislature identifier sent as `_diputadomodule_idLegislatura`.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The raw response body as text (the caller parses it as JSON).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    url_deputies = "https://www.congreso.es/busqueda-de-diputados?p_p_id=diputadomodule&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=searchDiputados&p_p_cacheability=cacheLevelPage"

    # Search-form fields; every filter other than the legislature is left at the values
    # below ('all', empty string or empty list).
    payload = {
        '_diputadomodule_idLegislatura': legislature,
        '_diputadomodule_genero': '0',
        '_diputadomodule_grupo': 'all',
        '_diputadomodule_tipo': '2',
        '_diputadomodule_nombre': '',
        '_diputadomodule_apellidos': '',
        '_diputadomodule_formacion': 'all',
        '_diputadomodule_filtroProvincias': '[]',
        '_diputadomodule_nombreCircunscripcion': '',
    }

    response = requests.post(url_deputies, data=payload, timeout=timeout)
    # Fail here with a clear HTTP error instead of handing an error page to the JSON parser.
    response.raise_for_status()
    return response.text

In [None]:
# Accumulate the deputies of every legislature into one frame. From the second
# legislature on, a deputy already present (same 'apellidosNombre' and 'formacion') is
# not duplicated: the stored 'fchBaja' is overwritten and the legislature id is appended
# to 'idLegislatura' as a comma-separated string.
df_deputies = None
for current_legislature in legislatures:
    data = get_deputies(current_legislature)
    data_json = json.loads(data)
    new_df = json_normalize(data_json['data'])

    if df_deputies is None:
        # The first legislature seeds the frame as-is.
        df_deputies = new_df
        continue

    for _, row in new_df.iterrows():
        same_deputy = (
            (df_deputies['apellidosNombre'] == row['apellidosNombre'])
            & (df_deputies['formacion'] == row['formacion'])
        )
        if not same_deputy.any():
            # `row` is a Series: it must be turned into a one-row frame first, otherwise
            # concat appends it as a column of field-labelled rows instead of a record.
            df_deputies = pd.concat([df_deputies, row.to_frame().T], ignore_index=True)
        else:
            previous_legislatures = df_deputies.loc[same_deputy, 'idLegislatura'].iloc[0]
            df_deputies.loc[same_deputy, 'fchBaja'] = row['fchBaja']
            df_deputies.loc[same_deputy, 'idLegislatura'] = f'{previous_legislatures},{row["idLegislatura"]}'

df_deputies['apellidos'] = df_deputies['apellidos'].str.strip()
df_deputies.to_pickle(deputies_pkl_path)
df_deputies

## 5.- Obtenir senadors

In [None]:
# Download the senators open-data XML and keep a local copy.
senators_response = requests.get(open_data_senators_xml, timeout=30)
# Stop on an HTTP error instead of saving an error page as if it were the data set.
senators_response.raise_for_status()
with open(senators_xml_path, 'w', encoding='utf-8') as senators_file:
    senators_file.write(senators_response.text)

# Download the parliamentary groups XML and extract code, full name and acronym from
# every <datosCabecera> element.
groups_response = requests.get(open_data_groups_xml, timeout=30)
groups_response.raise_for_status()
groups_data = BeautifulSoup(groups_response.text, 'xml')
data_headers = groups_data.find_all('datosCabecera')

groups = [
    {
        'code': header.find('codigo').text,
        'fullname': header.find('nombre').text,
        'acronym': header.find('siglas').text,
    }
    for header in data_headers
]

with open(groups_xml_path, 'w', encoding='utf-8') as groups_file:
    groups_file.write(groups_response.text)

# Keep only the senators of the configured legislatures.
df_senators = pd.read_xml(senators_xml_path)
df_senators = df_senators[df_senators['legislatura'].isin(legislatures)]

df_groups = pd.DataFrame(groups)

# Left join so senators without a matching group are kept.
# NOTE(review): the senators' 'grupoSiglas' is matched against the group 'code', not
# its 'acronym' — confirm this is the intended key.
df_senate = pd.merge(df_senators, df_groups, how='left', left_on=['grupoSiglas'], right_on=['code'])
df_senate.to_pickle(senate_pkl_path)
df_senate

## 6.- Obtenir discursos

In [None]:
# Make sure the output folder exists before any speech is written.
if not isdir(speeches_folder_path):
    mkdir(speeches_folder_path)

# Split every diary text into speeches and store each one as its own JSON file named
# '<text file name without extension>-<speech order, 3 digits>.json'.
text_paths = listdir(texts_folder_path)
for text_path in text_paths:
    path = f'{texts_folder_path}/{text_path}'
    # Drops the last 4 characters of the file name (presumably a '.txt' extension).
    diary_name = text_path[:-4]
    parts = diary_name.split('-')

    with open(path, 'r', encoding="utf-8") as file:
        text = file.read()

    try:
        # The first three dash-separated name parts are passed on to the parser, the
        # second and third as integers.
        speeches = parse_diary(text, parts[0], int(parts[1]), int(parts[2]))
        for speech in speeches:
            speech_path = f'{speeches_folder_path}/{diary_name}-{speech["order"]:03d}.json'
            json_speech = json.dumps(speech, indent=4, ensure_ascii=False)
            with open(speech_path, 'w', encoding='utf-8') as speech_file:
                speech_file.write(json_speech)
    except IndexError:
        # Best effort: report the diary that could not be parsed and move on.
        print('error at parse', text_path)

### 6.1.- Incloure informació personal

In [None]:
# Reload the member tables saved by the previous sections (pickles written by this
# notebook itself; do not point these paths at untrusted files).
df_senators = pd.read_pickle(senate_pkl_path)
df_deputies = pd.read_pickle(deputies_pkl_path)

# Accent-insensitive copy of the surnames. It is built with the same `unidecode`
# transliteration that the surname lookup applies to its search key, so both sides fold
# characters identically (NFKD + ASCII-ignore drops characters such as a typographic
# apostrophe that `unidecode` maps to an ASCII equivalent). Missing surnames stay missing.
for members in (df_senators, df_deputies):
    members['apellidos_unidecode'] = members['apellidos'].map(unidecode, na_action='ignore')


# Titles that can appear in place of a surname; when one of them is found and the speech
# has an 'explanatory' field, that field is used as the surname instead.
_SURNAMES_WITH_EXPLANATORY = frozenset({
    'SECRETARIO',
    'SECRETARIA',
    'VICEPRESIDENTA',
    'VICEPRESIDENTE',
    'PRESIDENTE DEL GOBIERNO EN FUNCIONES',
    'PRESIDENTA DEL GOBIERNO EN FUNCIONES',
    'DEFENSORA DEL PUEBLO',
    'DEFENSOR DEL PUEBLO',
    'MINISTRO DEL INTERIOR EN FUNCIONES',
    'MINISTRA DEL INTERIOR EN FUNCIONES',
})


def find_personal_info(speech_info, df):
    """Find the member record that matches the author of a speech.

    Args:
        speech_info: Speech dict. Reads 'surname' and, depending on its value,
            'presidency' (full name of the presiding member) or 'explanatory'.
        df: Member table with 'nombre', 'apellidos' and 'apellidos_unidecode' columns.

    Returns:
        The first matching row as a dict, or None when the speech has no surname, a
        presiding speaker has no 'presidency' value, or nobody matches.
    """
    if 'surname' not in speech_info:
        return None
    surname = speech_info['surname'].strip()

    if surname in ('PRESIDENTE', 'PRESIDENTA'):
        # The presiding member is identified by full name, not by surname. A speech
        # without that information cannot be resolved, so report "not found" instead
        # of raising KeyError.
        presidency = speech_info.get('presidency')
        if not presidency:
            return None
        fullname = presidency.strip().upper()
        member_fullnames = df['nombre'].str.upper() + ' ' + df['apellidos'].str.upper()
        found_president = df.loc[member_fullnames == fullname]
        if found_president.empty:
            return None
        return found_president.iloc[0].to_dict()

    if surname in _SURNAMES_WITH_EXPLANATORY and 'explanatory' in speech_info:
        surname = speech_info['explanatory'].strip()

    # Exact (case-insensitive) surname match first ...
    found = df.loc[df['apellidos'].str.upper() == surname.upper()]
    if found.empty:
        # ... then an accent-insensitive retry against the folded surname column.
        surname_unidecode = unidecode(surname.upper())
        found = df.loc[df['apellidos_unidecode'].str.upper() == surname_unidecode]
        if found.empty:
            return None
    return found.iloc[0].to_dict()


def parse_dscd(speech_file_path):
    """Enrich a Congress speech file with its author's personal data.

    The speech JSON is read from `speeches_folder_path`, its author is looked up in
    `df_deputies` and, when found, the file is rewritten in place with the 'name',
    'surname', 'group', 'acronym' and 'gender' fields set. When nobody matches, a
    message is printed and the file is left untouched.

    Args:
        speech_file_path: File name of the speech inside `speeches_folder_path`.
    """
    path = f'{speeches_folder_path}/{speech_file_path}'
    # Finish reading (and close the file) before it is reopened for writing below.
    with open(path, 'r', encoding='utf8') as file:
        speech = json.load(file)

    personal_info = find_personal_info(speech, df_deputies)
    if personal_info is None:
        # The lookup also returns None for speeches without a 'surname' key, so that
        # key must not be indexed directly here.
        print(path, speech.get('surname'), 'Author not found')
        return

    speech['name'] = personal_info['nombre']
    speech['surname'] = personal_info['apellidos']
    speech['group'] = personal_info['grupo']
    speech['acronym'] = personal_info['formacion']
    # NOTE(review): every 'genero' value other than 1 is stored as 'female' — confirm
    # the source encoding.
    speech['gender'] = 'male' if personal_info['genero'] == 1 else 'female'

    json_speech = json.dumps(speech, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as speech_file:
        speech_file.write(json_speech)


def parse_dss(speech_file_path):
    """Enrich a Senate speech file with its author's personal data.

    The speech JSON is read from `speeches_folder_path`, its author is looked up in
    `df_senators` and, when found, the file is rewritten in place with the 'name',
    'surname', 'group' and 'acronym' fields set. When nobody matches, a message is
    printed and the file is left untouched.

    Args:
        speech_file_path: File name of the speech inside `speeches_folder_path`.
    """
    path = f'{speeches_folder_path}/{speech_file_path}'
    # Finish reading (and close the file) before it is reopened for writing below.
    with open(path, 'r', encoding='utf8') as file:
        speech = json.load(file)

    personal_info = find_personal_info(speech, df_senators)
    if personal_info is None:
        # The lookup also returns None for speeches without a 'surname' key, so that
        # key must not be indexed directly here.
        print(path, speech.get('surname'), 'Author not found')
        return

    speech['name'] = personal_info['nombre']
    speech['surname'] = personal_info['apellidos']
    speech['group'] = personal_info['fullname']
    speech['acronym'] = personal_info['acronym']

    json_speech = json.dumps(speech, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as speech_file:
        speech_file.write(json_speech)


# Enrich every stored speech: files whose name starts with 'dss' are handled as Senate
# speeches, all the others as Congress speeches.
speeches_paths = listdir(speeches_folder_path)
for speech_path in speeches_paths:
    enrich_speech = parse_dss if speech_path.startswith('dss') else parse_dscd
    enrich_speech(speech_path)