## Notebook with the code required to download all approved CONPES documents

In [None]:
## Importing libraries
from google.colab import drive
import requests as r
import pandas as pd
import numpy as np
import json
import os

In [None]:
## Mounting the Google Drive
drive.mount('/content/drive')

# Absolute path below the mount point used above. The previous relative form
# ('drive/MyDrive/...') only resolved correctly while the working directory
# was '/content'.
outdir = '/content/drive/MyDrive/Team_19-DS4A-Project/CONPES_Dataset/'

# Later cells write a CSV into `outdir` and PDFs into `outdir + 'PDF/'`;
# create both folders up front so those writes do not fail on a fresh Drive.
os.makedirs(outdir + 'PDF', exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## Obtaining cookies from browsing
url_website = 'https://sisconpes.dnp.gov.co/SisCONPESWeb/#documentos_conpes'

# A timeout keeps the cell from hanging forever if the site does not answer.
browsing = r.get(url_website, timeout=60)
# Fail here, with a clear HTTP error, rather than continuing with the cookies
# of an error page.
browsing.raise_for_status()

# Cookies set by the site, as a plain {name: value} dictionary.
cookies = browsing.cookies.get_dict()

In [None]:
## Data required for the download

# Endpoint that is queried for the list of CONPES documents.
url_documents = 'https://sisconpes.dnp.gov.co/SisCONPESWeb/AccesoPublico/BuscarCONPES'

# Query-string fragment with every search field left empty
# (presumably an empty filter returns all documents; confirm against the site).
columns_included = '&titulo=&numero=&fechaAprobacion1=&fechaAprobacion2='

# The browsing cookies as a list of (name, value) tuples.
verification_cookie = list(cookies.items())

In [None]:
## Building the request URL and performing POST request

# The first cookie is sent as the leading query parameter; without any cookie
# the URL cannot be built, so stop with an explicit message instead of a bare
# IndexError.
if not verification_cookie:
  raise RuntimeError('The CONPES website returned no cookies; the request URL cannot be built.')

cookie_name, cookie_value = verification_cookie[0]
url_request = url_documents + '?' + cookie_name + '=' + cookie_value + columns_included

# A timeout avoids an indefinite hang; the status check surfaces HTTP errors
# here rather than as a confusing JSON decoding failure later on.
raw_request = r.post(url_request, timeout=60)
raw_request.raise_for_status()

In [None]:
## Obtaining raw request text

# Decode the JSON body of the response; the document records are stored
# under its 'rows' key.
dict_request = json.loads(raw_request.text)

# One DataFrame row per record.
consolidate_conpes_docs = pd.DataFrame(data=dict_request['rows'])
consolidate_conpes_docs.head(5)

Unnamed: 0,IdRelacion,IdDocumento,IdDocumentoCONPES,IdTipoRelacion,Titulo,Numero,FechaAprobacion,URL,URLPAS,URLOtros,Orden,fechaPlazo
0,0,0,4894,0,Fortalecimiento del Uso y la Institucionalidad...,4083,16/05/2022,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,0,
1,0,0,4901,0,"La Mojana: Territorio Resiliente, Sostenible, ...",4084,16/05/2022,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,0,
2,0,0,4887,0,Declaración de Importancia Estratégica de los ...,4079,18/04/2022,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,0,
3,0,0,4890,0,Concepto Favorable a la Nación para contratar ...,4081,18/04/2022,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,,0,
4,0,0,4891,0,Declaración de Importancia Estratégica del Pro...,4082,18/04/2022,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,0,


In [None]:
## Formatting the final table of CONPES documents ready for download

# Empty / whitespace-only strings become NaN, and the approval date is parsed
# from its day/month/year text form into a datetime column.
consolidate_conpes_docs = (
  consolidate_conpes_docs
  .replace(r'^\s*$', np.nan, regex=True)
  .assign(FechaAprobacion=lambda frame: pd.to_datetime(frame['FechaAprobacion'], format='%d/%m/%Y'))
)

# Keep only the columns that are used from here on.
useful_columns = ['IdDocumentoCONPES', 'Titulo', 'Numero', 'FechaAprobacion', 'URL', 'URLPAS', 'URLOtros']
final_conpes_docs = consolidate_conpes_docs[useful_columns]

# Persist the table next to the downloaded documents.
final_conpes_docs.to_csv(outdir + 'approved_CONPES_documents.csv', header=True, index=False)
final_conpes_docs.head()

Unnamed: 0,IdDocumentoCONPES,Titulo,Numero,FechaAprobacion,URL,URLPAS,URLOtros
0,4894,Fortalecimiento del Uso y la Institucionalidad...,4083,2022-05-16,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,
1,4901,"La Mojana: Territorio Resiliente, Sostenible, ...",4084,2022-05-16,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,
2,4887,Declaración de Importancia Estratégica de los ...,4079,2022-04-18,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,
3,4890,Concepto Favorable a la Nación para contratar ...,4081,2022-04-18,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,,
4,4891,Declaración de Importancia Estratégica del Pro...,4082,2022-04-18,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,https://colaboracion.dnp.gov.co/CDT/Conpes/Eco...,


In [None]:
## Selecting the CONPES documents approved on or after 2017-01-01
# (the PDFs themselves are downloaded in the cells below)
is_recent = final_conpes_docs['FechaAprobacion'] >= '2017-01-01'
sample_docs = final_conpes_docs.loc[is_recent]

In [None]:
## Downloading the PDF documents and saving them

#Creating the function
def make_document_request_and_save_content(row):
  """Download the PDF of one CONPES document and store it below ``outdir``.

  Parameters
  ----------
  row : mapping-like (e.g. one DataFrame row)
      Must provide 'Numero' (used to build the file name) and 'URL'
      (the address the document is downloaded from).

  Returns
  -------
  None
      The effect is the file ``outdir + 'PDF/CONPES_<Numero>.pdf'``.
      Nothing is written when the row has no usable URL or when the server
      answers with an HTTP error status; a message is printed instead.
  """
  url = row['URL']

  # Missing URLs (NaN / None / blank) cannot be requested; skip the row
  # instead of letting one bad record abort the whole batch.
  if not isinstance(url, str) or not url.strip():
    print('Skipped CONPES ' + str(row['Numero']) + ': no URL available')
    return

  # Make sure the target folder exists before opening the file for writing.
  pdf_dir = outdir + 'PDF/'
  os.makedirs(pdf_dir, exist_ok=True)
  file_name = pdf_dir + 'CONPES_' + str(row['Numero']) + '.pdf'

  # NOTE(review): verify=False disables TLS certificate verification. It is
  # kept because the original code relied on it; confirm whether the
  # document server really needs it.
  # The timeout prevents a single stalled download from blocking forever.
  doc_req = r.get(url, verify=False, timeout=60)

  # Do not store an error page under a .pdf name.
  if not doc_req.ok:
    print('Skipped CONPES ' + str(row['Numero']) + ': HTTP ' + str(doc_req.status_code))
    return

  with open(file_name, 'wb') as f:
    f.write(doc_req.content)

  print('Saved file!')

In [None]:
# Executing the task: the download function is called once per row
# of the selected documents.
sample_docs.apply(func=make_document_request_and_save_content, axis='columns')