In [1]:
import os
import cv2
import requests
import numpy as np
import pandas as pd
from IPython import display
from urllib.parse import unquote
from pdf2image import convert_from_path

In [2]:
# Fetch the circular metadata from the resources API.
# The timeout stops the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports an HTTP error here instead of
# letting it surface later as a JSON-decoding or KeyError failure.
content = requests.get('https://circular-ai.glitch.me/resources', timeout=30)
content.raise_for_status()
data = content.json()
# The records are stored under the top-level "data" key of the payload.
df = pd.DataFrame(data["data"])
df.head()

Unnamed: 0,_id,pageNumber,reference,document,title,timestamp,__v
0,6623ba0dfaf3e17e8b6016d6,1,TED/DIR/CON/GOM/001/073,https://cbn.gov.ng/Out/2024/CCD/Sales of FX to...,Sales of FX to BDCs to Meet Retail Market Dema...,4/8/2024,0
1,6623ba0dfaf3e17e8b6016d7,1,BSD/DIR/PUB/LAB/017/004,https://cbn.gov.ng/Out/2024/CCD/LETTER TO ALL ...,Letter to all Banks- The Use of Foreign- Curre...,4/8/2024,0
2,6623ba0dfaf3e17e8b6016d8,1,FPR/DIR/PUB/CIR/002/009,https://cbn.gov.ng/Out/2024/CCD/Recapitalizati...,Review of Minimum Capital Requirements for Com...,3/28/2024,0
3,6623ba0dfaf3e17e8b6016d9,1,CCD/PUB/BUL/Q1-2024,https://cbn.gov.ng/Out/2024/CCD/BULLION 2024 R...,CBN Bullion Volume 48 No.1 January - March 2024,3/27/2024,0
4,6623ba0dfaf3e17e8b6016da,1,CBN/MPC/COM/151/294,https://cbn.gov.ng/Out/2024/CCD/UPDATE DDG EP ...,Central Bank of Nigeria Communique No.151 of t...,3/26/2024,0


In [3]:
# (rows, columns) of the raw metadata frame
df.shape

(2000, 7)

## Data Cleaning

In [4]:
# Keep only the rows whose lower-cased title, split on whitespace, shares no
# token with the excluded words 'vol', 'volume' and 'meeting'.
mask = df['title'].str.lower().apply(
    lambda title: {'vol', 'volume', 'meeting'}.isdisjoint(title.split())
)
filtered_df = df.loc[mask]

print(filtered_df.shape)

(1742, 7)


In [5]:
# Preview the first rows that survived the title filter
filtered_df.head()

Unnamed: 0,_id,pageNumber,reference,document,title,timestamp,__v
0,6623ba0dfaf3e17e8b6016d6,1,TED/DIR/CON/GOM/001/073,https://cbn.gov.ng/Out/2024/CCD/Sales of FX to...,Sales of FX to BDCs to Meet Retail Market Dema...,4/8/2024,0
1,6623ba0dfaf3e17e8b6016d7,1,BSD/DIR/PUB/LAB/017/004,https://cbn.gov.ng/Out/2024/CCD/LETTER TO ALL ...,Letter to all Banks- The Use of Foreign- Curre...,4/8/2024,0
2,6623ba0dfaf3e17e8b6016d8,1,FPR/DIR/PUB/CIR/002/009,https://cbn.gov.ng/Out/2024/CCD/Recapitalizati...,Review of Minimum Capital Requirements for Com...,3/28/2024,0
7,6623ba0dfaf3e17e8b6016dd,1,CCD/FAQ/01/16-03-2024,https://cbn.gov.ng/Out/2024/CCD/FAQs for ferti...,Partnership to Tackle Food Inflation: Fertili...,3/16/2024,0
8,6623ba0dfaf3e17e8b6016de,1,BSD/DIR/PUB/LAB/017/003,https://cbn.gov.ng/Out/2024/CCD/RE IMPACT OF R...,Re: Impact of Recent Policy Reforms-Prudential...,3/14/2024,0


In [6]:
# Number of missing values in each column
filtered_df.isna().sum()

_id             0
pageNumber      0
reference       0
document        0
title           0
timestamp     140
__v             0
dtype: int64

In [7]:
# Drop every row that has a missing value in any column
filtered_df = filtered_df.dropna()

In [9]:
# Parse the timestamps and derive the publication year in one chain.
# Every step returns a new frame, so nothing is assigned onto a slice of an
# earlier DataFrame (the cause of SettingWithCopyWarning).
filtered_df = (
    filtered_df
    # errors='coerce' turns unparseable timestamps into NaT so that dropna()
    # below removes those rows instead of to_datetime aborting the cell.
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], errors='coerce'))
    .assign(year=lambda frame: frame['timestamp'].dt.year)
    .dropna()
    # .dt.year is float while NaT rows are present; cast once they are gone.
    .astype({'year': int})
)

filtered_df['year'].value_counts()

2019    249
2020    201
2023    186
2017    179
2018    175
2021    162
2022    146
2016    136
2024     48
Name: year, dtype: int64

__Use only documents associated with 2023 and 2024__

In [10]:
# Keep only the circulars dated 2023 or 2024. copy() makes the result an
# independent frame, so the column added to it in a later cell is written to
# this frame rather than to a slice of the previous one.
filtered_df = filtered_df[filtered_df["year"].isin([2024, 2023])].copy()

print(len(filtered_df))

234


## Download the PDFs

In [11]:
def download_pdf(pdf_url, download_dir, timeout=30):
    """Download a PDF into ``download_dir`` and return the saved file path.

    Args:
        pdf_url: URL of the PDF; the file name is taken from its last path
            segment after URL-decoding.
        download_dir: Existing directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file, or ``None`` when the request fails
        (connection error, timeout, or a non-success HTTP status). Nothing is
        written in that case.
    """
    # Extract the filename from the URL and decode URL-encoded characters.
    # basename() is applied after decoding so that an encoded separator such
    # as "..%2F" cannot make the file land outside download_dir.
    filename = os.path.basename(unquote(pdf_url.split("/")[-1]))
    # Construct the full file path
    file_path = os.path.join(download_dir, filename)

    # Download the PDF; without raise_for_status() an HTML error page would be
    # saved under a .pdf name.
    try:
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {pdf_url}: {exc}")
        return None

    with open(file_path, 'wb') as f:
        f.write(response.content)

    return file_path

# Define the directory to store downloaded PDFs
download_dir = "downloaded_pdfs"

# Create the directory if it doesn't exist
os.makedirs(download_dir, exist_ok=True)

# One entry per row of filtered_df: the saved file path, or None when the
# document is not a PDF or could not be downloaded.
downloaded_pdf_paths = []

for pdf_url in filtered_df['document']:
    # Only URLs ending in '.pdf' are downloaded
    if not pdf_url.lower().endswith('.pdf'):
        downloaded_pdf_paths.append(None)
        continue
    try:
        downloaded_pdf_paths.append(download_pdf(pdf_url, download_dir))
    except (requests.RequestException, OSError) as exc:
        # A single unreachable document must not abort the whole batch.
        print(f"Could not download {pdf_url}: {exc}")
        downloaded_pdf_paths.append(None)

# Add the downloaded PDF paths as a new column; assign() returns a new frame,
# so nothing is written onto a slice of an earlier DataFrame.
filtered_df = filtered_df.assign(downloaded_pdf_path=downloaded_pdf_paths)

filtered_df

Unnamed: 0,_id,pageNumber,reference,document,title,timestamp,__v,year,downloaded_pdf_path
0,6623ba0dfaf3e17e8b6016d6,1,TED/DIR/CON/GOM/001/073,https://cbn.gov.ng/Out/2024/CCD/Sales of FX to...,Sales of FX to BDCs to Meet Retail Market Dema...,2024-04-08,0,2024,downloaded_pdfs/Sales of FX to BDCs to Meet Re...
1,6623ba0dfaf3e17e8b6016d7,1,BSD/DIR/PUB/LAB/017/004,https://cbn.gov.ng/Out/2024/CCD/LETTER TO ALL ...,Letter to all Banks- The Use of Foreign- Curre...,2024-04-08,0,2024,downloaded_pdfs/LETTER TO ALL BANKS- THE USE O...
2,6623ba0dfaf3e17e8b6016d8,1,FPR/DIR/PUB/CIR/002/009,https://cbn.gov.ng/Out/2024/CCD/Recapitalizati...,Review of Minimum Capital Requirements for Com...,2024-03-28,0,2024,downloaded_pdfs/Recapitalization_MARCH_2024.pdf
7,6623ba0dfaf3e17e8b6016dd,1,CCD/FAQ/01/16-03-2024,https://cbn.gov.ng/Out/2024/CCD/FAQs for ferti...,Partnership to Tackle Food Inflation: Fertili...,2024-03-16,0,2024,downloaded_pdfs/FAQs for fertilisers02.pdf
8,6623ba0dfaf3e17e8b6016de,1,BSD/DIR/PUB/LAB/017/003,https://cbn.gov.ng/Out/2024/CCD/RE IMPACT OF R...,Re: Impact of Recent Policy Reforms-Prudential...,2024-03-14,0,2024,downloaded_pdfs/RE IMPACT OF RECENT POLICY REF...
...,...,...,...,...,...,...,...,...,...
268,6623ba0dfaf3e17e8b6017e2,14,FPR/DIR/PUB/CIR/01/064,https://cbn.gov.ng/Out/2023/FPRD/Circular on G...,Guidance on Ultimate Beneficial Owners of Lega...,2023-01-13,0,2023,downloaded_pdfs/Circular on Guidance Ultimate ...
269,6623ba0dfaf3e17e8b6017e3,14,BSD/DIR/GEN/DLR/001/002,https://cbn.gov.ng/Out/2023/BSD/WEEKLY INTERES...,Deposit and Lending Rates in the Banking Indus...,2023-01-13,0,2023,downloaded_pdfs/WEEKLY INTEREST RATES AS AT JA...
270,6623ba0dfaf3e17e8b6017e4,14,OFI/DOA/CON/OFI/001/304,https://cbn.gov.ng/Out/2023/OFISD/Prohibition ...,Letter to all OFIs : Prohibition of Placement/...,2023-01-09,0,2023,downloaded_pdfs/Prohibition of Placements in F...
271,6623ba0dfaf3e17e8b6017e5,14,BSD/DIR/GEN/DLR/001/001,https://cbn.gov.ng/Out/2023/BSD/WEEKLY INTERES...,Deposit and Lending Rates in the Banking Indus...,2023-01-06,0,2023,downloaded_pdfs/WEEKLY INTEREST RATES AS AT JA...


In [12]:
print(len(filtered_df))

# Drop all rows that have no downloaded PDF.
# `series != None` is evaluated element-wise by pandas and is True for every
# row, missing ones included, so it filters nothing; notna() is the
# missing-value test. copy() makes the result an independent frame that later
# cells can add columns to.
clean_filtered_df = filtered_df[filtered_df["downloaded_pdf_path"].notna()].copy()
print(len(clean_filtered_df))

234
234


### Convert the PDFs to PNG

In [13]:
from PIL import Image
import numpy as np
import cv2

def pdf_to_images(pdf_path, download_dir):
    """Render a PDF as a single PNG with its pages stacked vertically.

    Args:
        pdf_path: Path of the PDF to convert.
        download_dir: Existing directory the PNG is written to. The PNG keeps
            the PDF's file name with the extension replaced by ``.png``.

    Returns:
        The path of the written PNG, or ``None`` when the pages could not be
        stacked or the image could not be written.
    """
    # Convert PDF to images
    images = convert_from_path(pdf_path)

    # basename/splitext strip only the directory and the final extension, so
    # names containing dots ("Circular No. 5.pdf") and paths with more than
    # one directory level map to the right output name.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    file_path = os.path.join(download_dir, stem + ".png")

    try:
        # Concatenate the pages vertically. np.concatenate raises when the
        # pages do not all share the same width (or when there are no pages);
        # such PDFs are skipped rather than resized.
        stacked_image = np.concatenate([np.array(image) for image in images], axis=0)

        # Save concatenated image (OpenCV expects BGR channel order)
        saved = cv2.imwrite(file_path, cv2.cvtColor(stacked_image, cv2.COLOR_RGB2BGR))
    except Exception:
        # Best-effort conversion: report failure via None. Catching Exception
        # rather than using a bare except keeps KeyboardInterrupt working.
        return None

    # imwrite signals failure through its return value, not an exception.
    return file_path if saved else None

# Define the directory to store the generated PNGs
png_dir = "png_files"

# Create the directory if it doesn't exist
os.makedirs(png_dir, exist_ok=True)

# One entry per row of clean_filtered_df: the PNG path, or None when the row
# has no PDF or the conversion failed.
downloaded_png_paths = []
for pdf_path in clean_filtered_df['downloaded_pdf_path']:
    png_path = None
    # Rows without a downloaded PDF hold None/NaN instead of a string
    if isinstance(pdf_path, str) and pdf_path.lower().endswith('.pdf'):
        try:
            png_path = pdf_to_images(pdf_path, png_dir)
        except Exception as exc:
            # Keep going after a bad PDF, but say which one failed. A bare
            # `except:` here would also swallow KeyboardInterrupt and make
            # this long loop impossible to stop.
            print(f"Could not convert {pdf_path}: {exc}")
    downloaded_png_paths.append(png_path)

# Add the PNG paths as a new column; assign() returns a new frame, so nothing
# is written onto a slice of an earlier DataFrame.
clean_filtered_df = clean_filtered_df.assign(downloaded_png_path=downloaded_png_paths)

In [14]:
# Preview the first rows, now including the PNG path column
clean_filtered_df.head()

Unnamed: 0,_id,pageNumber,reference,document,title,timestamp,__v,year,downloaded_pdf_path,downloaded_png_path
0,6623ba0dfaf3e17e8b6016d6,1,TED/DIR/CON/GOM/001/073,https://cbn.gov.ng/Out/2024/CCD/Sales of FX to...,Sales of FX to BDCs to Meet Retail Market Dema...,2024-04-08,0,2024,downloaded_pdfs/Sales of FX to BDCs to Meet Re...,png_files/Sales of FX to BDCs to Meet Retail M...
1,6623ba0dfaf3e17e8b6016d7,1,BSD/DIR/PUB/LAB/017/004,https://cbn.gov.ng/Out/2024/CCD/LETTER TO ALL ...,Letter to all Banks- The Use of Foreign- Curre...,2024-04-08,0,2024,downloaded_pdfs/LETTER TO ALL BANKS- THE USE O...,png_files/LETTER TO ALL BANKS- THE USE OF FORE...
2,6623ba0dfaf3e17e8b6016d8,1,FPR/DIR/PUB/CIR/002/009,https://cbn.gov.ng/Out/2024/CCD/Recapitalizati...,Review of Minimum Capital Requirements for Com...,2024-03-28,0,2024,downloaded_pdfs/Recapitalization_MARCH_2024.pdf,
7,6623ba0dfaf3e17e8b6016dd,1,CCD/FAQ/01/16-03-2024,https://cbn.gov.ng/Out/2024/CCD/FAQs for ferti...,Partnership to Tackle Food Inflation: Fertili...,2024-03-16,0,2024,downloaded_pdfs/FAQs for fertilisers02.pdf,png_files/FAQs for fertilisers02.png
8,6623ba0dfaf3e17e8b6016de,1,BSD/DIR/PUB/LAB/017/003,https://cbn.gov.ng/Out/2024/CCD/RE IMPACT OF R...,Re: Impact of Recent Policy Reforms-Prudential...,2024-03-14,0,2024,downloaded_pdfs/RE IMPACT OF RECENT POLICY REF...,png_files/RE IMPACT OF RECENT POLICY REFORMS-P...


__Extract text from the PNGs__

In [17]:
# Display the frame with both the PDF and PNG path columns
clean_filtered_df

Unnamed: 0,_id,pageNumber,reference,document,title,timestamp,__v,year,downloaded_pdf_path,downloaded_png_path
0,6623ba0dfaf3e17e8b6016d6,1,TED/DIR/CON/GOM/001/073,https://cbn.gov.ng/Out/2024/CCD/Sales of FX to...,Sales of FX to BDCs to Meet Retail Market Dema...,2024-04-08,0,2024,downloaded_pdfs/Sales of FX to BDCs to Meet Re...,png_files/Sales of FX to BDCs to Meet Retail M...
1,6623ba0dfaf3e17e8b6016d7,1,BSD/DIR/PUB/LAB/017/004,https://cbn.gov.ng/Out/2024/CCD/LETTER TO ALL ...,Letter to all Banks- The Use of Foreign- Curre...,2024-04-08,0,2024,downloaded_pdfs/LETTER TO ALL BANKS- THE USE O...,png_files/LETTER TO ALL BANKS- THE USE OF FORE...
2,6623ba0dfaf3e17e8b6016d8,1,FPR/DIR/PUB/CIR/002/009,https://cbn.gov.ng/Out/2024/CCD/Recapitalizati...,Review of Minimum Capital Requirements for Com...,2024-03-28,0,2024,downloaded_pdfs/Recapitalization_MARCH_2024.pdf,
7,6623ba0dfaf3e17e8b6016dd,1,CCD/FAQ/01/16-03-2024,https://cbn.gov.ng/Out/2024/CCD/FAQs for ferti...,Partnership to Tackle Food Inflation: Fertili...,2024-03-16,0,2024,downloaded_pdfs/FAQs for fertilisers02.pdf,png_files/FAQs for fertilisers02.png
8,6623ba0dfaf3e17e8b6016de,1,BSD/DIR/PUB/LAB/017/003,https://cbn.gov.ng/Out/2024/CCD/RE IMPACT OF R...,Re: Impact of Recent Policy Reforms-Prudential...,2024-03-14,0,2024,downloaded_pdfs/RE IMPACT OF RECENT POLICY REF...,png_files/RE IMPACT OF RECENT POLICY REFORMS-P...
...,...,...,...,...,...,...,...,...,...,...
268,6623ba0dfaf3e17e8b6017e2,14,FPR/DIR/PUB/CIR/01/064,https://cbn.gov.ng/Out/2023/FPRD/Circular on G...,Guidance on Ultimate Beneficial Owners of Lega...,2023-01-13,0,2023,downloaded_pdfs/Circular on Guidance Ultimate ...,png_files/Circular on Guidance Ultimate Benefi...
269,6623ba0dfaf3e17e8b6017e3,14,BSD/DIR/GEN/DLR/001/002,https://cbn.gov.ng/Out/2023/BSD/WEEKLY INTERES...,Deposit and Lending Rates in the Banking Indus...,2023-01-13,0,2023,downloaded_pdfs/WEEKLY INTEREST RATES AS AT JA...,png_files/WEEKLY INTEREST RATES AS AT JANUARY ...
270,6623ba0dfaf3e17e8b6017e4,14,OFI/DOA/CON/OFI/001/304,https://cbn.gov.ng/Out/2023/OFISD/Prohibition ...,Letter to all OFIs : Prohibition of Placement/...,2023-01-09,0,2023,downloaded_pdfs/Prohibition of Placements in F...,png_files/Prohibition of Placements in Funds M...
271,6623ba0dfaf3e17e8b6017e5,14,BSD/DIR/GEN/DLR/001/001,https://cbn.gov.ng/Out/2023/BSD/WEEKLY INTERES...,Deposit and Lending Rates in the Banking Indus...,2023-01-06,0,2023,downloaded_pdfs/WEEKLY INTEREST RATES AS AT JA...,png_files/WEEKLY INTEREST RATES AS AT JANUARY ...


In [19]:
# Save the table to CSV; index=False leaves the DataFrame index out of the file
clean_filtered_df.to_csv("2023_2024_policy_circulars.csv", index=False)