# Step 1: Setup and Imports


Sets up the Python environment and imports the libraries needed for web scraping, data manipulation, and visualization.

In [50]:
# KNBS Web Scraping and Analysis
# Author: Hellen Mati 

# Import required libraries
import requests
import pandas as pd
import matplotlib.pyplot as plt
import os
from bs4 import BeautifulSoup

# Display a confirmation message
print("Libraries imported successfully!")



Libraries imported successfully!


# Step 2: Define the Target Website


Defines the base URL for the Kenya National Bureau of Statistics (KNBS) and the specific page containing publications.

In [55]:

# Root address of the KNBS website
base_url = "https://www.knbs.or.ke"

# Leading Economic Indicators page, built from base_url so the host name is
# written in exactly one place and the two values cannot drift apart
url = f"{base_url}/leading-economic-indicators/"


# Print the URL to confirm

print(f"Target URL: {url}")


Target URL: https://www.knbs.or.ke/leading-economic-indicators/


# Step 3: Scrape Data from KNBS Website

Fetches the HTML content of the Leading Economic Indicators page.
Extracts the links to the PDF publications using BeautifulSoup.
Downloads each PDF into the local `Indicators` folder for further analysis.

In [43]:

# Cell-local import: URL helpers used to resolve and inspect the scraped links
from urllib.parse import urljoin, urlsplit

# Directory to save PDFs
output_dir = "Indicators"

# Seconds to wait on any single HTTP request. requests applies no timeout by
# default, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 30

# Number of bytes written to disk at a time while streaming a PDF
CHUNK_SIZE = 8192

# Create the folder if it doesn't exist (no error when it is already there)
os.makedirs(output_dir, exist_ok=True)

# Send an HTTP GET request to the KNBS Leading Economic Indicators page
response = requests.get(url, timeout=REQUEST_TIMEOUT)

# Check if the request was successful
if response.status_code == 200:
    print("Successfully accessed the KNBS Leading Economic Indicators page!")

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all links on the page
    links = soup.find_all("a")

    # Filter for PDF links, keeping each distinct URL once and in page order
    pdf_links = []
    for link in links:
        href = link.get("href")
        if not href:
            continue
        # Resolve relative hrefs against the page that was actually served
        # (after any redirect) so that requests can fetch them later.
        absolute_url = urljoin(response.url, href.strip())
        # Look only at the URL path, case-insensitively, so "FILE.PDF" and
        # "file.pdf?ver=2" are both recognised as PDFs.
        is_pdf = urlsplit(absolute_url).path.lower().endswith(".pdf")
        if is_pdf and absolute_url not in pdf_links:
            pdf_links.append(absolute_url)

    # Display the number of PDFs found
    print(f"Found {len(pdf_links)} PDF links.")

    # Download each PDF
    for pdf_url in pdf_links:
        try:
            # Take the file name from the URL path so that a query string or
            # fragment never ends up in the name on disk
            file_name = urlsplit(pdf_url).path.split("/")[-1]
            file_path = os.path.join(output_dir, file_name)

            # Stream the body to disk chunk by chunk instead of holding the
            # whole PDF in memory; the with-block releases the connection.
            with requests.get(pdf_url, stream=True, timeout=REQUEST_TIMEOUT) as pdf_response:
                if pdf_response.status_code == 200:
                    with open(file_path, "wb") as pdf_file:
                        for chunk in pdf_response.iter_content(chunk_size=CHUNK_SIZE):
                            pdf_file.write(chunk)
                    print(f"Downloaded: {file_name}")
                else:
                    print(f"Failed to download {file_name} (HTTP {pdf_response.status_code})")
        except (requests.RequestException, OSError) as e:
            # Network and file-system failures are reported per file so that
            # one bad link does not stop the remaining downloads
            print(f"Error downloading {pdf_url}: {e}")

else:
    print(f"Failed to access the page. HTTP Status Code: {response.status_code}")


Successfully accessed the KNBS Leading Economic Indicators page!
Found 22 PDF links.
Downloaded: Leading-Economic-Indicators-September-2024_1.pdf
Downloaded: Leading-Economic-Indicators-August-2024.pdf
Downloaded: Leading-Economic-Indicators-July-2024.pdf
Downloaded: Leading-Economic-Indicators-June-2024.pdf
Downloaded: Leading-Economic-Indicators-May-2024.pdf
Downloaded: Leading-Economic-Indicators-April-2024.pdf
Downloaded: leading-economic-indicator-march-2024-1.pdf
Downloaded: Leading-Economic-Indicator-February-2024.pdf
Downloaded: Leading-Economic-Indicators-January-2024.pdf
Downloaded: Kenya-Leading-Economic-Indicators-December-2023.pdf
Downloaded: Kenya-Leading-Economic-Indicators-November-2023.pdf
Downloaded: Kenya-Leading-Economic-Indicators-October-2023.pdf
Downloaded: Kenya-Leading-Economic-Indicators-September-2023.pdf
Downloaded: Kenya-Leading-Economic-Indicators-August-2023.pdf
Downloaded: Kenya-Leading-Economic-Indicators-July-2023.pdf
Downloaded: Kenya-Leading-Economic

# Step 4: Explore Scraped Data


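Installs the PDF parsing libraries, then displays a sample of the text extracted from the downloaded PDFs for verification and extracts their tables into a combined CSV file.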

In [58]:
pip install PyPDF2 pdfplumber


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-win_amd64.whl.metadata (48 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   - ------

 * Extract Text from PDF Files 
Either pdfplumber or PyPDF2 can be used to extract text from the PDFs. The cell below uses pdfplumber to preview the text of the first two pages of each PDF in the folder.

In [65]:
import pdfplumber
import os

# Path to your downloaded PDFs folder
pdf_folder = "Indicators"

# How many leading pages of each PDF to read, and how many characters of the
# extracted text to show
PREVIEW_PAGES = 2
PREVIEW_CHARS = 1000

# Loop through each PDF file in the folder. os.listdir returns names in
# arbitrary order, so sort them to get the same output on every run.
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Case-insensitive check so files saved as ".PDF" are previewed too
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        # Open the PDF and extract text
        with pdfplumber.open(pdf_path) as pdf:
            # A page without a text layer may yield no text (None in some
            # pdfplumber versions), hence the `or ""`. Pages are joined with a
            # newline so the last line of one page does not run into the first
            # line of the next.
            text = "\n".join(
                page.extract_text() or "" for page in pdf.pages[:PREVIEW_PAGES]
            )

        # Print the preview of the text
        print(f"Preview of {pdf_file} (First {PREVIEW_PAGES} pages):\n")
        print(text[:PREVIEW_CHARS])
        print("\n" + "-"*80 + "\n")
    except Exception as e:
        # Report the problem and carry on with the remaining files
        print(f"Error processing {pdf_file}: {e}")


Preview of Kenya-Leading-Economic-Indicators-April-2023.pdf (First 2 pages):

Leading Economic Indicators
APRIL 2023
2
1 2 .0
1 0 .0
8 .0
6 .0
4 .0
2 .0
-
A
6 .7 6 .5
5 .0
p r-2 2
7 .3
7 .1
5 .3
M a y-2
8
7
2
.2
.9
5 .7
Ju n -2
8
2
.7
8 .3
6 .0
Lo w
Ju l-2
e
2
r In co m
9 .2
8 .5
6 .2
A
e
u g -2
I
2
n f l a t i o n r a t
U p p e r In co m e
1 0 .2
9 .8
9 .6 9 .2
7 .1 6 .9
S ep -2 2 O ct-2 2
e
K e n ya
1 0 .4
9 .5
6 .9
N o v-2 2
In
6
fla tio n ra te
9 .2
9 .1
.9
D e c-2 2
9
7
.1
9 .0
.1
Ja n -2 3
9
9 .2
.2
7 .1
F eb -2 3
9 .2
M a
9 .0
6 .7
r-2 3
7 .9
7 .7
5 .6
A p r-2 3Page Number
List of Tables
Table 1(a): Consumer Price Indices 8
Table 1(b): Inflation Rates 9
Table 2: Mean Monthly Foreign Exchange Rates of Kenyan Shilling against Selected Major Currencies 10
Table 3: Interest rate (%) 11
Table 4: Nairobi Securities Exchange 12
Table 5(a): Money Supply 13
Table 5(b): Gross Foreign Exchange Reserves 14
Table 6: Coffee Sales and Prices 15
Table 7: Tea Production and Auction Prices 16
Tab

In [None]:
import pdfplumber
import os
import pandas as pd

# Path to your folder containing PDFs
pdf_folder = "Indicators"

# List to hold all data frames (one for each table)
all_tables_data = []


def _unique_headers(raw_header):
    """Turn a raw table header row into unique, non-empty column labels.

    Extracted header rows can contain None, blank, or repeated cells. Used
    as-is they give a DataFrame duplicate column labels, which can make the
    final pd.concat fail. Missing labels become "column_<position>" and
    repeated labels get a numeric suffix ("Name", "Name_2", ...).

    Parameters:
        raw_header: the first row of an extracted table (cells are expected to
            be strings or None).

    Returns:
        A list of unique string labels, one per cell of raw_header.
    """
    used = set()
    labels = []
    for position, cell in enumerate(raw_header, start=1):
        text = "" if cell is None else str(cell)
        label = text if text.strip() else f"column_{position}"
        # Keep bumping the suffix until the label is not taken, so a generated
        # name can never collide with one that is really in the header
        candidate = label
        suffix = 1
        while candidate in used:
            suffix += 1
            candidate = f"{label}_{suffix}"
        used.add(candidate)
        labels.append(candidate)
    return labels


# Loop through each PDF file in the folder (sorted, because os.listdir order is
# arbitrary and would otherwise change the row order of the CSV between runs)
for pdf_file in sorted(os.listdir(pdf_folder)):
    if not pdf_file.lower().endswith(".pdf"):  # Check for PDF files
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    print(f"Processing file: {pdf_file}")
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                # Extract tables from the page
                tables = page.extract_tables(
                    table_settings={"vertical_strategy": "lines", "horizontal_strategy": "text"}
                )
                if not tables:
                    print(f"No tables found on page {page_num} of {pdf_file}.")
                    continue

                for table in tables:
                    # A usable table needs a header row plus at least one data row
                    if len(table) > 1:
                        header = _unique_headers(table[0])
                        data = table[1:]
                        df = pd.DataFrame(data, columns=header)
                        df['Source_File'] = pdf_file  # Add source file column
                        all_tables_data.append(df)
                    else:
                        print(f"Skipping empty or malformed table on page {page_num}.")
    except Exception as e:
        # Report the problem and carry on with the remaining files
        print(f"Error processing {pdf_file}: {e}")

# Combine all dataframes into one
if all_tables_data:
    try:
        combined_df = pd.concat(all_tables_data, ignore_index=True)
        output_csv_path = "kenya_economic_indicators_combined.csv"
        combined_df.to_csv(output_csv_path, index=False)
        print(f"Data combined and saved to '{output_csv_path}'.")
    except Exception as e:
        print(f"Error combining data frames: {e}")
else:
    print("No tables were extracted from the PDFs.")


Processing file: Kenya-Leading-Economic-Indicators-April-2023.pdf
No tables found on page 1 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 2 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 3 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 4 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 5 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 6 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 7 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 8 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 11 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 15 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 18 of Kenya-Leading-Economic-Indicators-April-2023.pdf.
No tables found on page 19 of Kenya-Leading-Economic-Indicator

# Step 5: Analyze the Data

