In [None]:
"""
 https://www.bambooweekly.com/government-corruption/ 
 https://www.bambooweekly.com/government-corruption-436/ 
 https://github.com/JoergEm/Bamboo-Weekly/tree/main 
"""

In [None]:
# Imports
from IPython.display import FileLink, Markdown, display
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
import seaborn as sns
display(Markdown("Imports ‚úÖ"))

In [None]:
# SparkSession
# Build the session object that the later cells use to create and query DataFrames.
spark = (
    SparkSession.builder
    .appName("Government Corruption")
    .getOrCreate()
)

display(Markdown("SparkSession ‚úÖ"))

In [None]:
# Function creating local folders
def create_folders(folders: list[str]) -> bool:
    """Create every listed folder below the current working directory.

    Args:
        folders: Folder paths, joined onto ``os.getcwd()``. Intermediate
            directories are created as needed.

    Returns:
        True when all folders exist as directories afterwards (a status line
        is displayed), False when any of them could not be created (the error
        is printed).
    """
    try:
        for folder in folders:
            folderpath: str = os.path.join(os.getcwd(), folder)
            # No exists() pre-check: exist_ok=True already tolerates an
            # existing directory, whereas a regular file sitting at the path
            # now raises FileExistsError instead of being reported as success.
            os.makedirs(folderpath, exist_ok=True)
    except (OSError, TypeError, ValueError) as error:
        # Only path/filesystem problems are handled here, so KeyboardInterrupt
        # and SystemExit are no longer swallowed as they were by a bare except.
        print("Error ‚ùå", error)
        return False
    else:
        display(Markdown("Folders ‚úÖ"))
        return True

In [None]:
# Function downloading data locally
def download_data(url: str, filename: str) -> bool:
    """Download ``url`` and store the payload at ``filename``.

    ``urllib`` is tried first. When the server answers HTTP 403 the download
    is retried once with ``requests``.

    Args:
        url: Address of the resource to fetch.
        filename: Local path the payload is written to.

    Returns:
        True when the file was written, False on any download or write
        failure (the error is printed).
    """
    from urllib.error import HTTPError
    from urllib.request import urlretrieve

    try:
        urlretrieve(url, filename)
        return True
    except HTTPError as error:
        if error.code != 403:
            print("Error ‚ùå", error)
            return False
    except (OSError, ValueError) as error:
        # URLError (unreachable host, missing file:// target, truncated
        # download) is an OSError; ValueError covers a malformed URL. Without
        # this clause they escaped the function despite its bool contract.
        print("Error ‚ùå", error)
        return False

    # Only an HTTP 403 reaches this point.
    # NOTE(review): presumably the server rejects urllib's default client
    # identification and accepts the one sent by requests - confirm.
    import requests

    try:
        # Bounded wait so an unresponsive server cannot hang the notebook.
        response: requests.Response = requests.get(url, timeout=30)
        # Without this check an HTTP error page would be saved as the data
        # file and reported as a successful download.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(response.content)
        return True
    except (requests.RequestException, OSError) as error:
        print("Error ‚ùå", error)
        return False

In [None]:
# Links and folders to receive data and read into DataFrame
url: str = 'https://images.transparencycdn.org/images/CPI2022_GlobalResultsTrends.xlsx'
filename: str = 'CPI2022_GlobalResultsTrends.xlsx'
folders: list[str] = ['data', 'results']
filepath: str = os.path.join(folders[0], filename)
create_folders(folders)

# Download only when no local copy exists; remember which path was taken so the
# matching status message can be shown after the file has been read.
loaded_from_existing_file: bool = os.path.exists(filepath)
if not loaded_from_existing_file and not download_data(url, filepath):
    display(Markdown("Error ‚ùå"))
    # Stop here with a clear message: continuing would only fail further down
    # with a NameError because `data` was never assigned.
    raise RuntimeError(f"Could not download {url} to {filepath}")

# Single read for both paths; the first two rows of the sheet are skipped.
data: pd.DataFrame = pd.read_excel(filepath, sheet_name="CPI 2022 (final)", skiprows=2)
if loaded_from_existing_file:
    display(Markdown("Data loaded from existing file üìÅ"))
else:
    display(Markdown("Data ‚úÖ"))

if os.path.exists(filepath):
    display(FileLink(filepath))

df = spark.createDataFrame(data)
display(Markdown("Data converted to Spark DataFrame ‚úÖ"))

In [None]:
# According to Transparency International, what five countries were least corrupt in 2022?
# Sort ascending by "Rank" and print the first five rows without truncating the names.
(
    df
    .orderBy(F.col("Rank").asc())
    .select("Rank", "Country / Territory")
    .show(n=5, truncate=False)
)

In [None]:
# According to the same data, what five countries were most corrupt in 2022?
# Sort descending by "Rank" and print the first five rows without truncating the names.
(
    df
    .orderBy(F.col("Rank").desc())
    .select("Rank", "Country / Territory")
    .show(n=5, truncate=False)
)