### Websites to scrape

Global intentional homicide rates
https://dataunodc.un.org/dp-intentional-homicide-victims


Global Crime Index:
https://www.numbeo.com/crime/rankings_by_country.jsp

Global Health index:
https://www.numbeo.com/health-care/rankings_by_country.jsp

GDP per capita index:
https://wits.worldbank.org/CountryProfile/en/Country/BY-COUNTRY/StartYear/2010/EndYear/2022/Indicator/NY-GDP-PCAP-CD


In [52]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
import time
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd

In [56]:
# Absolute path to the local chromedriver executable; it is handed to selenium's
# Service(...) in every scraping cell below. The path is machine-specific, so
# adjust it before running the notebook on another machine.
webdriver_path = r"f:\necessary files 2\chromedriver-win64\chromedriver-win64\chromedriver.exe"


## Global Intentional Homicide Rate

This source provides a downloadable dataset that we can analyze directly, so there is no need to scrape the page.

Instead, we request the file's direct download URL with `requests` and write the response to the current working directory.

In [58]:
import requests

# Direct URL of the UNODC intentional-homicide Excel file
file_url = "https://dataunodc.un.org/sites/dataunodc.un.org/files/data_cts_intentional_homicide.xlsx"
# Local file name; the relative path means it is written to the current working directory
output_name = "data_cts_intentional_homicide.xlsx"

# Download the file.
# A timeout is passed explicitly: without one, requests waits indefinitely
# if the server stops responding, which would hang the notebook cell.
try:
    response = requests.get(file_url, timeout=60)
except requests.RequestException as exc:
    # Connection errors and timeouts are reported the same way as a bad status
    # code, instead of surfacing as a raw traceback.
    print(f"Download failed! ({exc})")
else:
    if response.status_code == 200:
        with open(output_name, "wb") as file:
            file.write(response.content)
        print(f"Download successful: {output_name}")
    else:
        print("Download failed!")


Download successful: data_cts_intentional_homicide.xlsx


## Global Crime Index

In [10]:
# Numbeo crime ranking-by-country page; the year (2024) is selected through the
# `title` query parameter.
gci_url = "https://www.numbeo.com/crime/rankings_by_country.jsp?title=2024&displayColumn=0"

In [11]:
# Scrape the ranking table (element id "t2") from the page at `gci_url`
# into the DataFrame `df`.
service = Service(webdriver_path)
driver = webdriver.Chrome(service=service)

try:
    # Navigate inside the try block so the browser is closed by the `finally`
    # clause even when the page load itself fails.
    driver.get(gci_url)

    # Wait (up to 10 s) for the table to be present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "t2"))
    )

    # Find the table
    table = driver.find_element(By.ID, "t2")

    # Column names are taken from the table's <th> cells
    headers = [header.text for header in table.find_elements(By.TAG_NAME, "th")]

    # Collect one [rank, country, crime_index] list per table row
    data = []
    for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
        cols = row.find_elements(By.TAG_NAME, "td")

        # Skip rows that do not carry the three expected cells
        if len(cols) < 3:
            continue

        rank = cols[0].text
        country = cols[1].text
        crime_index = cols[2].text
        data.append([rank, country, crime_index])

    # Create DataFrame; this raises if the number of headers differs from the
    # three values collected per row, which signals an unexpected page layout.
    df = pd.DataFrame(data, columns=headers)
finally:
    driver.quit()

In [14]:
# Preview the scraped table (at most the first 50 rows)
df.head(50)

Unnamed: 0,Rank,Country,Crime Index
0,1,Venezuela,81.2
1,2,Papua New Guinea,79.7
2,3,Afghanistan,78.3
3,4,Haiti,77.9
4,5,South Africa,75.4
5,6,Honduras,73.4
6,7,Trinidad And Tobago,70.8
7,8,Syria,69.1
8,9,Yemen,68.6
9,10,Jamaica,68.1


In [15]:
# Store a copy of the scraped 2024 table under a dedicated name
gci_df_2024=df.copy()

In [17]:
# Scrape the Numbeo crime ranking table for each year from 2012 through 2023
# and collect the rows into `gci_df` (columns: Year, Rank, Country, Crime Index).
service = Service(webdriver_path)
driver = webdriver.Chrome(service=service)

# Store all data: one [year, rank, country, crime_index] list per table row
all_data = []

try:
    # Loop through years (range end is exclusive, so the last year is 2023)
    for year in range(2012, 2024):
        # The 2012 and 2013 pages are requested with a "-Q1" suffix on the title
        if year in (2012, 2013):
            gci_url = f"https://www.numbeo.com/crime/rankings_by_country.jsp?title={year}-Q1&displayColumn=0"
        else:
            gci_url = f"https://www.numbeo.com/crime/rankings_by_country.jsp?title={year}&displayColumn=0"

        try:
            # Navigation is inside the per-year try so that one failed page
            # load is reported and skipped instead of aborting the whole run.
            driver.get(gci_url)

            # Wait (up to 10 s) for the table to be present in the DOM
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "t2"))
            )

            # Find the table
            table = driver.find_element(By.ID, "t2")

            # Extract rows
            for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
                cols = row.find_elements(By.TAG_NAME, "td")

                # Skip rows that do not carry the three expected cells
                if len(cols) < 3:
                    continue

                rank = cols[0].text
                country = cols[1].text
                crime_index = cols[2].text
                all_data.append([year, rank, country, crime_index])

        except Exception as e:
            # Best effort: report the failing year and continue with the next one
            print(f"Error scraping {year}: {e}")
finally:
    # Always close the browser, even if the loop is interrupted
    driver.quit()

# Create DataFrame
gci_df = pd.DataFrame(all_data, columns=["Year", "Rank", "Country", "Crime Index"])


In [44]:
# Write the multi-year crime-index table to CSV; index=False leaves the
# DataFrame index out of the file.
gci_df.to_csv('raw_gci_data.csv',index=False)

## Global Health Index

In [46]:
# Scrape the Numbeo health-care ranking table for each year from 2012 through
# 2023 and collect the rows into `ghi_df`.
service = Service(webdriver_path)
driver = webdriver.Chrome(service=service)

# Store all data: one [year, rank, country, health_care_index] list per table row
all_data = []

try:
    # Loop through years (range end is exclusive, so the last year is 2023)
    for year in range(2012, 2024):
        # The 2012 and 2013 pages are requested with a "-Q1" suffix on the title
        if year in (2012, 2013):
            ghi_url = f"https://www.numbeo.com/health-care/rankings_by_country.jsp?title={year}-Q1&displayColumn=0"
        else:
            ghi_url = f"https://www.numbeo.com/health-care/rankings_by_country.jsp?title={year}&displayColumn=0"

        try:
            # Navigation is inside the per-year try so that one failed page
            # load is reported and skipped instead of aborting the whole run.
            driver.get(ghi_url)

            # Wait (up to 10 s) for the table to be present in the DOM
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "t2"))
            )

            # Find the table
            table = driver.find_element(By.ID, "t2")

            # Extract rows
            for row in table.find_elements(By.CSS_SELECTOR, "tbody tr"):
                cols = row.find_elements(By.TAG_NAME, "td")

                # Skip rows that do not carry the three expected cells
                if len(cols) < 3:
                    continue

                rank = cols[0].text
                country = cols[1].text
                health_care_index = cols[2].text
                all_data.append([year, rank, country, health_care_index])

        except Exception as e:
            # Best effort: report the failing year and continue with the next one
            print(f"Error scraping {year}: {e}")
finally:
    # Always close the browser, even if the loop is interrupted
    driver.quit()

# Create DataFrame
# NOTE(review): the last column holds values scraped from the health-care
# ranking pages but is labelled "Crime Index". The label is kept unchanged here
# because other code may read this column name from the exported file; confirm
# and rename it together with its consumers.
ghi_df = pd.DataFrame(all_data, columns=["Year", "Rank", "Country", "Crime Index"])


In [48]:
# Write the multi-year health-care table to disk in CSV format; index=False
# leaves the DataFrame index out of the file.
# NOTE(review): the extension is ".cvs", which looks like a typo for ".csv"
# (the crime table is saved as 'raw_gci_data.csv'). Left unchanged because
# other code may open the file by this exact name; confirm before renaming.
ghi_df.to_csv("raw_ghi_data.cvs",index=False)

## GDP Per Capita 
Like the Global Intentional Homicide Rate dataset, this one is available as a downloadable Excel file.
However, the download link is not direct, so we have to use Selenium in this case.
Running the cell below automates the download, so the Excel file is saved to the current working directory.

In [53]:
# Download the WITS GDP-per-capita Excel export by driving the site's
# download menu with Selenium.

# Set Chrome options to handle downloads
download_dir = os.getcwd()  # Save to current working directory
chrome_options = Options()
prefs = {"download.default_directory": download_dir, "directory_upgrade": True, "safebrowsing.enabled": True}
chrome_options.add_experimental_option("prefs", prefs)

# Snapshot the directory before downloading. Only files that appear afterwards
# count as the download; otherwise any .xlsx that was already present (for
# example one saved by an earlier cell) would be reported as a success.
files_before = set(os.listdir(download_dir))

# Maximum time to wait for the downloaded file to appear, in seconds
download_timeout = 60

# Start WebDriver
service = Service(webdriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Page to open
url = "https://wits.worldbank.org/CountryProfile/en/Country/BY-COUNTRY/StartYear/2010/EndYear/2022/Indicator/NY-GDP-PCAP-CD#"

try:
    # Navigate inside the try block so the browser is closed by the `finally`
    # clause even when the page load itself fails.
    driver.get(url)

    # Wait for the download button to be clickable
    download_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "DataDownload"))
    )
    download_button.click()

    # Wait for Excel option to appear and click it
    excel_download = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "li.excel a"))
    )
    excel_download.click()

    # Poll for a new .xlsx file instead of sleeping for a fixed time: the cell
    # finishes as soon as the file is ready and tolerates slow downloads.
    downloaded_file = None
    deadline = time.monotonic() + download_timeout
    while time.monotonic() < deadline:
        new_files = [name for name in os.listdir(download_dir) if name not in files_before]
        new_xlsx = sorted(name for name in new_files if name.endswith(".xlsx"))
        # Chrome keeps an unfinished download under a ".crdownload" name, so a
        # new file with that suffix means the transfer is still running.
        still_downloading = any(name.endswith(".crdownload") for name in new_files)
        if new_xlsx and not still_downloading:
            downloaded_file = new_xlsx[0]
            break
        time.sleep(0.5)

    if downloaded_file:
        print(f"Download successful: {downloaded_file}")
    else:
        print("Download failed!")

except Exception as e:
    print(f"Error: {e}")

finally:
    driver.quit()

Download successful: WITS-Country-Timeseries.xlsx
