In [1]:
import pandas as pd

import requests
import json
from bs4 import BeautifulSoup

from fuzzywuzzy import process

In [2]:
# Use a processed data set from an earlier notebook as the standard for country names.
# Forward slashes work on every OS; backslashes inside a normal string literal are
# parsed as escape sequences and only resolve as path separators on Windows.
df = pd.read_csv("../Data_Sets/processed/economicData_1995-2022.csv")

# One-column frame holding each distinct 'Country Name' value found in df.
df_refNames = pd.DataFrame({
    'Standard Names': df['Country Name'].unique()
})

In [3]:
def fuzzySearchName(name, refNames=df_refNames, score_threshold = 85):
    '''
        Given a country name, returns the closest match from refNames using fuzzy search.

        Parameters:
            name: country name to standardise; surrounding whitespace is ignored.
            refNames: object whose 'Standard Names' entry holds the candidate names
                (defaults to df_refNames, bound when this function is defined).
            score_threshold: score the best candidate must strictly exceed to be
                accepted. It is raised to at least 95 when the best candidate is
                one of the names in dubiousFuzzyNames.

        Returns:
            "UNKNOWN" if name is null or blank;
            the hard-coded replacement if name is a key of the exceptions table;
            the best fuzzy match if its score is above the threshold;
            "PLEASE FILL MANUALLY" otherwise (including when there are no candidates).
    '''

    # If the name is empty or null, return "UNKNOWN"
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    # Normalise once so padded input (e.g. " Czechia ") still hits the exceptions table.
    name = name.strip()

    # Names that are mapped by hand instead of being fuzzy-matched.
    exceptions = {
        "Kyrgyzstan": "Kyrgyz Republic",
        "Congo": "Republic of Congo",
        "Czechia": "Czech Republic",
        "Slovakia": "Slovak Republic",
        "Macao": "Macau",
        'Democratic Republic Of The Congo': 'Democratic Republic of Congo',
        'Republic Of The Congo': 'Republic of Congo'
    }

    if name in exceptions:
        return exceptions[name]

    # extractOne returns None when nothing can be scored, and a 2- or 3-element
    # tuple depending on whether the choices are list-like or dict-like, so index
    # the result instead of unpacking a fixed number of fields.
    best = process.extractOne(name, refNames['Standard Names'])
    if best is None:
        return "PLEASE FILL MANUALLY"
    match, score = best[0], best[1]

    # Hardcoding exceptions, due to sharing common words (South, North)
    dubiousFuzzyNames = [
        'North Korea', 'South Korea', 'South Africa',
        'North Macedonia', 'Saint Vincent and the Grenadines'
    ]

    # Demand a near-exact score before accepting one of the easily-confused names.
    if match in dubiousFuzzyNames:
        score_threshold = max(95, score_threshold)

    # If a close match is found, return the match
    if score > score_threshold:
        return match

    # If no close match is found, ask for manual input
    return "PLEASE FILL MANUALLY"

In [4]:
url = 'https://weatherandclimate.com/countries'

# A timeout keeps the cell from hanging indefinitely if the site never answers.
response = requests.get(url, timeout=30)

# Stop on a failed request: carrying on would either raise a NameError on `soup`
# or silently parse a stale `soup` left in the kernel by an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")

soup = BeautifulSoup(response.content, 'html.parser')

# All <tr> elements of the first <table> on the page.
tableRows = soup.find('table').find_all('tr')

# Two-character climate codes that get folded into another code before storing.
swapDict = {
    'AS': 'AW',
    'DS': 'DF',
    'DW': 'DF',
    'EF': 'DF',
    'ET': 'DF',
    'CS': 'CW'
}

list_countries = []
list_climateType = []
list_avg_temp = []

# The first row is skipped (presumably the table header - confirm against the page).
for row in tableRows[1:]:
    cells = row.find_all('td')

    countryName = row.find('a').text
    # Only the first two characters of the second cell are kept as the climate code.
    climateType = str(cells[1].text)[0:2]
    climateType = swapDict.get(climateType, climateType)

    try:
        avg_temp = float(cells[3].text)
    except (IndexError, ValueError):
        # Missing or non-numeric temperature cell: store a 0 placeholder.
        # Original note: doesn't matter, we'll exclude this country later.
        avg_temp = 0

    list_countries.append(countryName)
    list_climateType.append(climateType)
    list_avg_temp.append(avg_temp)

In [5]:
# Assemble the three parallel lists filled by the scraping loop into one frame,
# one column per list.
climate_columns = {
    'Country Name': list_countries,
    'Climate Type': list_climateType,
    'Average Temperature (C)': list_avg_temp,
}
df_climateType = pd.DataFrame(climate_columns)

In [6]:
# Excluding non-relevant countries to our DataSet

removeList = [
    'British Indian Ocean Territory', 'Africa', 'Europe', 'North America', 'South America', 'Asia', 'Oceania',
    'Sint Maarten (Dutch part)', 'Saint Martin (French Part)', 'Antigua and Barbuda',
    'Netherlands Antilles', 'Northern Mariana Islands', 'Saint Barthelemy',
    'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis',
    'Serbia-Montenegro', "Turks and Caicos Islands", 'United States Virgin Islands',
    'Wallis and Futuna', 'Yugoslavia', 'Cocos (Keeling) Islands', 'Isle Of Man', 'South Georgia And South Sandwich Islands',
    'South Sudan','United States Minor Outlying Islands','Saint Pierre And Miquelon', 'American Samoa',
    ]

# Apply the same title-casing to both sides so the membership test compares like with like.
df_climateType['Country Name'] = df_climateType['Country Name'].str.strip().str.title()
removeList = [country.title() for country in removeList]

# True for rows that are kept, i.e. whose name is NOT in removeList.
mask = ~df_climateType['Country Name'].isin(removeList)
df_climateType = df_climateType[mask].copy()


# mask.mean() is a fraction between 0 and 1; format it as a percentage so the
# displayed value matches the "%" in the label.
display(f'% of included countries in mask: {mask.mean():.2%}')

'% of included countries in mask: '

0.943089430894309

In [7]:
df_climateType['New Country Name'] = df_climateType['Country Name'].apply(fuzzySearchName)

# removing unmatched fuzzySearch names
# .copy() makes the filtered result an independent frame, so assigning columns to it
# later does not raise SettingWithCopyWarning.
mask = df_climateType['New Country Name'] == 'PLEASE FILL MANUALLY'
df_climateType = df_climateType[~mask].copy()

# Checking for duplicated countries (top freq should be 1)
display(df_climateType['New Country Name'].describe())

# When two source names were mapped onto the same standard name, show the colliding
# rows side by side so the wrong match can be identified and fixed by hand.
duplicated_matches = df_climateType[df_climateType['New Country Name'].duplicated(keep=False)]
if not duplicated_matches.empty:
    display(duplicated_matches[['Country Name', 'New Country Name']])

count      184
unique     183
top       Mali
freq         2
Name: New Country Name, dtype: object

In [8]:
pd.set_option('display.max_rows', 350)
pd.set_option('display.min_rows', 135)
pd.set_option('display.max_columns', None)

# Fixing col names
# Overwrite 'Country Name' with the standardised name, then drop the helper column
# by name. .assign() builds a new frame (no assignment into a filtered slice), and
# dropping by name does not depend on the helper being the last column.
df_climateType = (
    df_climateType
    .assign(**{'Country Name': df_climateType['New Country Name']})
    .drop(columns='New Country Name')
)



In [9]:
# Preview the first five rows of the cleaned frame.
df_climateType.head(n=5)

Unnamed: 0,Country Name,Climate Type,Average Temperature (C)
0,Afghanistan,DF,15.7
1,Albania,CW,15.17
2,Algeria,BW,20.0
5,Angola,AW,24.37
8,Argentina,CF,18.24


In [10]:
# Summarise the object (text) columns within each 'Climate Type' group:
# count, unique, top and freq per group.
climate_groups = df_climateType.groupby('Climate Type')
climate_groups.describe(include=['object'])

Unnamed: 0_level_0,Country Name,Country Name,Country Name,Country Name
Unnamed: 0_level_1,count,unique,top,freq
Climate Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AF,22,22,Brunei Darussalam,1
AM,17,17,Barbados,1
AW,29,29,Angola,1
BS,16,16,Azerbaijan,1
BW,26,25,Mali,2
CF,29,29,Argentina,1
CW,20,20,Albania,1
DF,25,25,Afghanistan,1


In [11]:
# Save the cleaned frame without the index column.
# Forward slashes work on every OS; backslashes inside a normal string literal are
# parsed as escape sequences and only resolve as path separators on Windows.
df_climateType.to_csv('../Data_Sets/processed/addData_climateData.csv', index=False)