## Weather Web Scraper
### by Kola Ademola.

In [1]:
#import neccesary libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests as rq
from datetime import datetime

In [2]:
#load dataset
countries = pd.read_csv('west_africa.csv')

#view dataset
countries

Unnamed: 0,country,capital
0,Benin,Porto-Novo
1,Burkina-Faso,Ouagadougou
2,Cape-Verde,Praia
3,Cote-Divoire,Yamoussoukro
4,Gambia,Banjul
5,Ghana,Accra
6,Guinea,Conakry
7,Guinea-Bissau,Bissau
8,Liberia,Monrovia
9,Mali,Bamako


In [3]:
#create empty lists to hold scraped data
weathers = []
temps = []
location = []

#loop through the countries and get their weather data
for index, row in countries.iterrows():
    country = row['country']
    capital = row['capital']
        
    url = 'https://www.timeanddate.com/weather/{}/{}'.format(country, capital)
    page = rq.get(url)
    
    soup = bs(page.content, 'html.parser')
    
    temp = soup.find('div', class_ = 'h2')
    
    weather = soup.find('p')
    
    if  weather is not None and temp is not None:
        location.append(country)
        weathers.append(weather.text)
        temps.append(temp.text)
    else:
        temps.append("N/A")
        location.append(country)
        weathers.append("N/A")

In [4]:
# get current date
current_date = datetime.now().date()

#format date from string to correct data type
formatted_date = current_date.strftime('%Y-%m-%d')

#load and fill dataframe with the data
data = {'Country': location, 'Current_Weather': weathers, 'Current_Temperature(F)': temps, 'Date' : formatted_date}

#create dataframe to hold the data
df = pd.DataFrame(data)

#view weather scraped data
df

Unnamed: 0,Country,Current_Weather,Current_Temperature(F),Date
0,Benin,Passing clouds.,90 °F,2024-02-29
1,Burkina-Faso,Passing clouds.,88 °F,2024-02-29
2,Cape-Verde,Passing clouds.,79 °F,2024-02-29
3,Cote-Divoire,Partly sunny.,80 °F,2024-02-29
4,Gambia,Clear.,84 °F,2024-02-29
5,Ghana,Passing clouds.,91 °F,2024-02-29
6,Guinea,Haze.,90 °F,2024-02-29
7,Guinea-Bissau,Duststorm.,95 °F,2024-02-29
8,Liberia,Thunderstorms. Partly sunny.,77 °F,2024-02-29
9,Mali,Passing clouds.,95 °F,2024-02-29


* The temperature column will need to be formatted / cleaned to hold just the numeric value.

In [5]:
#duplicating original dataset before cleaning

weather_df = df.copy()

In [6]:
#removing the "degree" from the temperature column

temp = weather_df['Current_Temperature(F)'].str.split(expand = True)

#assign the temperatures to the main dataset
weather_df['Current_Temperature(F)'] = temp[0].astype(int)

#change the Date column to the right data type
weather_df['Date'] = pd.to_datetime(weather_df['Date'], format='%Y-%m-%d')

#view data
weather_df

Unnamed: 0,Country,Current_Weather,Current_Temperature(F),Date
0,Benin,Passing clouds.,90,2024-02-29
1,Burkina-Faso,Passing clouds.,88,2024-02-29
2,Cape-Verde,Passing clouds.,79,2024-02-29
3,Cote-Divoire,Partly sunny.,80,2024-02-29
4,Gambia,Clear.,84,2024-02-29
5,Ghana,Passing clouds.,91,2024-02-29
6,Guinea,Haze.,90,2024-02-29
7,Guinea-Bissau,Duststorm.,95,2024-02-29
8,Liberia,Thunderstorms. Partly sunny.,77,2024-02-29
9,Mali,Passing clouds.,95,2024-02-29


In [7]:
#check data types
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country                 16 non-null     object        
 1   Current_Weather         16 non-null     object        
 2   Current_Temperature(F)  16 non-null     int64         
 3   Date                    16 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 640.0+ bytes


In [8]:
#saving data to csv file for the first run of scraping
#weather_df.to_csv('west_african_weather.csv', index = False)

In [9]:
# load the existing weather DataFrame
existing_weather_df = pd.read_csv('west_african_weather.csv', parse_dates = ['Date'])

#format current date to be used in if condition to check if data already exist for current date
formatted_date = pd.to_datetime(formatted_date, format = '%Y-%m-%d')

# get available dates from dataset
date_list = existing_weather_df['Date'].unique().tolist()

# Convert timestamps to datetime format
date_list = pd.to_datetime(date_list, unit='ns')

# Format the dates without time stamp
date_list = [d.strftime('%Y-%m-%d') for d in date_list]

# Check if date already exists in DataFrame
if formatted_date in date_list:
    combined_weather_df = existing_weather_df
else:
    # concatenate the new DataFrame with the existing DataFrame
    combined_weather_df = pd.concat([existing_weather_df, weather_df], ignore_index = True)

#change the Date column to the right data type
combined_weather_df['Date'] = pd.to_datetime(combined_weather_df['Date'])

#confirm new data
combined_weather_df.tail(16)

Unnamed: 0,Country,Current_Weather,Current_Temperature(F),Date
5264,Benin,Passing clouds.,90,2024-02-29
5265,Burkina-Faso,Passing clouds.,88,2024-02-29
5266,Cape-Verde,Passing clouds.,79,2024-02-29
5267,Cote-Divoire,Partly sunny.,80,2024-02-29
5268,Gambia,Clear.,84,2024-02-29
5269,Ghana,Passing clouds.,91,2024-02-29
5270,Guinea,Haze.,90,2024-02-29
5271,Guinea-Bissau,Duststorm.,95,2024-02-29
5272,Liberia,Thunderstorms. Partly sunny.,77,2024-02-29
5273,Mali,Passing clouds.,95,2024-02-29


In [10]:
# save the combined DataFrame to a CSV file
combined_weather_df.to_csv('west_african_weather.csv', index = False)