## Weather Web Scraper
### by Kola Ademola.

In [1]:
#import neccesary libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests as rq
from datetime import datetime

In [2]:
#load dataset
countries = pd.read_csv('west_africa.csv')

#view dataset
countries

Unnamed: 0,country,capital
0,Benin,Porto-Novo
1,Burkina-Faso,Ouagadougou
2,Cameroon,Yaounde
3,Cape-Verde,Praia
4,Gambia,Banjul
5,Ghana,Accra
6,Guinea,Conakry
7,Guinea-Bissau,Bissau
8,Liberia,Monrovia
9,Mali,Bamako


In [4]:
#create empty lists to hold scraped data
weathers = []
temps = []
location = []

#loop through the countries and get their weather data
for index, row in countries.iterrows():
    country = row['country']
    capital = row['capital']
        
    url = 'https://www.timeanddate.com/weather/{}/{}?unit=c'.format(country, capital)
    page = rq.get(url)
    
    soup = bs(page.content, 'html.parser')

    temp = soup.find('div', class_ = 'h2')
    weather = soup.find('p')
    location.append(country)
    weathers.append(weather.text)
    temps.append(temp.text)

In [5]:
# get current date
current_date = datetime.now().date()

#format date from string to correct data type
formatted_date = current_date.strftime('%Y-%m-%d')

#load and fill dataframe with the data
data = {'Country': location, 'Current_Weather': weathers, 'Current_Temperature(c)': temps, 'Date' : formatted_date}

#create dataframe to hold the data
df = pd.DataFrame(data)

#view scraped data
df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Scattered clouds.,32 °C,2023-04-07
1,Burkina-Faso,Passing clouds.,30 °C,2023-04-07
2,Cameroon,Overcast.,33 °C,2023-04-07
3,Cape-Verde,Passing clouds.,27 °C,2023-04-07
4,Gambia,Sunny.,33 °C,2023-04-07
5,Ghana,Scattered clouds.,31 °C,2023-04-07
6,Guinea,Sunny.,32 °C,2023-04-07
7,Guinea-Bissau,Clear.,35 °C,2023-04-07
8,Liberia,Thunderstorms. Partly sunny.,26 °C,2023-04-07
9,Mali,Passing clouds.,38 °C,2023-04-07


* The temperature column will need to be formatted / cleaned to hold just the numeric value.

In [5]:
#duplicating original dataset before cleaning

weather_df = df.copy()

In [6]:
#removing the "degree celcius" from the temperature column

temp = weather_df['Current_Temperature(c)'].str.split(expand = True)

#assign the temperatures to the main dataset
weather_df['Current_Temperature(c)'] = temp[0].astype(int)

#change the Date column to the right data type
weather_df['Date'] = pd.to_datetime(weather_df['Date'], format='%Y-%m-%d')

#view data
weather_df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Scattered clouds.,90,2023-04-07
1,Burkina-Faso,Passing clouds.,85,2023-04-07
2,Cameroon,Overcast.,91,2023-04-07
3,Cape-Verde,Passing clouds.,80,2023-04-07
4,Gambia,Sunny.,91,2023-04-07
5,Ghana,Scattered clouds.,88,2023-04-07
6,Guinea,Sunny.,90,2023-04-07
7,Guinea-Bissau,Clear.,95,2023-04-07
8,Liberia,Thunderstorms. Partly sunny.,79,2023-04-07
9,Mali,Passing clouds.,100,2023-04-07


In [7]:
#check data types
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country                 16 non-null     object        
 1   Current_Weather         16 non-null     object        
 2   Current_Temperature(c)  16 non-null     int64         
 3   Date                    16 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 640.0+ bytes


In [8]:
#saving data to csv file for the first run of scraping
#weather_df.to_csv('west_african_weather.csv', index = False)

In [9]:
# load the existing weather DataFrame
existing_weather_df = pd.read_csv('west_african_weather.csv', parse_dates = ['Date'])

#format current date to be used in if condition to check if data already exist for current date
formatted_date = pd.to_datetime(formatted_date, format='%Y-%m-%d')

# Check if date already exists in DataFrame
if formatted_date in existing_weather_df['Date'].values:
    combined_weather_df = existing_weather_df
else:
    # concatenate the new DataFrame with the existing DataFrame
    combined_weather_df = pd.concat([existing_weather_df, weather_df], ignore_index=True)

#change the Date column to the right data type
combined_weather_df['Date'] = pd.to_datetime(combined_weather_df['Date'])

#confirm new data
combined_weather_df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Thunderstorms. Passing clouds.,27,2023-04-04
1,Burkina-Faso,Scattered clouds.,31,2023-04-04
2,Cameroon,Passing clouds.,25,2023-04-04
3,Cape-Verde,Passing clouds.,23,2023-04-04
4,Gambia,Clear.,24,2023-04-04
5,Ghana,Passing clouds.,27,2023-04-04
6,Guinea,Passing clouds.,28,2023-04-04
7,Guinea-Bissau,Clear.,23,2023-04-04
8,Liberia,Passing clouds.,24,2023-04-04
9,Mali,Clear.,25,2023-04-04


In [10]:
#check data types
combined_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country                 48 non-null     object        
 1   Current_Weather         48 non-null     object        
 2   Current_Temperature(c)  48 non-null     int64         
 3   Date                    48 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 1.6+ KB


In [11]:
# save the combined DataFrame to a CSV file
combined_weather_df.to_csv('west_african_weather.csv', index=False)