## Weather Web Scraper
### by Kola Ademola

In [53]:
# import necessary libraries

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import requests as rq
from datetime import datetime

In [54]:
# Read the list of West African countries and their capitals from disk
countries = pd.read_csv(filepath_or_buffer='west_africa.csv')

# Show the loaded table as the cell output
countries

Unnamed: 0,country,capital
0,Benin,Porto-Novo
1,Burkina-Faso,Ouagadougou
2,Cameroon,Yaounde
3,Cape-Verde,Praia
4,Gambia,Banjul
5,Ghana,Accra
6,Guinea,Conakry
7,Guinea-Bissau,Bissau
8,Liberia,Monrovia
9,Mali,Bamako


In [55]:
# Seconds to wait on a single HTTP request before giving up
REQUEST_TIMEOUT = 10


def scrape_current_weather(country, capital, timeout=REQUEST_TIMEOUT):
    """Fetch the current temperature and weather text for one capital city.

    Parameters
    ----------
    country, capital : str
        Inserted verbatim into the timeanddate.com weather URL path.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot
        hang the whole notebook run.

    Returns
    -------
    tuple of (str, str)
        ``(temperature_text, weather_text)`` exactly as they appear on the page.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    ValueError
        If the page lacks the ``div.h2`` or ``p`` element the scraper reads.
    """
    url = 'https://www.timeanddate.com/weather/{}/{}'.format(country, capital)
    page = rq.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to parse an error page
    page.raise_for_status()

    soup = bs(page.content, 'html.parser')

    temp = soup.find('div', class_ = 'h2')
    # NOTE(review): this takes the first <p> on the page; confirm it is still
    # the weather description if the site layout changes.
    weather = soup.find('p')
    if temp is None or weather is None:
        raise ValueError('weather markup not found at {}'.format(url))
    return temp.text, weather.text


# Parallel lists holding the scraped data (one entry per successfully scraped country)
weathers = []
temps = []
location = []

# Loop through the countries and get their weather data
for country, capital in zip(countries['country'], countries['capital']):
    try:
        temp_text, weather_text = scrape_current_weather(country, capital)
    except (rq.RequestException, ValueError) as err:
        # Report and skip this country; nothing has been appended yet,
        # so the three lists stay the same length.
        print('Skipping {}: {}'.format(country, err))
        continue

    location.append(country)
    weathers.append(weather_text)
    temps.append(temp_text)

In [56]:
# Date of this scraping run (date part only, no time of day)
current_date = datetime.now().date()

# Render the date as a 'YYYY-MM-DD' string
formatted_date = current_date.strftime('%Y-%m-%d')

# Column name -> values; the three lists come from the scraping loop and
# the single date string is shared by every row
data = {
    'Country': location,
    'Current_Weather': weathers,
    'Current_Temperature(c)': temps,
    'Date': formatted_date,
}

# Assemble the raw (uncleaned) results into a DataFrame
df = pd.DataFrame(data=data)

# Display the raw results
df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Passing clouds.,32 °C,2023-04-06
1,Burkina-Faso,Haze.,29 °C,2023-04-06
2,Cameroon,Thunderstorms. Partly sunny.,32 °C,2023-04-06
3,Cape-Verde,Partly sunny.,25 °C,2023-04-06
4,Gambia,Clear.,35 °C,2023-04-06
5,Ghana,Passing clouds.,32 °C,2023-04-06
6,Guinea,Sunny.,32 °C,2023-04-06
7,Guinea-Bissau,Clear.,36 °C,2023-04-06
8,Liberia,Partly sunny.,33 °C,2023-04-06
9,Mali,Passing clouds.,38 °C,2023-04-06


* The temperature column will need to be formatted / cleaned to hold just the numeric value.

In [57]:
# Work on an independent copy so the raw scraped frame `df` stays untouched during cleaning
weather_df = df.copy(deep=True)

In [58]:
# Clean the raw scraped frame into `weather_df`.
# Everything is derived from the untouched `df` (never from `weather_df` itself),
# so this cell can be re-run any number of times with the same result.

# Keep only the numeric part of strings such as "32 °C":
# split on whitespace, take the first token, convert to integer
temperature_c = (
    df['Current_Temperature(c)']
    .str.split()
    .str[0]
    .astype(int)
)

# Replace the temperature strings with integers and parse the 'YYYY-MM-DD'
# date strings into datetimes; column order is unchanged
weather_df = df.assign(**{
    'Current_Temperature(c)': temperature_c,
    'Date': pd.to_datetime(df['Date'], format='%Y-%m-%d'),
})

# view data
weather_df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Passing clouds.,32,2023-04-06
1,Burkina-Faso,Haze.,29,2023-04-06
2,Cameroon,Thunderstorms. Partly sunny.,32,2023-04-06
3,Cape-Verde,Partly sunny.,25,2023-04-06
4,Gambia,Clear.,35,2023-04-06
5,Ghana,Passing clouds.,32,2023-04-06
6,Guinea,Sunny.,32,2023-04-06
7,Guinea-Bissau,Clear.,36,2023-04-06
8,Liberia,Partly sunny.,33,2023-04-06
9,Mali,Passing clouds.,38,2023-04-06


In [59]:
# Print a summary of weather_df (row count, non-null counts and dtype per column)
# to check that the temperature column is an integer type and Date is a datetime type
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country                 16 non-null     object        
 1   Current_Weather         16 non-null     object        
 2   Current_Temperature(c)  16 non-null     int32         
 3   Date                    16 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(2)
memory usage: 576.0+ bytes


In [60]:
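# One-time bootstrap: on the very first scraping run, uncomment the line below to
# create 'west_african_weather.csv' from the cleaned data; leave it commented afterwards.
#weather_df.to_csv('west_african_weather.csv', index = False)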

In [61]:
# Merge today's cleaned scrape (`weather_df`) into the history stored on disk.
WEATHER_CSV = 'west_african_weather.csv'

# Load the existing history; on the very first run the file does not exist yet,
# so start from an empty frame with the same columns as `weather_df`
try:
    existing_weather_df = pd.read_csv(WEATHER_CSV, parse_dates = ['Date'])
except FileNotFoundError:
    existing_weather_df = weather_df.iloc[:0].copy()

if existing_weather_df.empty:
    # Nothing stored yet: the history is just today's data
    combined_weather_df = weather_df.copy()
else:
    # Add only rows whose (Country, Date) pair is not already stored. Re-running
    # on the same day adds nothing, while a country missing from an earlier
    # partial run of the same day can still be filled in.
    existing_keys = set(zip(existing_weather_df['Country'], existing_weather_df['Date']))
    is_new = [
        key not in existing_keys
        for key in zip(weather_df['Country'], weather_df['Date'])
    ]
    new_rows = weather_df.loc[is_new]

    if new_rows.empty:
        combined_weather_df = existing_weather_df
    else:
        # concatenate the new rows with the existing DataFrame
        combined_weather_df = pd.concat([existing_weather_df, new_rows], ignore_index=True)

# Ensure the Date column has a datetime dtype regardless of which branch ran
combined_weather_df['Date'] = pd.to_datetime(combined_weather_df['Date'])

# confirm new data
combined_weather_df

Unnamed: 0,Country,Current_Weather,Current_Temperature(c),Date
0,Benin,Thunderstorms. Passing clouds.,27,2023-04-04
1,Burkina-Faso,Scattered clouds.,31,2023-04-04
2,Cameroon,Passing clouds.,25,2023-04-04
3,Cape-Verde,Passing clouds.,23,2023-04-04
4,Gambia,Clear.,24,2023-04-04
5,Ghana,Passing clouds.,27,2023-04-04
6,Guinea,Passing clouds.,28,2023-04-04
7,Guinea-Bissau,Clear.,23,2023-04-04
8,Liberia,Passing clouds.,24,2023-04-04
9,Mali,Clear.,25,2023-04-04


In [62]:
# Print a summary of combined_weather_df (row count, non-null counts and dtype per column)
# to check the column types after combining stored and newly scraped data
combined_weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country                 32 non-null     object        
 1   Current_Weather         32 non-null     object        
 2   Current_Temperature(c)  32 non-null     int64         
 3   Date                    32 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 1.1+ KB


In [63]:
# Write the combined history back to the CSV file, without the row index column
combined_weather_df.to_csv(path_or_buf='west_african_weather.csv', index=False)