# Scraping weather data 

### The role of this notebook is to download weather data for the 100 largest cities in Poland. 

### Notebook configuration 

In [62]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from dotenv import  load_dotenv
import os
import csv
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.service import Service as FirefoxService
import time
from datetime import datetime

### Loading API key from .env file

In [63]:
# Raw string: the Windows path contains backslashes that a normal string
# literal would try to interpret as escape sequences (invalid ones here, which
# recent Python versions warn about).
load_dotenv(r"D:\Python\devcontainer\.env")  # loading API key from .env file

True

In [64]:
# OpenWeatherMap API key, read from the environment (None when it is not set).
APIKEY_OWM = os.environ.get("OWM_API_KEY")

### Downloading the city names of the 100 largest cities in Poland

In [65]:
# Page whose statistics table lists the city names that are scraped below.
url_city_name = (
    'https://www.jetpunk.com/user-quizzes/1444759/'
    '100-najwiekszych-miast-w-polsce/stats'
)

In [66]:
# Download the page with the city table. The timeout keeps the cell from
# hanging indefinitely, and raise_for_status() stops the run on an HTTP error
# instead of silently parsing an error page.
web = requests.get(url_city_name, timeout=30)
web.raise_for_status()
soup = BeautifulSoup(web.text, 'html.parser')

In [67]:
# All rows of the statistics table except the first one (the header row).
table_rows = soup.select('table.super-table tr')
rows = table_rows[1:]

In [68]:
# The city name is the text of the second cell of each row; rows with fewer
# than two cells are skipped.
cities = [
    cells[1].text.strip()
    for cells in (table_row.find_all('td') for table_row in rows)
    if len(cells) > 1
]

In [69]:

# Store the scraped names as a one-column CSV with a header row.
with open('cities.csv', 'w', newline='', encoding='utf-8') as csv_file:
    city_writer = csv.writer(csv_file)
    city_writer.writerow(['City Name'])
    city_writer.writerows([name] for name in cities)

### Downloading coordinates of cities using Selenium

In [70]:
# Two independent lists filled by the scraping loop: the first and the second
# number of each city's decimal coordinate pair.
cities_coordinates_width, cities_coordinates_length = [], []

In [71]:
# Read the city names back from cities.csv.
# The file is produced by csv.writer, so it is parsed with csv.reader as well.
# Only the first column is used: a later cell rewrites cities.csv with extra
# columns, and reading whole lines would then return "name,width,length,id"
# as the city name when this cell is re-run.
cities_names_csv = []

with open('cities.csv', 'r', newline='', encoding='utf-8') as cities_names:
    reader = csv.reader(cities_names)
    next(reader, None)  # skip the header row; an empty file yields no names
    for record in reader:
        # Blank lines come back as empty records and carry no city name.
        if record and record[0].strip():
            cities_names_csv.append(record[0].strip())

Downloading the data can take about 13 minutes because the website responds slowly and the loop pauses for several seconds on every city.

In [72]:
# Look up every city on bazamiejscowosci.pl with a Selenium-driven Firefox and
# collect the two numbers of its decimal coordinate pair.
service = FirefoxService(r'.\geckodriver.exe')
browser = Firefox(service=service)

# Start from empty lists so that re-running this cell does not append a second
# copy of every coordinate (the column assignment that follows needs exactly
# one value per city).
cities_coordinates_width.clear()
cities_coordinates_length.clear()

try:
    for city in cities_names_csv:
        # Each search starts from a freshly loaded start page.
        browser.get('https://bazamiejscowosci.pl')

        search_box = browser.find_element('id', 'map-search')
        search_box.click()
        search_box.send_keys(city)

        # Fixed pause before picking the first suggestion from the list.
        time.sleep(3)

        first_suggestion = browser.find_element('class name', 'ui-menu-item')
        first_suggestion.click()

        # Fixed pause before reading the coordinates of the selected place.
        time.sleep(2)

        coordinates_text = browser.find_element('id', 'city-decimal').text
        coordinates = coordinates_text.split(', ')
        if len(coordinates) < 2:
            raise ValueError(
                f'Could not read coordinates for {city!r}: {coordinates_text!r}'
            )

        # First number -> "width", second number -> "length"; the download
        # loop later passes them to the API as latitude and longitude.
        cities_coordinates_width.append(coordinates[0])
        cities_coordinates_length.append(coordinates[1])

        time.sleep(2)
finally:
    # Always close the browser, even when a lookup fails half-way through.
    browser.quit()

### Saving a file with coordinates of cities

In [73]:
# Load the city list that the coordinates are attached to.
df_coordinates = pd.read_csv(filepath_or_buffer='cities.csv')

In [74]:
# Attach the scraped coordinates and an id taken from the frame's index, then
# overwrite cities.csv with the enriched table.
df_coordinates = df_coordinates.assign(
    width=cities_coordinates_width,
    length=cities_coordinates_length,
    **{'City ID': df_coordinates.index},
)
df_coordinates.to_csv('cities.csv', index=False, encoding='utf-8')

### Downloading weather data using the API

In [75]:
def get_data(lat, lon, APIKEY_OWM, timeout=30):
    """Request the OpenWeatherMap forecast for one location.

    Parameters
    ----------
    lat, lon
        Latitude and longitude sent as the ``lat`` / ``lon`` query parameters.
    APIKEY_OWM
        API key sent as the ``appid`` query parameter.
    timeout
        Seconds to wait for the server before giving up.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (for example a rejected key).
    requests.Timeout
        If no answer arrives within ``timeout`` seconds.
    """
    # Passing the query as `params` lets requests do the URL encoding instead
    # of building the query string by hand.
    response = requests.get(
        'https://api.openweathermap.org/data/2.5/forecast',
        params={
            'lat': lat,
            'lon': lon,
            'exclude': 'current',
            'appid': APIKEY_OWM,
            'units': 'metric',
        },
        timeout=timeout,
    )
    # Fail here with a clear HTTP error rather than later with a KeyError on
    # an error payload.
    response.raise_for_status()
    return response.json()

In [76]:
# Empty frame that receives the forecast rows; one column per stored field.
weather_columns = [
    'City ID',
    'Date',
    'Time',
    'Temperature',
    'Pressure',
    'Humidity',
    'main',
    'description',
    'Clouds',
    'speed',
    'deg',
    'Rain',
]
weather_df = pd.DataFrame(columns=weather_columns)

In [77]:
def data_to_pandas(data, df, city_id, przelicznik, rows_per_city=40):
    """Copy the forecast entries of one API response into ``df`` (in place).

    Parameters
    ----------
    data
        Decoded forecast response; its ``'list'`` key holds the entries.
    df
        Frame to fill. It must have the columns 'City ID', 'Date', 'Time',
        'Temperature', 'Pressure', 'Humidity', 'main', 'description',
        'Clouds', 'speed', 'deg' and 'Rain'.
    city_id
        Value written to the 'City ID' column of every row of this city.
    przelicznik
        Zero-based block number of the city: its rows are written starting at
        row label ``rows_per_city * przelicznik``.
    rows_per_city
        Size of the row block reserved for each city; at most this many
        entries are stored so that blocks of different cities never overlap.

    Entries without a ``rain -> 3h`` value get the text 'No rain' in the
    'Rain' column.
    """
    # Local import: only `datetime` itself is imported at the top of the notebook.
    from datetime import timezone

    # Iterate over the entries that are actually present instead of assuming
    # exactly 40, so a shorter response does not raise IndexError.
    entries = data['list'][:rows_per_city]
    first_row = rows_per_city * przelicznik

    for step, entry in enumerate(entries):
        row = first_row + step

        # Date and time (UTC), derived from the entry's Unix timestamp
        moment = datetime.fromtimestamp(entry['dt'], tz=timezone.utc)
        df.loc[row, ['Date', 'Time']] = [moment.strftime('%Y-%m-%d'), moment.strftime('%H:%M:%S')]

        # Temperature, pressure, humidity
        info = entry['main']
        df.loc[row, ['Temperature', 'Pressure', 'Humidity']] = [info['temp'], info['pressure'], info['humidity']]

        # General condition and its description
        for category in ['main', 'description']:
            df.loc[row, [category]] = entry['weather'][0][category]

        # Clouds
        df.loc[row, ['Clouds']] = entry['clouds']['all']

        # Wind speed and direction
        for category in ['speed', 'deg']:
            df.loc[row, [category]] = entry['wind'][category]

        # Rain: the key is absent from entries that report no rain
        try:
            rain = entry['rain']['3h']
        except KeyError:
            rain = 'No rain'
        df.loc[row, ['Rain']] = rain

        df.loc[row, ['City ID']] = city_id
    

In [78]:
# Reload the city table (names, coordinates and ids) from disk.
cities_data = pd.read_csv(filepath_or_buffer='cities.csv')

In [79]:
# Download the forecast of every city in cities.csv and store it in weather_df.
# The loop runs over the rows that are actually present instead of a fixed
# range(0, 100), so a shorter city list does not fail with a KeyError.
for position, city in enumerate(cities_data.to_dict('records')):
    cities_raw_data = get_data(
        lat=float(city['width']),
        lon=float(city['length']),
        APIKEY_OWM=APIKEY_OWM,
    )

    # The id comes from the 'City ID' column because that column is the key of
    # the merge with cities_data; `position` selects the block of rows in
    # weather_df that belongs to this city.
    data_to_pandas(
        data=cities_raw_data,
        df=weather_df,
        city_id=int(city['City ID']),
        przelicznik=position,
    )


### Merging dataframes 

In [80]:
# Left join: keep every forecast row and attach the columns of the matching city.
merged_weather_df = weather_df.merge(cities_data, how='left', on='City ID')

### Saving the merged weather data file

In [81]:
# Write the merged table to disk without the frame's index column.
merged_weather_df.to_csv(path_or_buf='weather_data.csv', index=False)