In [1]:
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from sqlalchemy import create_engine
import json

In [3]:
# Wind direction should be X - 180 deg where X is the rotation in the html file
with open('Windguru.html','r') as f:
    wind_raw = f.read()

In [4]:
wind_raw = BeautifulSoup(wind_raw, 'html')

In [5]:
# Define the regex pattern
pattern = r'\)">(.*?)<td>'
result = re.findall(pattern, str(wind_raw.contents), re.IGNORECASE | re.DOTALL)
result[20]

'<path d="m50,0 -20,30 16,-3 -3,63 14,0 -3,-63 16,3 -20,-30z" fill="#000" stroke-width="0"></path></g></svg></td>'

In [6]:
wind_raw = wind_raw.find_all('td')[80:-42]

In [8]:
dates = []
wind_data = []
wind_dir_data = []
wind_gust_data = []
temperature_data = []
precipitation_data = []
cloud_cover_data = []

# Initialize a variable to keep track of the current index
current_index = 0

# Iterate through the data list
while current_index < len(wind_raw):
    # Extract the date and add it to the 'dates' list
    dates.append(wind_raw[current_index])
    current_index += 1

    # Extract data for wind, wind direction, wind gust, temperature, precipitation, and cloud cover
    wind_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12
    wind_dir_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12
    wind_gust_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12
    temperature_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12
    precipitation_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12
    cloud_cover_data.extend(wind_raw[current_index:current_index + 12])
    current_index += 12

# Now you have separate lists for each category


In [9]:
wind_data

[<td style="background-color:rgba(181,251,248,1)">7</td>,
 <td style="background-color:rgba(185,251,249,1)">7</td>,
 <td style="background-color:rgba(130,248,244,1)">8</td>,
 <td style="background-color:rgba(119,248,242,1)">9</td>,
 <td style="background-color:rgba(99,247,231,1)">9</td>,
 <td style="background-color:rgba(90,248,210,1)">10</td>,
 <td style="background-color:rgba(107,247,241,1)">9</td>,
 <td style="background-color:rgba(96,248,225,1)">9</td>,
 <td style="background-color:rgba(78,249,183,1)">10</td>,
 <td style="background-color:rgba(96,248,225,1)">9</td>,
 <td style="background-color:rgba(193,252,249,1)">7</td>,
 <td style="background-color:rgba(197,252,250,1)">7</td>,
 <td style="background-color:rgba(212,253,251,1)">6</td>,
 <td style="background-color:rgba(204,252,250,1)">6</td>,
 <td style="background-color:rgba(169,250,247,1)">7</td>,
 <td style="background-color:rgba(130,248,244,1)">8</td>,
 <td style="background-color:rgba(193,252,249,1)">7</td>,
 <td style="backg

In [10]:
# Create a list comprehension to extract dates
wind_data = [td.get_text() for td in wind_data]
wind_gust_data = [td.get_text() for td in wind_gust_data]
temperature_data = [td.get_text() for td in temperature_data]
precipitation_data = [td.get_text() for td in precipitation_data]
cloud_cover_data = [td.get_text() for td in cloud_cover_data]
dates = [element.find('b').get_text() if element.find('b') else None for element in dates]
wind_dir_data = [
    int(element.find('g')['transform'].split('(')[1].split(',')[0])
    if element.find('g')
    else None
    for element in wind_dir_data
]
# Create a list of datetime objects by parsing the date strings
dates = [datetime.strptime(date_text, '%d.%m.%Y') for date_text in dates]

In [11]:
# Initialize an empty list to store the result
dates_hours = []

# Iterate through the list of dates
for date in dates:
    # Generate 12 items per day, each with a label like "day1 2h," "day1 4h," and so on
    for hour_offset in range(1, 13):
        # Calculate the hour for this 2-hour interval
        hour = 2 * hour_offset
        # Create the label and append it to the result list
        dates_hours.append(date + pd.DateOffset(hours=hour))

In [12]:
# Create a DataFrame from the lists
df = pd.DataFrame({
    'Wind': wind_data,
    'Wind_Gust': wind_gust_data,
    'Temperature': temperature_data,
    'Precipitation': precipitation_data,
    'Cloud_Cover': cloud_cover_data,
    'Datetime': dates_hours,  # Repeat dates for each 2-hour interval
    'Wind_Direction': wind_dir_data
})

# Deal with nan values
df.loc[df['Precipitation'] == '\xa0', 'Precipitation'] = 0
df.loc[df['Wind'] == ' - ', 'Wind'] = np.nan
df.loc[df['Wind_Gust'] == ' - ', 'Wind_Gust'] = np.nan
df.loc[df['Temperature'] == ' - ', 'Temperature'] = np.nan
df.loc[df['Precipitation'] == ' - ', 'Precipitation'] = np.nan
df.loc[df['Cloud_Cover'] == '\xa0', 'Cloud_Cover'] = 0
df.loc[df['Cloud_Cover'] == ' - ', 'Cloud_Cover'] = np.nan

# Cast to correct types
df["Wind"] = df["Wind"].astype(float)
df["Wind_Gust"] = df["Wind_Gust"].astype(float)
df["Temperature"] = df["Temperature"].astype(float)
df["Precipitation"] = df["Precipitation"].astype(float)
df["Cloud_Cover"] = df["Cloud_Cover"].astype(float)

# Adjustment for wind direction
df['Wind_Direction'] = df['Wind_Direction'] - 180

# Extract Month
df['Month'] = df['Datetime'].dt.month
df['Hour'] = df['Datetime'].dt.hour

# Define wind direction bins
bins = [0, 45, 90, 135, 180, 225, 270, 315, 360]
labels = ['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW']

# Bin wind directions 
df['WindDirBin'] = pd.cut(df['Wind_Direction'], bins=bins, labels=labels)

df

Unnamed: 0,Wind,Wind_Gust,Temperature,Precipitation,Cloud_Cover,Datetime,Wind_Direction,Month,Hour,WindDirBin
0,7.0,12.0,2.0,0.0,97.0,2021-01-01 02:00:00,191.0,1,2,S
1,7.0,11.0,2.0,0.0,26.0,2021-01-01 04:00:00,184.0,1,4,S
2,8.0,12.0,2.0,0.0,100.0,2021-01-01 06:00:00,171.0,1,6,SE
3,9.0,13.0,2.0,0.0,100.0,2021-01-01 08:00:00,166.0,1,8,SE
4,9.0,14.0,2.0,0.0,100.0,2021-01-01 10:00:00,164.0,1,10,SE
...,...,...,...,...,...,...,...,...,...,...
12019,7.0,15.0,19.0,0.0,100.0,2023-10-02 16:00:00,234.0,10,16,SW
12020,8.0,17.0,19.0,0.2,100.0,2023-10-02 18:00:00,229.0,10,18,SW
12021,8.0,18.0,18.0,0.0,100.0,2023-10-02 20:00:00,232.0,10,20,SW
12022,8.0,19.0,18.0,0.0,100.0,2023-10-02 22:00:00,223.0,10,22,S


In [13]:
# Specify the target date
target_date = pd.to_datetime('2021-1-25')
end_date = pd.to_datetime('2021-1-27')

# Filter the DataFrame for the specific day
df[(df['Datetime'].dt.date > target_date.date() )& (df['Datetime'].dt.date < end_date.date())]

Unnamed: 0,Wind,Wind_Gust,Temperature,Precipitation,Cloud_Cover,Datetime,Wind_Direction,Month,Hour,WindDirBin
299,9.0,14.0,1.0,0.0,100.0,2021-01-26 00:00:00,272.0,1,0,W
300,12.0,16.0,2.0,0.0,100.0,2021-01-26 02:00:00,287.0,1,2,W
301,9.0,15.0,1.0,0.0,100.0,2021-01-26 04:00:00,280.0,1,4,W
302,11.0,17.0,1.0,0.0,100.0,2021-01-26 06:00:00,272.0,1,6,W
303,10.0,15.0,1.0,0.1,100.0,2021-01-26 08:00:00,283.0,1,8,W
304,8.0,14.0,1.0,0.1,92.0,2021-01-26 10:00:00,295.0,1,10,W
305,10.0,16.0,2.0,0.0,99.0,2021-01-26 12:00:00,309.0,1,12,W
306,13.0,18.0,2.0,0.0,98.0,2021-01-26 14:00:00,312.0,1,14,W
307,14.0,19.0,2.0,0.0,100.0,2021-01-26 16:00:00,308.0,1,16,W
308,14.0,21.0,2.0,0.0,100.0,2021-01-26 18:00:00,301.0,1,18,W


In [204]:
# Read the MySQL configuration from the JSON file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

In [205]:
# Extract MySQL connection details
mysql_config = config.get('mysql', {})
username = mysql_config.get('username', 'default_username')
password = mysql_config.get('password', 'default_password')
host = mysql_config.get('host', 'localhost')
database_name = mysql_config.get('database_name', 'your_database')
table_name = 'historical_forecast'

In [206]:
# Create the MySQL database connection string
db_url = f"mysql+mysqlconnector://{username}:{password}@{host}/{database_name}"

# Create an SQLAlchemy engine
engine = create_engine(db_url)

# Use the engine to connect to the database
connection = engine.connect()

In [207]:
df.to_sql(table_name, connection, if_exists='replace', index=False)

12024