In [21]:
import datetime
import json
import os
import warnings
from urllib.parse import quote

import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sqlalchemy import create_engine

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# Working directory of the kernel; config.json is resolved relative to it
BASE_DIR = os.getcwd()

# 1. Load Data 

In [8]:
class DataPipeline():
    """Load forecast and station measurement tables from MySQL and merge them.

    Usage: construct with a station name, call ``get_train_data()`` to fetch
    the raw tables, then ``transform()`` to build the merged frame ``self.df``.
    """

    def __init__(self, station):
        """Store the station name and build the database URL from config.json.

        Args:
            station: Suffix of the measurements table to read
                (``measurments_<station>``).
        """
        self.station = station
        self.get_config()

    def get_config(self):
        """Read the MySQL settings from ``config.json`` and set ``self.db_url``.

        The file is looked up in ``BASE_DIR``. Keys missing from the ``mysql``
        section fall back to placeholder defaults.

        Raises:
            FileNotFoundError: If ``config.json`` does not exist in ``BASE_DIR``.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(os.path.join(BASE_DIR, 'config.json'), 'r') as config_file:
            config = json.load(config_file)

        # Extract MySQL connection details
        mysql_config = config.get('mysql', {})
        username = mysql_config.get('username', 'default_username')
        password = mysql_config.get('password', 'default_password')
        host = mysql_config.get('host', 'localhost')
        database_name = mysql_config.get('database_name', 'your_database')

        # Percent-encode the credentials: characters such as '@', '/' or ':'
        # in a password would otherwise be parsed as URL delimiters.
        safe_username = quote(str(username), safe='')
        safe_password = quote(str(password), safe='')

        self.db_url = (
            f"mysql+mysqlconnector://{safe_username}:{safe_password}"
            f"@{host}/{database_name}"
        )

    def get_train_data(self):
        """Fetch the forecast and measurement tables into DataFrames.

        Sets ``self.df_forecast`` (table ``forecast``) and
        ``self.df_measurments`` (table ``measurments_<station>``).
        """
        query_forecast = "SELECT * FROM forecast"
        # NOTE(review): the station name is interpolated as part of a table
        # identifier (identifiers cannot be bound as query parameters), so it
        # must come from trusted code, never from user input.
        query_measurments = f"SELECT * FROM measurments_{self.station}"

        engine = create_engine(self.db_url)
        try:
            # The context manager closes the connection even if a query fails.
            with engine.connect() as connection:
                self.df_forecast = pd.read_sql(query_forecast, connection)
                self.df_measurments = pd.read_sql(query_measurments, connection)
        finally:
            # Release the engine's connection pool; a new engine is created per call.
            engine.dispose()

    def transform(self):
        """Resample measurements to two-hour means and join them to the forecast.

        Replaces ``self.df_measurments`` with the resampled frame and stores
        the inner join on ``Time`` (rows containing any NaN removed) in
        ``self.df``. The frame previously held in ``self.df_measurments`` is
        not modified in place.
        """
        # A Timedelta rule is equivalent to the '2H' alias but does not depend
        # on the alias spelling accepted by the installed pandas version.
        measurements_2h = (
            self.df_measurments
            .set_index('Time')
            .resample(pd.Timedelta(hours=2))
            .mean()
            .reset_index()
        )
        self.df_measurments = measurements_2h

        merged = pd.merge(
            left=self.df_forecast,
            right=measurements_2h,
            on='Time',
            how='inner',
        )

        # Drop incomplete rows; the surviving rows keep their merge index labels.
        self.df = merged.dropna()

In [9]:
# Run the pipeline end to end for the 'rewa' station: the constructor reads
# config.json, get_train_data() pulls the forecast and measurement tables, and
# transform() resamples the measurements and merges them with the forecast.
data_pipeline = DataPipeline(station='rewa')
data_pipeline.get_train_data()
data_pipeline.transform()
df = data_pipeline.df  # merged frame; the feature/label split below reads from it
df

Unnamed: 0,Time,Month,Hour,WindForecast,GustForecast,WindDirForecast,Temperature,Precipitation,Cloudcover,Update_x,WindSpeed,WindGust,Temp,WindDir,Baro,Update_y
0,2021-01-25 00:00:00,1,0,9.00,14.0,306.00,2.00,0.3,100.0,2023-10-03,7.257500,8.537500,2.258333,240.083333,1023.425000,2023-10-13 18:27:01.000000256
1,2021-01-25 02:00:00,1,2,10.00,15.0,299.00,2.00,0.6,100.0,2023-10-03,10.365833,12.490833,2.866667,261.083333,1023.983333,2023-10-13 18:27:01.000000256
2,2021-01-25 04:00:00,1,4,11.00,17.0,297.00,2.00,0.2,100.0,2023-10-03,8.423333,10.462500,2.366667,237.916667,1024.683333,2023-10-13 18:27:01.000000256
3,2021-01-25 06:00:00,1,6,12.00,20.0,280.00,2.00,0.3,100.0,2023-10-03,10.480000,13.590000,1.875000,259.750000,1025.766667,2023-10-13 18:27:01.000000256
4,2021-01-25 08:00:00,1,8,12.00,20.0,282.00,2.00,0.1,100.0,2023-10-03,8.520000,10.383333,1.216667,246.250000,1026.475000,2023-10-13 18:27:01.000000256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11816,2023-10-12 14:00:00,10,14,21.00,273.5,30.00,14.30,0.0,7.5,2023-10-13,18.061667,26.714167,13.900000,235.166667,1002.650000,2023-10-13 18:27:01.000000256
11817,2023-10-12 16:00:00,10,16,20.30,272.0,27.90,13.15,0.0,5.0,2023-10-13,13.767500,22.077500,13.300000,230.000000,1003.533333,2023-10-13 18:27:01.000000256
11818,2023-10-12 18:00:00,10,18,18.95,270.5,26.15,12.15,0.0,11.5,2023-10-13,6.821667,11.954167,12.141667,208.916667,1004.741667,2023-10-13 18:27:01.000000256
11819,2023-10-12 20:00:00,10,20,16.45,271.5,23.90,11.90,0.0,51.0,2023-10-13,4.034167,8.066667,10.716667,189.166667,1005.791667,2023-10-13 18:27:01.000000256


# 2. EDA

# 3. Preprocessing

## 3.1 Label - Features Split

In [11]:
# Model inputs: calendar fields plus the forecast variables
feature_columns = [
    'Month',
    'Hour',
    'WindForecast',
    'GustForecast',
    'WindDirForecast',
    'Temperature',
    'Precipitation',
    'Cloudcover',
]
X = df[feature_columns]

# Label: the WindSpeed column of the merged frame
y = df['WindSpeed']

# 4. Training

In [14]:
def experiment_setup(experiment_name):
    """Return the MLflow experiment id for a name, plus a timestamped run name.

    The experiment is looked up by name and created only when no experiment
    with that name exists. Errors from the tracking backend propagate to the
    caller instead of being swallowed.

    Args:
        experiment_name: Name of the MLflow experiment to use.

    Returns:
        tuple: ``(experiment_id, run_name)`` where ``run_name`` has the form
        ``training_<YYYY-MM-DD-HH-MM>`` based on the current local time.
    """
    # get_experiment_by_name returns None when the experiment does not exist.
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id

    timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
    run_name = f'training_{timestamp}'

    return experiment_id, run_name

## 4.1 XGBoost CV HPT x1

In [23]:
# NOTE(review): `id` shadows the builtin; the name is kept because other cells
# may rely on it.
id, run_name = experiment_setup('xgb_hpt_cv_x1')
model_name = 'xgb_hpt_cv_x1'

with mlflow.start_run(experiment_id=id, run_name=run_name) as run:
    # Hyperparameter grid explored by the search
    param_grid = {
        'max_depth': [2, 3],
        'learning_rate': [0.005, 0.01, 0.1],
        'n_estimators': [30, 50],
    }

    # Shuffled 5-fold split with a fixed seed so the search and the later
    # cross-validation see the same folds
    k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Grid search over the parameter grid
    xgb_regressor = xgb.XGBRegressor()
    grid = GridSearchCV(xgb_regressor, param_grid, cv=k_fold)
    grid.fit(X, y)

    # Log best hyperparameters
    best_max_depth = grid.best_params_['max_depth']
    best_learning_rate = grid.best_params_['learning_rate']
    best_n_estimators = grid.best_params_['n_estimators']
    mlflow.log_param('best_max_depth', best_max_depth)
    mlflow.log_param('best_learning_rate', best_learning_rate)
    mlflow.log_param('best_n_estimators', best_n_estimators)

    # Create a fresh model from the best hyperparameters
    model = xgb.XGBRegressor(
        max_depth=best_max_depth,
        learning_rate=best_learning_rate,
        n_estimators=best_n_estimators,
    )

    # Run k-fold cross validation on the selected configuration
    kfold_scores = cross_val_score(model, X, y, cv=k_fold)

    # Track metrics.
    # NOTE(review): no `scoring` is passed, so these are the estimator's
    # default score (presumably R^2 for a regressor), not a classification
    # accuracy. The metric keys are kept so existing runs stay comparable.
    mlflow.log_metric("average_accuracy", kfold_scores.mean())
    mlflow.log_metric("std_accuracy", kfold_scores.std())

    # Fit the final model on the full data set
    model.fit(X, y)

    # Log the model artifact FIRST, then register it: the registry entry is
    # created from the run's "sklearn-model" artifact path, which must already
    # exist when register_model is called.
    mlflow.sklearn.log_model(model, "sklearn-model")
    mlflow.register_model(
        f"runs:/{run.info.run_id}/sklearn-model", model_name
    )

Registered model 'xgb_hpt_cv_x1' already exists. Creating a new version of this model...
2023/10/14 16:02:44 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: xgb_hpt_cv_x1, version 1
Created version '1' of model 'xgb_hpt_cv_x1'.


# 5. Post-processing