In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pycaret
from pycaret.classification import *

In [16]:
data = pd.read_csv('train.csv')
data

Unnamed: 0,DATETIMEDATA,PM25,O3,WS,TEMP,RH,WD
0,2024-01-01 00:00:00,16.5,16.000000,1,27.0,79.0,115.0
1,2024-01-01 01:00:00,33.6,11.326513,1,27.0,82.0,97.0
2,2024-01-01 02:00:00,34.2,11.326513,1,26.0,84.0,96.0
3,2024-01-01 03:00:00,20.1,11.326513,1,26.0,82.0,108.0
4,2024-01-01 04:00:00,16.5,1.000000,1,27.0,79.0,106.0
...,...,...,...,...,...,...,...
1411,2024-02-28 19:00:00,23.6,11.326513,1,29.0,64.0,184.0
1412,2024-02-28 20:00:00,25.2,11.326513,1,29.0,64.0,183.0
1413,2024-02-28 21:00:00,23.2,1.000000,1,29.0,64.0,159.0
1414,2024-02-28 22:00:00,24.7,3.000000,1,28.0,65.0,134.0


In [1]:
import pandas as pd
from pycaret.regression import *

# Read the training data
train_data = pd.read_csv('train.csv')


# Convert 'DATETIMEDATA' column to datetime format
train_data['DATETIMEDATA'] = pd.to_datetime(train_data['DATETIMEDATA'])

# Set up the PyCaret regression environment
regression_setup = setup(train_data, target='O3', session_id=123, date_features=['DATETIMEDATA'], normalize=True)
best = compare_models()
# Create a Random Forest Regressor model
rf_model = create_model('et')

# Fine-tune the Random Forest Regressor model
tuned_rf_model = tune_model(rf_model)

# Ensemble the tuned Random Forest Regressor model
bagged_model = ensemble_model(tuned_rf_model, n_estimators=20)

# Get the last prediction time
last_prediction_time = train_data['DATETIMEDATA'].max()

# Generate datetime values for every hour of each day from the last prediction time to the end of the next week
start_date = last_prediction_time + pd.DateOffset(hours=1)  # Start date for new predictions
end_date = start_date + pd.DateOffset(days=7, hours=23)  # End date for next week
next_week_hours = pd.date_range(start=start_date, end=end_date, freq='D')

# Create a DataFrame with 'DATETIMEDATA' column for every hour of each day within the next week
next_week_data = pd.DataFrame({'DATETIMEDATA': next_week_hours})

# Add columns for 'O3', 'WS', 'TEMP', 'RH', 'WD' with placeholder values of 0
next_week_data['PM25'] = 0
next_week_data['WS'] = 0
next_week_data['TEMP'] = 0
next_week_data['RH'] = 0
next_week_data['WD'] = 0

# Predict PM25 for every hour of each day within the next week
model_predictions = predict_model(bagged_model, data=next_week_data)

mpdf = pd.DataFrame(model_predictions, columns=['DATETIMEDATA', 'prediction_label'])

mpdf.rename(columns={'prediction_label': 'O3'}, inplace=True)

print(mpdf)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,O3
2,Target type,Regression
3,Original data shape,"(1416, 7)"
4,Transformed data shape,"(1416, 9)"
5,Transformed train set shape,"(991, 9)"
6,Transformed test set shape,"(425, 9)"
7,Numeric features,5
8,Date features,1
9,Preprocess,True


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,6.1798,93.1009,9.3282,0.1879,0.7957,1.594,0.026
rf,Random Forest Regressor,5.9642,97.4736,9.4738,0.1511,0.7839,1.5563,0.049
et,Extra Trees Regressor,5.7819,99.6964,9.5197,0.1475,0.785,1.5,0.033
lightgbm,Light Gradient Boosting Machine,6.4066,95.3106,9.525,0.1393,0.8309,1.5892,0.244
knn,K Neighbors Regressor,6.429,98.7821,9.6001,0.1259,0.7967,1.5392,0.012
br,Bayesian Ridge,7.0,104.8626,9.933,0.0749,0.8381,1.8087,0.006
lr,Linear Regression,7.0519,104.8638,9.9329,0.0748,0.8351,1.7941,0.211
ridge,Ridge Regression,7.0506,104.8601,9.9327,0.0748,0.8352,1.7943,0.007
lar,Least Angle Regression,7.0519,104.8638,9.9329,0.0748,0.8351,1.7941,0.006
omp,Orthogonal Matching Pursuit,7.0422,105.0862,9.9525,0.0695,0.8374,1.7996,0.006


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.3443,58.8516,7.6715,0.4044,0.7198,1.2767
1,7.6409,237.9687,15.4262,-0.5029,0.9027,1.863
2,5.9568,122.2189,11.0553,0.1819,0.8349,1.6843
3,6.799,213.0352,14.5957,0.1636,0.8,1.4246
4,4.4966,40.569,6.3694,0.4208,0.6344,0.9845
5,5.374,55.8588,7.4739,0.2697,0.909,2.2004
6,5.6631,55.4458,7.4462,0.2075,0.8097,1.5907
7,5.5396,90.8983,9.5341,-0.1141,0.7269,1.2722
8,5.248,63.3396,7.9586,0.1579,0.7192,1.1319
9,5.7565,58.7777,7.6667,0.2867,0.7938,1.5716


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.0202,62.4878,7.9049,0.3676,0.7611,1.4681
1,6.8869,141.9764,11.9154,0.1033,0.8447,1.8446
2,5.9956,125.6746,11.2105,0.1587,0.8337,1.8095
3,7.143,205.7303,14.3433,0.1923,0.8124,1.5694
4,5.0752,44.6452,6.6817,0.3626,0.7102,1.2839
5,5.5683,56.622,7.5248,0.2598,0.898,2.2577
6,5.7776,50.832,7.1297,0.2735,0.8236,1.837
7,5.0861,52.186,7.224,0.3604,0.6874,1.1856
8,5.3334,51.8659,7.2018,0.3104,0.6779,1.1612
9,5.8839,57.0864,7.5556,0.3072,0.7855,1.6391


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.2185,63.6278,7.9767,0.356,0.7634,1.4813
1,6.9223,141.8957,11.912,0.1038,0.8519,1.864
2,6.1577,130.6586,11.4306,0.1254,0.8506,1.8912
3,7.4093,214.1937,14.6354,0.159,0.8238,1.5982
4,5.3526,48.6819,6.9772,0.305,0.7272,1.3517
5,5.848,60.5068,7.7786,0.209,0.911,2.3178
6,5.9275,52.1744,7.2232,0.2543,0.8316,1.874
7,5.2046,54.1259,7.357,0.3366,0.6872,1.1771
8,5.4914,53.7275,7.3299,0.2857,0.688,1.2005
9,6.0679,60.3554,7.7689,0.2675,0.7987,1.6963


  DATETIMEDATA         O3
0   2024-02-29  12.577879
1   2024-03-01   9.329538
2   2024-03-02   9.292464
3   2024-03-03   9.786587
4   2024-03-04  10.794685
5   2024-03-05  11.083334
6   2024-03-06  10.968511
7   2024-03-07  11.484497


In [2]:

mpdf.to_csv('predict_O3.csv', index=False)

In [5]:
save_model(bagged_model,'predict_O3')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('date_feature_extractor',
                  TransformerWrapper(include=['DATETIMEDATA'],
                                     transformer=ExtractDateTimeFeatures())),
                 ('numerical_imputer',
                  TransformerWrapper(include=['PM25', 'WS', 'TEMP', 'RH', 'WD'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('normalize', TransformerWrapper(transformer=StandardScaler())),
                 ('trained_model',
                  BaggingRegressor(estimator=ExtraTreesRegressor(bootstrap=True,
                                                                 max_depth=9,
                                                                 min_impurity_decrease=0.1,
                                     

In [2]:
evaluate_model(rf_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…