In [1]:
# Import Required Libraries
import pandas as pd
from pycaret.regression import *
from sklearn.decomposition import PCA
import numpy as np

# Load datasets (replace with your actual data paths).
# Raw strings keep the Windows backslashes literal; without the r-prefix,
# sequences such as '\V' and '\.' are invalid escapes (SyntaxWarning on 3.12).
df_day = pd.read_csv(r'C:\VS CODE PROGRAMS\.vscode\PYTHON_AQI\city_day.csv')
df_hour = pd.read_csv(r'C:\VS CODE PROGRAMS\.vscode\PYTHON_AQI\city_hour.csv')

# Parse timestamps: the daily file is read from 'Date', the hourly file from
# 'Datetime'. Both frames end up with a datetime64 'Datetime' column.
df_day['Datetime'] = pd.to_datetime(df_day['Date'])
df_hour['Datetime'] = pd.to_datetime(df_hour['Datetime'])

# 'Date' is now duplicated by 'Datetime'. Drop it here: it exists only in the
# daily frame, so the merge would never suffix it to 'Date_day'.
df_day = df_day.drop(columns='Date')

# Upsample the daily readings to hourly, forward-filling within each city.
# Resampling must be done per city: the same date appears once per city, and
# a duplicated DatetimeIndex cannot be upsampled (and would mix cities).
day_value_columns = [
    col for col in df_day.columns if col not in ('City', 'Datetime')
]
df_day = (
    df_day.set_index('Datetime')
    .groupby('City')[day_value_columns]
    .resample('H')
    .ffill()
    .reset_index()
)

# Merge on the (City, Datetime) key. `on` cannot be combined with
# left_index/right_index, so both keys are passed as columns and the
# timestamp is restored as the index afterwards.
df_merged = pd.merge(
    df_day,
    df_hour,
    on=['City', 'Datetime'],
    suffixes=('_day', '_hour'),
).set_index('Datetime')

# Drop the daily copies of the pollutant columns; the hourly ones are kept.
df_merged = df_merged.drop(columns=['PM2.5_day', 'PM10_day', 'NO_day', 'NO2_day', 'NOx_day',
       'NH3_day', 'CO_day', 'SO2_day', 'O3_day', 'Benzene_day', 'Toluene_day',
       'Xylene_day'])

# Drop rows with missing values
df_merged.dropna(inplace=True)

# Feature engineering: synthesise an HCl column as a fixed 12.5% of the
# hourly SO2 reading (the original note gives the ratio range as 10-15%).
df_merged['HCl'] = 0.125 * df_merged['SO2_hour']

# PCA on every numeric column except the target, keeping enough components
# to explain 95% of the variance.
# NOTE(review): 'SO2_hour' stays among the PCA inputs while HCl is an exact
# multiple of it, so the target is recoverable from the features - confirm
# this leakage is intended.
numerical_features = df_merged.select_dtypes(include=np.number).drop(columns='HCl')
pca = PCA(n_components=0.95)
pca_features = pca.fit_transform(numerical_features)

# Label the components PC1..PCk and append the target, re-indexed from 0 so
# it lines up row-for-row with the component frame.
component_names = [f'PC{idx}' for idx in range(1, pca.n_components_ + 1)]
pca_df = pd.DataFrame(pca_features, columns=component_names)
hcl_target = df_merged['HCl'].reset_index(drop=True)
final_df = pd.concat([pca_df, hcl_target], axis=1)

# PyCaret setup: initialise the regression experiment on the PCA components
# with 'HCl' as the target and an 80/20 train/test split.
# `feature_interaction` and `feature_ratio` were PyCaret 2.x arguments and are
# rejected by PyCaret 3.x `setup` (TypeError: unexpected keyword argument).
# `polynomial_features` is the 3.x option that generates interaction terms;
# there is no 3.x equivalent for ratio features.
reg_experiment = setup(
    data=final_df,
    target='HCl',
    train_size=0.8,
    session_id=42,  # fixed seed for reproducible splits and models
    normalize=True,
    transformation=True,
    polynomial_features=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.90,
)

# Train the candidate regressors and keep the three with the lowest RMSE.
candidate_ids = ['rf', 'lr', 'gbr', 'xgboost', 'lightgbm', 'et', 'dt', 'knn']
best_model = compare_models(include=candidate_ids, sort='RMSE', n_select=3)

# compare_models may hand back a list of models (n_select > 1); the first
# entry is the top-ranked one, so keep only that.
best_model = best_model[0] if isinstance(best_model, list) else best_model

# Finalize the chosen model and show it.
final_model = finalize_model(best_model)
print(final_model)

# Diagnostics plots (optional)
for plot_kind in ('residuals', 'feature'):
    plot_model(final_model, plot=plot_kind)

# Persist the finalized model (optional)
save_model(final_model, 'hcl_prediction_pipeline')

# Prediction (optional)
predict_model(final_model)


  df_day = pd.read_csv('C:\VS CODE PROGRAMS\.vscode\PYTHON_AQI\city_day.csv')
  df_hour = pd.read_csv('C:\VS CODE PROGRAMS\.vscode\PYTHON_AQI\city_hour.csv')


RuntimeError: ('Pycaret only supports python 3.9, 3.10, 3.11. Your actual Python version: ', sys.version_info(major=3, minor=12, micro=5, releaselevel='final', serial=0), 'Please DOWNGRADE your Python version.')