In [None]:
!pip install openmeteo-requests
!pip install requests-cache retry-requests numpy pandas

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
	"latitude": 22.8101,
	"longitude": 86.2634,
	"start_date": "2024-02-15",
	"end_date": "2024-03-07",
	"hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "surface_pressure", "cloud_cover", "wind_speed_100m", "wind_direction_100m", "is_day", "sunshine_duration", "direct_radiation", "diffuse_radiation"],
	"timezone": "auto"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process hourly data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
hourly_dew_point_2m = hourly.Variables(2).ValuesAsNumpy()
hourly_precipitation = hourly.Variables(3).ValuesAsNumpy()
hourly_surface_pressure = hourly.Variables(4).ValuesAsNumpy()
hourly_cloud_cover = hourly.Variables(5).ValuesAsNumpy()
hourly_wind_speed_100m = hourly.Variables(6).ValuesAsNumpy()
hourly_wind_direction_100m = hourly.Variables(7).ValuesAsNumpy()
hourly_is_day = hourly.Variables(8).ValuesAsNumpy()
hourly_sunshine_duration = hourly.Variables(9).ValuesAsNumpy()
hourly_direct_radiation = hourly.Variables(10).ValuesAsNumpy()
hourly_diffuse_radiation = hourly.Variables(11).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}
hourly_data["temperature_2m"] = hourly_temperature_2m
hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
hourly_data["dew_point_2m"] = hourly_dew_point_2m
hourly_data["precipitation"] = hourly_precipitation
hourly_data["surface_pressure"] = hourly_surface_pressure
hourly_data["cloud_cover"] = hourly_cloud_cover
hourly_data["wind_speed_100m"] = hourly_wind_speed_100m
hourly_data["wind_direction_100m"] = hourly_wind_direction_100m
hourly_data["is_day"] = hourly_is_day
hourly_data["sunshine_duration"] = hourly_sunshine_duration
hourly_data["direct_radiation"] = hourly_direct_radiation
hourly_data["diffuse_radiation"] = hourly_diffuse_radiation

hourly_dataframe = pd.DataFrame(data = hourly_data)
print(hourly_dataframe)
# import openmeteo_requests

# import requests_cache
# import pandas as pd
# from retry_requests import retry

# # Setup the Open-Meteo API client with cache and retry on error
# cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
# retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
# openmeteo = openmeteo_requests.Client(session = retry_session)

# # Make sure all required weather variables are listed here
# # The order of variables in hourly or daily is important to assign them correctly below
# url = "https://archive-api.open-meteo.com/v1/archive"
# params = {
# 	"latitude": 22.8101,
# 	"longitude": 86.2634,
# 	"start_date": "2024-02-15",
# 	"end_date": "2024-03-07",
# 	"hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "surface_pressure", "cloud_cover", "wind_speed_100m", "wind_direction_100m", "is_day", "sunshine_duration", "direct_radiation", "diffuse_radiation"],
# 	"timezone": "auto"
# }
# responses = openmeteo.weather_api(url, params=params)

# # Process first location. Add a for-loop for multiple locations or weather models
# response = responses[0]
# print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
# print(f"Elevation {response.Elevation()} m asl")
# print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
# print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# # Process hourly data. The order of variables needs to be the same as requested.
# hourly = response.Hourly()
# hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
# hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
# hourly_dew_point_2m = hourly.Variables(2).ValuesAsNumpy()
# hourly_precipitation = hourly.Variables(3).ValuesAsNumpy()
# hourly_surface_pressure = hourly.Variables(4).ValuesAsNumpy()
# hourly_cloud_cover = hourly.Variables(5).ValuesAsNumpy()
# hourly_wind_speed_100m = hourly.Variables(6).ValuesAsNumpy()
# hourly_wind_direction_100m = hourly.Variables(7).ValuesAsNumpy()
# hourly_is_day = hourly.Variables(8).ValuesAsNumpy()
# hourly_sunshine_duration = hourly.Variables(9).ValuesAsNumpy()
# hourly_direct_radiation = hourly.Variables(10).ValuesAsNumpy()
# hourly_diffuse_radiation = hourly.Variables(11).ValuesAsNumpy()

# hourly_data = {"date": pd.date_range(
# 	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
# 	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
# 	freq = pd.Timedelta(seconds = hourly.Interval()),
# 	inclusive = "left"
# )}
# hourly_data["temperature_2m"] = hourly_temperature_2m
# hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
# hourly_data["dew_point_2m"] = hourly_dew_point_2m
# hourly_data["precipitation"] = hourly_precipitation
# hourly_data["surface_pressure"] = hourly_surface_pressure
# hourly_data["cloud_cover"] = hourly_cloud_cover
# hourly_data["wind_speed_100m"] = hourly_wind_speed_100m
# hourly_data["wind_direction_100m"] = hourly_wind_direction_100m
# hourly_data["is_day"] = hourly_is_day
# hourly_data["sunshine_duration"] = hourly_sunshine_duration
# hourly_data["direct_radiation"] = hourly_direct_radiation
# hourly_data["diffuse_radiation"] = hourly_diffuse_radiation

# hourly_dataframe = pd.DataFrame(data = hourly_data)
# print(hourly_dataframe)


Coordinates 22.811948776245117°N 86.25°E
Elevation 135.0 m asl
Timezone b'Asia/Kolkata' b'IST'
Timezone difference to GMT+0 19800 s
                         date  temperature_2m  relative_humidity_2m  \
0   2024-02-14 18:30:00+00:00       18.407499             92.138802   
1   2024-02-14 19:30:00+00:00       19.757500             85.508797   
2   2024-02-14 20:30:00+00:00       18.057501             91.826599   
3   2024-02-14 21:30:00+00:00       19.057501             88.470024   
4   2024-02-14 22:30:00+00:00       16.707500             95.947205   
..                        ...             ...                   ...   
523 2024-03-07 13:30:00+00:00       23.607500             77.812508   
524 2024-03-07 14:30:00+00:00       22.957500             78.202972   
525 2024-03-07 15:30:00+00:00       21.457500             84.882576   
526 2024-03-07 16:30:00+00:00       20.957500             84.829567   
527 2024-03-07 17:30:00+00:00       21.657499             81.012062   

     dew_point_

In [None]:
# # Change data types accordingly
hourly_dataframe['date'] = pd.to_datetime(hourly_dataframe['date'])
# # hourly_dataframe['date_only'] = hourly_dataframe['date'].dt.date
# # hourly_dataframe['time_only'] = hourly_dataframe['date'].dt.time

# # Displaying the DataFrame with the newly added columns
# print(hourly_dataframe) # Convert 'date' column to datetime format

# # Convert numerical attributes to float or int
# numerical_attributes = ['temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'precipitation',
#                         'surface_pressure', 'wind_speed_100m', 'wind_direction_100m', 'sunshine_duration']
# hourly_dataframe[numerical_attributes] = hourly_dataframe[numerical_attributes].astype(float)  # Convert numerical attributes to float

# # Convert boolean attribute to bool
# hourly_dataframe['is_day'] = hourly_dataframe['is_day'].astype(bool)

# # Ensure that 'cloud_cover' attribute remains unchanged, as it is likely the target variable

# # Print the data types of attributes after conversion

import pandas as pd

# Assuming hourly_data is a pandas DataFrame
hourly_data["temperature_2m"] = pd.to_numeric(hourly_data["temperature_2m"])
hourly_data["relative_humidity_2m"] = pd.to_numeric(hourly_data["relative_humidity_2m"])
hourly_data["dew_point_2m"] = pd.to_numeric(hourly_data["dew_point_2m"])
hourly_data["precipitation"] = pd.to_numeric(hourly_data["precipitation"])
hourly_data["surface_pressure"] = pd.to_numeric(hourly_data["surface_pressure"])
hourly_data["cloud_cover"] = pd.to_numeric(hourly_data["cloud_cover"])
hourly_data["wind_speed_100m"] = pd.to_numeric(hourly_data["wind_speed_100m"])
hourly_data["wind_direction_100m"] = pd.to_numeric(hourly_data["wind_direction_100m"])
hourly_data["is_day"] = hourly_data["is_day"].astype(bool)  # Assuming it's originally stored as a boolean
hourly_data["sunshine_duration"] = pd.to_numeric(hourly_data["sunshine_duration"])
hourly_data["direct_radiation"] = pd.to_numeric(hourly_data["direct_radiation"])
hourly_data["diffuse_radiation"] = pd.to_numeric(hourly_data["diffuse_radiation"])
print(hourly_dataframe.dtypes)

date                    datetime64[ns, UTC]
temperature_2m                      float32
relative_humidity_2m                float32
dew_point_2m                        float32
precipitation                       float32
surface_pressure                    float32
cloud_cover                         float32
wind_speed_100m                     float32
wind_direction_100m                 float32
is_day                              float32
sunshine_duration                   float32
direct_radiation                    float32
diffuse_radiation                   float32
dtype: object


# Without preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming 'hourly_dataframe' is your DataFrame with the necessary columns

# Select features and target variable
features = ['temperature_2m', 'relative_humidity_2m', 'dew_point_2m', 'precipitation',
            'surface_pressure', 'wind_speed_100m', 'wind_direction_100m', 'sunshine_duration','direct_radiation','diffuse_radiation']
target = 'cloud_cover'

X = hourly_dataframe[features]
y = hourly_dataframe[target]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

# Choose a model and train it
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
# Calculate MAE
mae = np.mean(np.abs(y_test - predictions))
# Calculate RSE
rse = mse / np.var(y_test)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    print(feature, ':', coef)
print("Intercept:")
print(model.intercept_)


Mean Squared Error: 195.77444
mae: 10.697839
rse: 0.5241146
rmse: 13.991942
Coefficients:
temperature_2m : 2.6245916
relative_humidity_2m : 0.4823458
dew_point_2m : -0.7531799
precipitation : 19.641594
surface_pressure : 0.12740988
wind_speed_100m : 0.38813043
wind_direction_100m : -0.019238045
sunshine_duration : -0.0025914013
direct_radiation : -0.047477826
diffuse_radiation : 0.17429824
Intercept:
-193.97754


In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the Ridge Regression model
ridge_model = Lasso(alpha=1)  # You can adjust the regularization strength (alpha) as needed

# Train the model
ridge_model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_ridge = ridge_model.predict(X_test)

# Evaluate the model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Mean Squared Error (Ridge):", mse_ridge)
print("R-squared Score (Ridge):", r2_ridge)
print("Coefficients:")
for feature, coef in zip(X.columns, ridge_model.coef_):
    print(feature, ':', coef)
print("Intercept:")
print(ridge_model.intercept_)

Mean Squared Error (Ridge): 264.0563
R-squared Score (Ridge): 0.2930856619544654
Coefficients:
temperature_2m : 1.8491389
relative_humidity_2m : 0.32967922
dew_point_2m : 0.0
precipitation : 0.5215327
surface_pressure : 0.019990621
wind_speed_100m : 0.45151827
wind_direction_100m : -0.012415821
sunshine_duration : -0.0028961035
direct_radiation : -0.045497846
diffuse_radiation : 0.17885716
Intercept:
-71.90535


# Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
data=hourly_dataframe
# Load the dataset
#data = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual filename
data=data.drop(columns=['date'])
# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1  # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold]
high_corr_features=high_corr_features.drop('cloud_cover')
high_corr_features = high_corr_features.index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# # Prepare the data for imputation
y = data['cloud_cover']
X = data.drop(columns=['cloud_cover'])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=20, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

# Define preprocessing steps for low correlation features using KNNImputer
low_corr_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # KNNImputer for low correlation features
    # Add more preprocessing steps if necessary
])

# Define column transformer to apply different preprocessing steps to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features)
        ,('low_corr', low_corr_transformer, low_corr_features)
    ])

preprocessor.fit(X_train,y_train)
# Define the model
model = RandomForestRegressor()

# Create a pipeline with preprocessing and modeling steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Train the model
pipeline.fit(X_train, y_train)
#model.fit(X_train,y_train)

# Make predictions
predictions = pipeline.predict(X_test)
#predictions=model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
# Calculate MAE
mae = np.mean(np.abs(y_test - predictions))
# Calculate RSE
rse = mse / np.var(y_test)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)


Mean Squared Error: 282.6816754557746
mae: 11.932528605826622
rse: 0.6583960277041276
rmse: 16.813139964199863


# LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional,Dropout
from keras.optimizers import Adam
data=hourly_dataframe
data=data.drop(columns=['date'])
# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.2  # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

# Define preprocessing steps for low correlation features using KNNImputer
low_corr_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # KNNImputer for low correlation features
    # Add more preprocessing steps if necessary
])

# Define column transformer to apply different preprocessing steps to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', low_corr_transformer, low_corr_features)
    ])

# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)
# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Print the feature names
print(feature_names_out)
#data_preprocessed= data.values.tolist()

# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
print(y)

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_90, X_test_10, y_train_90, y_test_10 = train_test_split(X, y, test_size=0.1, random_state=42)

# Define the bidirectional LSTM model
def create_bidirectional_lstm_model():
    model = Sequential()
    model.add(Bidirectional(LSTM(units=64, return_sequences=True), input_shape=(X_train_80.shape[1], X_train_80.shape[2])))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=64)))
    model.add(Dropout(0.3))
    model.add(Dense(1))
    return model

# Compile the model
model = create_bidirectional_lstm_model()
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model (80%-20% split)
model.fit(X_train_80, y_train_80, epochs=50, batch_size=32, verbose=1, validation_data=(X_test_20, y_test_20))

# Train the model (90%-10% split)
model.fit(X_train_90, y_train_90, epochs=50, batch_size=32, verbose=1, validation_data=(X_test_10, y_test_10))

# Make predictions
predictions_20 = model.predict(X_test_20)
predictions_10 = model.predict(X_test_10)





# Evaluate the model
mse = mean_squared_error(y_test_20, predictions_20)
# Calculate MAE
mae = np.mean(np.abs(y_test_20 - predictions_20))
# Calculate RSE
rse = mse / np.var(y_test_20)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)


['high_corr__cloud_cover' 'high_corr__precipitation'
 'high_corr__diffuse_radiation' 'low_corr__dew_point_2m'
 'low_corr__is_day' 'low_corr__temperature_2m' 'low_corr__wind_speed_100m'
 'low_corr__direct_radiation' 'low_corr__sunshine_duration'
 'low_corr__surface_pressure' 'low_corr__relative_humidity_2m'
 'low_corr__wind_direction_100m']
[15.0, 18.0, 67.5, 60.300003, 82.200005, 19.5, 46.5, 59.100002, 13.500001, 15.6, 20.400002, 32.4, 13.200001, 13.200001, 7.8, 1.8000001, 0.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.5, 11.4, 6.6000004, 16.8, 5.4, 4.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.8000001, 4.2000003, 3.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.6, 4.8, 6.6000004, 7.2000003, 0.90000004, 0.0, 76.5, 65.7, 68.1, 31.8, 6.6000004, 9.6, 4.8, 11.400001, 21.0, 9.6, 4.8, 3.600

In [None]:

for i in range (len(y_test_10)):
   print(y_test_10[i]," ",predictions_10[i])

# Better LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Define preprocessing function
def preprocess_data(data):
    correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)
    correlation_threshold = 0.1
    high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
    low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

    high_corr_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(max_iter=10, random_state=42))
    ])

    low_corr_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('high_corr', high_corr_transformer, high_corr_features),
            ('low_corr', low_corr_transformer, low_corr_features)
        ])

    data_preprocessed = preprocessor.fit_transform(data)
    return data_preprocessed, preprocessor.get_feature_names_out()

# Define function to create LSTM model
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=50, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=50)))
    model.add(Dropout(0.3))
    model.add(Dense(units=2))
    return model

# Define early stopping callback
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
data=hourly_dataframe
data=data.drop(columns=['date'])
# Load and preprocess the data
data_preprocessed, feature_names_out = preprocess_data(data)

# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
X = np.array(X)
y = np.array(y)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape data for LSTM input
# X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
# X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Create and compile the model
model = create_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]))
model.compile(optimizer=Adam(lr=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])
predictions= model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
# Calculate MAE
mae = np.mean(np.abs(y_test - predictions))
# Calculate RSE
rse = mse / np.var(y_test)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)





Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


ValueError: y_true and y_pred have different number of output (1!=2)

In [None]:
y_pred=model.predict(X_test)
for i in range (len(y_test)):
  print(f"{y_test[i]}   {y_pred[i]}")

# SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Load data
data = hourly_dataframe  # Assuming hourly_dataframe is your dataset

# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1  # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

# Define preprocessing steps for low correlation features using KNNImputer
low_corr_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),  # KNNImputer for low correlation features
    # Add more preprocessing steps if necessary
])

# Define column transformer to apply different preprocessing steps to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', low_corr_transformer, low_corr_features)
    ])

# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)

# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_90, X_test_10, y_train_90, y_test_10 = train_test_split(X, y, test_size=0.1, random_state=42)

# Scale the features
# scaler = StandardScaler()
# X_train_80_scaled = scaler.fit_transform(X_train_80.reshape(-1, X_train_80.shape[-1])).reshape(X_train_80.shape)
# X_test_20_scaled = scaler.transform(X_test_20.reshape(-1, X_test_20.shape[-1])).reshape(X_test_20.shape)
# X_train_90_scaled = scaler.fit_transform(X_train_90.reshape(-1, X_train_90.shape[-1])).reshape(X_train_90.shape)
# X_test_10_scaled = scaler.transform(X_test_10.reshape(-1, X_test_10.shape[-1])).reshape(X_test_10.shape)

# Define and train the Support Vector Regressor (SVR) model (80%-20% split)
svr_model_80 = SVR(kernel='rbf')  # RBF kernel is commonly used
svr_model_80.fit(X_train_80.reshape(X_train_80.shape[0], -1), y_train_80)

# Define and train the Support Vector Regressor (SVR) model (90%-10% split)
svr_model_90 = SVR(kernel='rbf')  # RBF kernel is commonly used
svr_model_90.fit(X_train_90.reshape(X_train_90.shape[0], -1), y_train_90)

# Make predictions
predictions_20 = svr_model_80.predict(X_test_20.reshape(X_test_20.shape[0], -1))
predictions_10 = svr_model_90.predict(X_test_10.reshape(X_test_10.shape[0], -1))



# Calculate Mean Squared Error for both splits
mse_20 = mean_squared_error(y_test_20, predictions_20)
mse_10 = mean_squared_error(y_test_10, predictions_10)

print("Mean Squared Error (80%-20% split):", mse_20)
print("Mean Squared Error (90%-10% split):", mse_10)



Mean Squared Error (80%-20% split): 414.8126584916529
Mean Squared Error (90%-10% split): 432.38880238220963


  correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)


# Bradient boosting

In [None]:
import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import openmeteo_requests
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define retry and cache mechanisms for API requests
retry_strategy = Retry(
    total=5,
    backoff_factor=0.2,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
cache_session.mount("https://", adapter)

# Initialize Open-Meteo API client
openmeteo = openmeteo_requests.Client(session=cache_session)

# Make API request
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 22.8101,
    "longitude": 86.2634,
    "start_date": "2024-02-15",
    "end_date": "2024-03-07",
    "hourly": ["temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation",
               "surface_pressure", "cloud_cover", "wind_speed_100m", "wind_direction_100m",
               "is_day", "sunshine_duration", "direct_radiation", "diffuse_radiation"],
    "timezone": "auto"
}
response = openmeteo.weather_api(url, params=params)[0]

# Extract data
hourly = response.Hourly()
hourly_data = {
    # "date": pd.date_range(
    #     start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
    #     end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
    #     freq=pd.Timedelta(seconds=hourly.Interval()),
    #     closed="left"
    # ),
    "temperature_2m": hourly.Variables(0).ValuesAsNumpy(),
    "relative_humidity_2m": hourly.Variables(1).ValuesAsNumpy(),
    "dew_point_2m": hourly.Variables(2).ValuesAsNumpy(),
    "precipitation": hourly.Variables(3).ValuesAsNumpy(),
    "surface_pressure": hourly.Variables(4).ValuesAsNumpy(),
    "cloud_cover": hourly.Variables(5).ValuesAsNumpy(),
    "wind_speed_100m": hourly.Variables(6).ValuesAsNumpy(),
    "wind_direction_100m": hourly.Variables(7).ValuesAsNumpy(),
    "is_day": hourly.Variables(8).ValuesAsNumpy(),
    "sunshine_duration": hourly.Variables(9).ValuesAsNumpy(),
    "direct_radiation": hourly.Variables(10).ValuesAsNumpy(),
    "diffuse_radiation": hourly.Variables(11).ValuesAsNumpy()
}

# Create DataFrame
hourly_dataframe = pd.DataFrame(hourly_data)

# Convert data types
#hourly_dataframe["date"] = pd.to_datetime(hourly_dataframe["date"])
hourly_dataframe["is_day"] = hourly_dataframe["is_day"].astype(bool)
hourly_dataframe = hourly_dataframe.apply(pd.to_numeric, errors='coerce')

# Drop rows with NaN values
hourly_dataframe.dropna(inplace=True)

# Define features and target variable
X = hourly_dataframe.drop(columns=["cloud_cover"])
y = hourly_dataframe["cloud_cover"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train Gradient Boosting Regressor model
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

# Make predictions
y_pred = gbr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 351.17054203170875


In [None]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize Gradient Boosting Regressor
gbr = GradientBoostingRegressor()

# Perform Grid Search CV
grid_search = GridSearchCV(gbr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best model
best_gbr = grid_search.best_estimator_

# Train the best model
best_gbr.fit(X_train, y_train)

# Predict on the testing set
y_pred = best_gbr.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
# Calculate MAE
mae = np.mean(np.abs(y_test - y_pred))
# Calculate RSE
rse = mse / np.var(y_test)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)


Mean Squared Error: 267.13530591176016
mae: 12.132776200774352
rse: 0.6221868608506471
rmse: 16.34427440762545


# Only correlated features

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional,Dropout
from keras.optimizers import Adam
data=hourly_dataframe

# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1 # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()


# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', 'drop', low_corr_features)  # Drop low correlated features
    ],
    remainder='passthrough'  # Include any remaining columns not specified in transformers
)
# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)
# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Print the feature names
print(feature_names_out)


# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
print(y)

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the bidirectional LSTM model
def create_bidirectional_lstm_model():
    model = Sequential()
    model.add(Bidirectional(LSTM(units=32, return_sequences=True), input_shape=(X_train_80.shape[1], X_train_80.shape[2])))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=32)))
    model.add(Dropout(0.3))
    model.add(Dense(1))
    return model

# Compile the model
model = create_bidirectional_lstm_model()
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model (80%-20% split)
model.fit(X_train_80, y_train_80, epochs=50, batch_size=32, verbose=1, validation_data=(X_test_20, y_test_20))
model.save("your_model_name.h5")
#To load tf.keras.models.load_model()

# Make predictions
predictions_20 = model.predict(X_test_20)


# Calculate Mean Squared Error for both splits
mse_20 = mean_squared_error(y_test_20, predictions_20)

print("Mean Squared Error (80%-20% split):", mse_20)


['remainder__temperature_2m' 'remainder__dew_point_2m'
 'remainder__precipitation' 'remainder__cloud_cover' 'remainder__is_day'
 'remainder__diffuse_radiation']
[16.857500076293945, 18.957500457763672, 19.457500457763672, 20.3075008392334, 21.907499313354492, 20.957500457763672, 23.407499313354492, 25.207500457763672, 25.457500457763672, 26.5575008392334, 26.357500076293945, 26.0575008392334, 23.857500076293945, 20.0575008392334, 20.607500076293945, 19.5575008392334, 19.00749969482422, 18.457500457763672, 18.707500457763672, 16.707500457763672, 16.907499313354492, 16.207500457763672, 16.00749969482422, 15.507499694824219, 15.557499885559082, 17.407499313354492, 19.0575008392334, 21.457500457763672, 23.657499313354492, 25.50749969482422, 26.8075008392334, 27.457500457763672, 27.907499313354492, 28.0575008392334, 27.957500457763672, 27.3075008392334, 27.3075008392334, 25.50749969482422, 21.75749969482422, 19.457500457763672, 18.657499313354492, 17.657499313354492, 16.707500457763672, 16.

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
for i in range(len(predictions_20)):
  print(f"{y_test_20[i]}  {predictions_20[i]}")

0.0  [-0.01779559]
0.0  [0.00345083]
0.0  [0.04938785]
0.0  [-0.03089701]
0.0  [0.00909953]
0.0  [0.01538204]
0.0  [0.03834464]
0.0  [0.00794513]
0.0  [-0.00154605]
0.0  [0.04684485]
0.0  [0.01357909]
0.0  [0.03660073]
0.0  [-0.0038519]
0.0  [-0.01031295]
0.0  [0.01055505]
0.0  [0.01125633]
0.0  [0.05045082]
0.0  [0.0037007]
0.0  [0.00776191]
0.0  [0.01050768]
0.0  [-0.06926161]
0.0  [0.01327034]
0.0  [-0.03206972]
0.0  [0.0533639]
0.0  [-0.00294903]
0.0  [-0.03286945]
0.0  [0.01019238]
0.0  [-0.02745265]
0.0  [0.04095551]
0.0  [0.00477759]
0.0  [-0.02477651]
0.0  [0.02979022]
0.0  [0.00988598]
0.0  [0.01055505]
0.10000000149011612  [0.00983606]
0.0  [0.01055505]
0.0  [0.01386825]
0.0  [0.01055505]
0.0  [0.0077253]
0.0  [0.02901131]
0.0  [0.01595106]
0.0  [0.03077976]
0.0  [0.03336083]
0.0  [0.00238397]
0.0  [-0.01361361]
0.0  [0.0797815]
0.0  [0.01055505]
0.0  [-0.01764097]
0.0  [0.01055505]
0.0  [-0.00144126]
0.0  [-0.06560966]
0.0  [0.00866807]
0.0  [-0.0385942]
0.0  [0.01017092]
0.

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Load data
data = hourly_dataframe  # Assuming hourly_dataframe is your dataset

# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1  # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])


# Define column transformer to apply different preprocessing steps to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', 'drop', low_corr_features)
    ])

# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)

# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Rearrange data into time series format
time_steps = 2  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_90, X_test_10, y_train_90, y_test_10 = train_test_split(X, y, test_size=0.1, random_state=42)

#Scale the features
scaler = StandardScaler()
X_train_80_scaled = scaler.fit_transform(X_train_80.reshape(-1, X_train_80.shape[-1])).reshape(X_train_80.shape)
X_test_20_scaled = scaler.transform(X_test_20.reshape(-1, X_test_20.shape[-1])).reshape(X_test_20.shape)
X_train_90_scaled = scaler.fit_transform(X_train_90.reshape(-1, X_train_90.shape[-1])).reshape(X_train_90.shape)
X_test_10_scaled = scaler.transform(X_test_10.reshape(-1, X_test_10.shape[-1])).reshape(X_test_10.shape)

# Define and train the Support Vector Regressor (SVR) model (80%-20% split)
svr_model_80 = SVR(kernel='rbf')  # RBF kernel is commonly used
svr_model_80.fit(X_train_80.reshape(X_train_80.shape[0], -1), y_train_80)

# Define and train the Support Vector Regressor (SVR) model (90%-10% split)
svr_model_90 = SVR(kernel='rbf')  # RBF kernel is commonly used
svr_model_90.fit(X_train_90.reshape(X_train_90.shape[0], -1), y_train_90)

# Make predictions
predictions_20 = svr_model_80.predict(X_test_20.reshape(X_test_20.shape[0], -1))
predictions_10 = svr_model_90.predict(X_test_10.reshape(X_test_10.shape[0], -1))



# Calculate Mean Squared Error for both splits
mse_20 = mean_squared_error(y_test_20, predictions_20)
mse_10 = mean_squared_error(y_test_10, predictions_10)

print("Mean Squared Error (80%-20% split):", mse_20)
print("Mean Squared Error (90%-10% split):", mse_10)

# Calculate MAE
mae = np.mean(np.abs(y_test_20 - predictions_20))
# Calculate RSE
rse = mse / np.var(y_test_20)
# Calculate RMSE
rmse = np.sqrt(mse_20)
print("Mean Squared Error:", mse_20)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)



Mean Squared Error (80%-20% split): 304.74293736260034
Mean Squared Error (90%-10% split): 275.7221567924782
Mean Squared Error: 304.74293736260034
mae: 10.727742196068093
rse: 0.7528117
rmse: 17.456887963282583


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
data=hourly_dataframe
# Load the dataset
#data = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual filename

# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1  # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold]
high_corr_features=high_corr_features.drop('cloud_cover')
high_corr_features = high_corr_features.index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# # Prepare the data for imputation
y = data['cloud_cover']
X = data.drop(columns=['cloud_cover'])


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=20, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

# Define column transformer to apply different preprocessing steps to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features)
        ,('low_corr', 'drop', low_corr_features)
    ])

preprocessor.fit(X_train,y_train)
# Define the model
model = RandomForestRegressor()

# Create a pipeline with preprocessing and modeling steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Train the model
pipeline.fit(X_train, y_train)
#model.fit(X_train,y_train)

# Make predictions
predictions = pipeline.predict(X_test)
#predictions=model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)


Mean Squared Error: 328.4602858344121


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Define preprocessing function
def preprocess_data(data):
    correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)
    correlation_threshold = 0.1
    high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
    low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

    high_corr_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(max_iter=10, random_state=42))
    ])

    low_corr_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=3))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('high_corr', high_corr_transformer, high_corr_features),
            ('low_corr', 'drop', low_corr_features)
        ])

    data_preprocessed = preprocessor.fit_transform(data)
    return data_preprocessed, preprocessor.get_feature_names_out()

# Define function to create LSTM model
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=50, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=50)))
    model.add(Dropout(0.3))
    model.add(Dense(units=2))
    return model

# Define early stopping callback
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

# Load and preprocess the data
data_preprocessed, feature_names_out = preprocess_data(hourly_dataframe)

# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
X = np.array(X)
y = np.array(y)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape data for LSTM input
# X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
# X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Create and compile the model
model = create_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]))
model.compile(optimizer=Adam(lr=0.001), loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)



# Best Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional,Dropout
from keras.optimizers import Adam
data=hourly_dataframe

# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1 # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()


# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', 'drop', low_corr_features)  # Drop low correlated features
    ],
    remainder='passthrough'  # Include any remaining columns not specified in transformers
)
# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)
# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Print the feature names
print(feature_names_out)


# Rearrange data into time series format
time_steps = 5  # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
print(y)

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the bidirectional LSTM model
def create_bidirectional_lstm_model():
    model = Sequential()
    model.add(Bidirectional(LSTM(units=32, return_sequences=True), input_shape=(X_train_80.shape[1], X_train_80.shape[2])))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=32)))
    model.add(Dropout(0.3))
    model.add(Dense(1))
    return model

# Compile the model
model = create_bidirectional_lstm_model()
model.compile(optimizer=Adam(), loss='mean_squared_error')

# Train the model (80%-20% split)
model.fit(X_train_80, y_train_80, epochs=50, batch_size=32, verbose=1, validation_data=(X_test_20, y_test_20))
model.save("your_model_name.h5")
#To load tf.keras.models.load_model()

# Make predictions
predictions_20 = model.predict(X_test_20)


# Calculate Mean Squared Error for both splits
mse_20 = mean_squared_error(y_test_20, predictions_20)

print("Mean Squared Error (80%-20% split):", mse_20)


In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential,load_model
from keras.layers import LSTM, Dense, Bidirectional, Dropout
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.constraints import non_neg
from keras import backend as K

# Assuming 'hourly_dataframe' is your dataset
data = hourly_dataframe.copy()
data=data.drop(columns=['date'])
# Handling Missing Values
# You can choose either IterativeImputer or KNNImputer
imputer = IterativeImputer(max_iter=10, random_state=42)
data_imputed = imputer.fit_transform(data)

# Convert the imputed data back to a DataFrame
data_imputed_df = pd.DataFrame(data_imputed, columns=data.columns)

# Calculate correlation between features and target variable
correlation = np.abs(data_imputed_df.corr()['cloud_cover']).sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1 # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()

# Define the ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('low_corr', 'drop', low_corr_features)  # Drop low correlated features
    ],
    remainder='passthrough'  # Include any remaining columns not specified in transformers
)

# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data_imputed_df)
# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()
print(feature_names_out)
# Rearrange data into time series format
time_steps = 2 # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][3])  # Assuming index 0 is the target variable

X = np.array(X)
y = np.array(y)
# Split the data into training and testing sets (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=32)#42#32


# Define the bidirectional LSTM model
def create_bidirectional_lstm_model(input_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(units=84, return_sequences=True), input_shape=input_shape))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(units=84)))
    model.add(Dropout(0.3))
    model.add(Dense(1,kernel_constraint=non_neg()))
    return model

# Compile the model
print(X_train)
print(X_train.shape[1],',', X_train.shape[2])
model = create_bidirectional_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]))
model.compile(optimizer=Adam(),loss='mean_squared_error')

# Define a model checkpoint callback to save the model with the lowest validation loss
checkpoint_path = "best_model.h5"
model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1)

# Train the model with the checkpoint callback
history = model.fit(X_train, y_train, epochs=32, batch_size=8, verbose=1, validation_data=(X_test, y_test), callbacks=[model_checkpoint])

# Load the best model from the checkpoint
best_model = load_model(checkpoint_path)

# Make predictions with the best model
predictions = best_model.predict(X_test)

# Ensure predictions are non-negative
predictions = np.maximum(predictions, 0.0)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
# Calculate MAE
mae = np.mean(np.abs(y_test - predictions))
# Calculate RSE
rse = mse / np.var(y_test)
# Calculate RMSE
rmse = np.sqrt(mse)
print("Mean Squared Error:", mse)
print("mae:",mae)
print("rse:",rse)
print("rmse:",rmse)



['remainder__temperature_2m' 'remainder__dew_point_2m'
 'remainder__precipitation' 'remainder__cloud_cover' 'remainder__is_day'
 'remainder__diffuse_radiation']
[[[ 19.5075    13.2575     0.         0.         0.         0.      ]
  [ 18.8575    13.6075     0.         0.         0.         0.      ]]

 [[ 27.9075    12.8075     0.         3.6        1.       126.      ]
  [ 27.3575    14.7075     0.         0.6        1.        95.      ]]

 [[ 30.3575    15.8075     0.        62.400005   1.       358.      ]
  [ 30.4075    15.9575     0.        60.300003   1.       315.      ]]

 ...

 [[ 25.5575    15.1575     0.        52.2        0.         0.      ]
  [ 25.6575    14.2575     0.        21.6        0.         0.      ]]

 [[ 28.0075    15.2075     0.         0.         1.        94.      ]
  [ 25.1075    15.6075     0.         0.6        1.        44.      ]]

 [[ 21.1575    20.3575     1.2       38.100002   0.         0.      ]
  [ 20.2575    19.8575     0.        17.7        0.  

  saving_api.save_model(


Epoch 2: val_loss improved from 186.64204 to 132.65544, saving model to best_model.h5
Epoch 3/32
Epoch 3: val_loss improved from 132.65544 to 131.66283, saving model to best_model.h5
Epoch 4/32
Epoch 4: val_loss improved from 131.66283 to 128.78586, saving model to best_model.h5
Epoch 5/32
Epoch 5: val_loss did not improve from 128.78586
Epoch 6/32
Epoch 6: val_loss did not improve from 128.78586
Epoch 7/32
Epoch 7: val_loss improved from 128.78586 to 120.84196, saving model to best_model.h5
Epoch 8/32
Epoch 8: val_loss did not improve from 120.84196
Epoch 9/32
Epoch 9: val_loss improved from 120.84196 to 118.56646, saving model to best_model.h5
Epoch 10/32
Epoch 10: val_loss did not improve from 118.56646
Epoch 11/32
Epoch 11: val_loss did not improve from 118.56646
Epoch 12/32
Epoch 12: val_loss did not improve from 118.56646
Epoch 13/32
Epoch 13: val_loss did not improve from 118.56646
Epoch 14/32
Epoch 14: val_loss did not improve from 118.56646
Epoch 15/32
Epoch 15: val_loss did n

In [None]:

for i in range(10):
  print(f"input data{X_test[i]} \n: Output:{y_test[i]}     {predictions[i]}")

input data[[ 30.7575  12.0575   0.       0.6      1.     151.    ]
 [ 31.0075  10.0075   0.       0.       1.     142.    ]] 
: Output:0.0     [1.6724901]
input data[[20.0075   17.1075    0.       21.        0.        0.      ]
 [19.4075   17.2575    0.       27.300001  0.        0.      ]] 
: Output:9.300000190734863     [26.405493]
input data[[21.8075 18.0075  0.      0.      1.     48.    ]
 [24.4575 18.4075  0.      0.      1.     95.    ]] 
: Output:0.0     [1.142267]
input data[[21.7575 14.0575  0.      0.      0.      0.    ]
 [21.0575 13.5575  0.      0.6     0.      0.    ]] 
: Output:0.0     [2.0475233]
input data[[ 28.5575    15.9075     0.        27.300001   1.       177.      ]
 [ 28.0075    13.4075     0.        38.4        1.       169.      ]] 
: Output:15.600000381469727     [31.62913]
input data[[21.4075 13.4075  0.     23.1     1.     48.    ]
 [23.1575 11.5575  0.      2.4     0.      2.    ]] 
: Output:44.70000076293945     [15.112878]
input data[[18.5575 18.4075  

# TRIAL

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional,Dropout
from keras.optimizers import Adam
data=hourly_dataframe.copy()
data.drop(columns=['date'], inplace=True)
# Calculate correlation between features and target variable
correlation = data.corr()['cloud_cover'].abs().sort_values(ascending=False)

# Define the threshold for correlation
correlation_threshold = 0.1 # Adjust as needed

# Split features into two groups based on correlation
high_corr_features = correlation[correlation >= correlation_threshold].index.tolist()
low_corr_features = correlation[correlation < correlation_threshold].index.tolist()


# Define preprocessing steps for high correlation features using IterativeImputer
high_corr_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=42)),  # IterativeImputer for high correlation features
    # Add more preprocessing steps if necessary
])

preprocessor = ColumnTransformer(
    transformers=[
        ('high_corr', high_corr_transformer, high_corr_features),
        ('low_corr', 'drop', low_corr_features)  # Drop low correlated features
    ],
    remainder='passthrough'  # Include any remaining columns not specified in transformers
)
# Apply preprocessing steps to the data
data_preprocessed = preprocessor.fit_transform(data)
# Get the feature names of the transformed data
feature_names_out = preprocessor.get_feature_names_out()

# Print the feature names
print(feature_names_out)


# Rearrange data into time series format
time_steps = 2 # Define the number of time steps
X = []
y = []
for i in range(len(data_preprocessed) - time_steps):
    X.append(data_preprocessed[i:i+time_steps])
    y.append(data_preprocessed[i+time_steps][0])
print(y)

X = np.array(X)
y = np.array(y)

# Split the data into training and testing sets (80%-20% and 90%-10%)
X_train_80, X_test_20, y_train_80, y_test_20 = train_test_split(X, y, test_size=0.2, random_state=42)
import numpy as np
import tensorflow as tf

# Load TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="/content/best_model.tflite")
interpreter.allocate_tensors()

# Get input and output tensors.
input_details = interpreter.get_input_details()
input_shape = input_details[0]['shape']
print("Expected input shape:", input_shape)
output_details = interpreter.get_output_details()

# Prepare input data (similar to what was done during training)
# Assuming X_test_20 is your test data
# Prepare input data (similar to what was done during training)
input_data = np.array(X_test_20, dtype=np.float32)
print("Input data shape:", input_data.shape)

# Prepare an array to store predictions for each sample
predictions_tflite = []

# Loop through each sample in the input data
for sample in input_data:
    # Reshape each sample to match the expected input shape of the model
    sample = np.reshape(sample, (1,2,6))  # Exclude the batch dimension

    # Run inference for the current sample
    interpreter.set_tensor(input_details[0]['index'], sample)
    interpreter.invoke()
    prediction = interpreter.get_tensor(output_details[0]['index'])

    # Collect predictions
    predictions_tflite.append(prediction[0]/2.87)

# Convert predictions to a numpy array
predictions_tflite = np.array(predictions_tflite)

# Calculate Mean Squared Error for all predictions
for i in range(20):
  print(f"{X_test_20[i]} :{y_test_20[i]} : {predictions_tflite[i]}")
mse_tflite = mean_squared_error(y_test_20, predictions_tflite)

print("Mean Squared Error (TFLite) for all samples:", mse_tflite)





['high_corr__cloud_cover' 'high_corr__precipitation'
 'high_corr__diffuse_radiation' 'high_corr__dew_point_2m'
 'high_corr__is_day' 'high_corr__temperature_2m']
[18.600002, 16.5, 12.900001, 15.0, 18.0, 67.5, 60.300003, 82.200005, 19.5, 46.5, 59.100002, 13.500001, 15.6, 20.400002, 32.4, 13.200001, 13.200001, 7.8, 1.8000001, 0.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 10.5, 11.4, 6.6000004, 16.8, 5.4, 4.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.8000001, 4.2000003, 3.6, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6, 0.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.6, 4.8, 6.6000004, 7.2000003, 0.90000004, 0.0, 76.5, 65.7, 68.1, 31.8, 6.6000004, 9.6, 4.8, 11.400001, 21.0, 9.6, 4.8, 3.6000001, 0.6, 2.4, 9.3, 4.2000003, 17.400002, 22.5, 22.500002, 28.500002, 24.300001, 21.300001, 3.9, 0.3, 0.0, 0.0, 0.0, 0.0, 0.0, 9.6, 32.4, 12.0, 12.0, 26