In [153]:
from sklearn.feature_selection import mutual_info_regression
import xgboost as xgb
import pandas as pd

In [155]:
# Load your data (replace these lines with your actual data loading code)
X_train_observed = pd.read_csv("X_train_observed.csv")
train_targets = pd.read_csv("train_targets.csv")

# Align your data based on timestamps (if needed)
aligned_data = pd.merge(train_targets, X_train_observed, left_on='time', right_on='date_forecast', how='inner')
y_aligned = aligned_data['pv_measurement']
X_aligned = aligned_data.drop(['time', 'pv_measurement', 'date_forecast'], axis=1)

# Split the data into training and validation sets
# Time-based Splitting (replace 'time' and 'date_forecast' with your actual timestamp columns)
aligned_data = aligned_data.sort_values(by='time')  # Sorting by time
train_size = int(len(aligned_data) * 0.8)  # Using 80% for training

# Split the aligned data into training and validation sets based on time
train_data = aligned_data[:train_size]
val_data = aligned_data[train_size:]

# Separate features and target variable for both training and validation sets
X_train = train_data.drop(['time', 'pv_measurement', 'date_forecast'], axis=1)
y_train = train_data['pv_measurement']
X_val = val_data.drop(['time', 'pv_measurement', 'date_forecast'], axis=1)
y_val = val_data['pv_measurement']

# XGBoost Feature Importance
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror')
xgb_model.fit(X_train, y_train)
xgb_feature_importance = xgb_model.feature_importances_
xgb_feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'XGB_Importance': xgb_feature_importance})

# Pearson's Correlation (Excluding the timestamp column)
correlation_matrix = aligned_data.drop(['time', 'date_forecast'], axis=1).corr()
correlation_with_target = correlation_matrix['pv_measurement'].sort_values(ascending=False)

# Mutual Information
mutual_info = mutual_info_regression(X_train, y_train)
mutual_info_df = pd.DataFrame({'Feature': X_train.columns, 'Mutual_Info': mutual_info})

# Combine All Feature Importances
combined_feature_importance_df = pd.merge(pd.merge(xgb_feature_importance_df, correlation_with_target, left_on='Feature', right_index=True), mutual_info_df, on='Feature')
combined_feature_importance_df['Total_Importance'] = combined_feature_importance_df['XGB_Importance'] + combined_feature_importance_df['pv_measurement'] + combined_feature_importance_df['Mutual_Info']

# Sort by Total Importance
sorted_combined_feature_importance_df = combined_feature_importance_df.sort_values(by='Total_Importance', ascending=False)

# Display the top 10 most consistently important features
top_10_combined_features = sorted_combined_feature_importance_df.head(10)
print(top_10_combined_features)

                  Feature  XGB_Importance  pv_measurement  Mutual_Info  \
10           direct_rad:W        0.560381        0.853315     0.737941   
4         clear_sky_rad:W        0.042977        0.803990     0.845601   
36        sun_elevation:d        0.013142        0.691070     0.914885   
8           diffuse_rad:W        0.058068        0.705704     0.802399   
3   clear_sky_energy_1h:J        0.004355        0.781647     0.739144   
11        direct_rad_1h:J        0.005773        0.828454     0.673928   
9        diffuse_rad_1h:J        0.005839        0.689930     0.699093   
19             is_day:idx        0.000000        0.537770     0.565029   
35          sun_azimuth:d        0.039064       -0.064520     0.622470   
38            t_1000hPa:K        0.012213        0.338399     0.088628   

    Total_Importance  
10          2.151636  
4           1.692568  
36          1.619097  
8           1.566170  
3           1.525147  
11          1.508155  
9           1.394862  
1