In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv("solar_visualization.csv")
data.head()

Unnamed: 0,Country,State,Administrative Region,City,GrossPower,MainOrientation,NetRatedPower,FeedInType,AssignedActivePowerInverter,NumberOfModules,...,CommissioningMonth,CommissioningDay,RegistrationYear,RegistrationMonth,RegistrationDay,TiltSinRange,TiltCosRange,TimeSinceCommissioning,Efficiency,PowerPerModule
0,Germany,Nordrhein-Westfalen,Münster,Münster,3.96,South,3.96,Full Feed-in,4.0,22.0,...,7,20,2019,2,1,"(0.3090169943749474, 0.5877852522924731)","(0.9510565162951535, 0.8090169943749475)",6369,1.0,0.18
1,Germany,Baden-Württemberg,Ostalbkreis,Schwäbisch Gmünd,7.41,South,7.41,Partial Feed-in,8.3,38.0,...,1,31,2019,1,31,"(0.3090169943749474, 0.5877852522924731)","(0.9510565162951535, 0.8090169943749475)",4347,1.0,0.195
2,Germany,Brandenburg,Havelland,Nauen,5.04,South,5.0,Partial Feed-in,5.0,16.0,...,2,19,2019,1,31,"(0.3090169943749474, 0.5877852522924731)","(0.9510565162951535, 0.8090169943749475)",3233,0.992063,0.315
3,Germany,Bayern,Regensburg,Pentling,6.36,South-West,6.0,Partial Feed-in,6.0,24.0,...,12,16,2019,1,31,"(0.3090169943749474, 0.5877852522924731)","(0.9510565162951535, 0.8090169943749475)",2932,0.943396,0.265
4,Germany,Saarland,Saarlouis,Saarlouis,7.2,West,7.2,Partial Feed-in,7.6,30.0,...,12,8,2019,1,31,"(0.3090169943749474, 0.5877852522924731)","(0.9510565162951535, 0.8090169943749475)",4767,1.0,0.24


In [3]:
data.duplicated().sum()

23164

In [4]:
data=data.drop_duplicates()
data.shape

(3963767, 24)

In [5]:
data_model=data[['State', 'Administrative Region', 'City','MainOrientation',
                 'FeedInType', 'AssignedActivePowerInverter','Location',
                 'NumberOfModules','GrossPower', 'NetRatedPower',
                 ]]

In [6]:
Q1=data_model.quantile(0.25)
Q3=data_model.quantile(0.75)
IQR=Q3

In [7]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [8]:
# Ensure lower_bound and upper_bound are DataFrames aligned to data
lower_bound_df = pd.DataFrame([lower_bound] * len(data_model), columns=data_model.columns, index=data_model.index)
upper_bound_df = pd.DataFrame([upper_bound] * len(data_model), columns=data_model.columns, index=data_model.index)

In [9]:
# Apply the filtering logic
mask = ~((data_model < lower_bound_df) | (data_model > upper_bound_df)).any(axis=1)
data_model_no_outliers = data_model[mask]

In [10]:
X=data_model_no_outliers[['State', 'Administrative Region', 'City','MainOrientation',
                          'FeedInType', 'AssignedActivePowerInverter','Location', 'NumberOfModules']]

y = data_model_no_outliers[["GrossPower", "NetRatedPower"]]

In [11]:
X.head()

Unnamed: 0,State,Administrative Region,City,MainOrientation,FeedInType,AssignedActivePowerInverter,Location,NumberOfModules
0,Nordrhein-Westfalen,Münster,Münster,South,Full Feed-in,4.0,"Structural installations (house roof, building...",22.0
1,Baden-Württemberg,Ostalbkreis,Schwäbisch Gmünd,South,Partial Feed-in,8.3,"Structural installations (house roof, building...",38.0
2,Brandenburg,Havelland,Nauen,South,Partial Feed-in,5.0,"Structural installations (house roof, building...",16.0
3,Bayern,Regensburg,Pentling,South-West,Partial Feed-in,6.0,"Structural installations (house roof, building...",24.0
4,Saarland,Saarlouis,Saarlouis,West,Partial Feed-in,7.6,"Structural installations (house roof, building...",30.0


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2823751, 8) (705938, 8) (2823751, 2) (705938, 2)


In [15]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [16]:
X_train.describe()

Unnamed: 0,AssignedActivePowerInverter,NumberOfModules
count,2823751.0,2823751.0
mean,7.763958,27.38333
std,4.826545,19.07174
min,0.0,1.0
25%,4.6,16.0
50%,8.0,24.0
75%,10.0,34.0
max,27.5,105.0


In [17]:
X_train.columns

Index(['State', 'Administrative Region', 'City', 'MainOrientation',
       'FeedInType', 'AssignedActivePowerInverter', 'Location',
       'NumberOfModules'],
      dtype='object')

In [18]:
numerical_fatures=["AssignedActivePowerInverter", 'NumberOfModules']
categorical_fatures=['State', 'Administrative Region', 'City', 'MainOrientation', 'FeedInType', 'Location',]

In [19]:
numerical_scaler=MinMaxScaler()
categorical_encoder=OneHotEncoder(sparse=True, handle_unknown='ignore')

In [20]:
# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_scaler, numerical_fatures),
        ("categorical", categorical_encoder, categorical_fatures),
    ]
)

In [21]:
# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Apply the same transformation to the test data
X_test_transformed = preprocessor.transform(X_test)



In [22]:
print(X_train_transformed.shape, X_test_transformed.shape, y_train.shape, y_test.shape)

(2823751, 10957) (705938, 10957) (2823751, 2) (705938, 2)


In [26]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
# Initialize the model
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)

In [28]:
# Train the model
xgb_model.fit(X_train_transformed, y_train)

In [31]:
# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test_transformed)

In [32]:
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost - Mean Squared Error:", mse_xgb)
print("XGBoost - R2 Score:", r2_xgb)


XGBoost - Mean Squared Error: 1.1424822592161583
XGBoost - R2 Score: 0.9551025752724178


In [33]:
from sklearn.model_selection import RandomizedSearchCV

In [34]:
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'n_estimators': [50, 100, 200],
    'gamma': [0, 1, 5],
    'min_child_weight': [1, 2, 3]
}

In [35]:
xgb_random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5, verbose=2, random_state=42)
xgb_random_search.fit(X_train_transformed, y_train)
print(xgb_random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.9; total time= 4.2min
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.9; total time= 4.9min
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.9; total time= 4.8min
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.9; total time= 4.9min
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.05, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.9; total time= 4.8min
[CV] END colsample_bytree=0.9, gamma=5, learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=50, subsample=1; total time= 2.0min
[CV] END colsample_bytree=0.9, gamma=5, learning_rate=0.01, max_depth=6,

In [36]:
best_params = xgb_random_search.best_params_

In [37]:
best_xgb_model = xgb.XGBRegressor(**best_params, random_state=42)

In [38]:
best_xgb_model.fit(X_train_transformed, y_train)

In [39]:
# Predict on the test set
y_pred_xgb_best = best_xgb_model.predict(X_test_transformed)

In [40]:
# Evaluate the model
mse_xgb_best = mean_squared_error(y_test, y_pred_xgb_best)
r2_xgb_best = r2_score(y_test, y_pred_xgb_best)

print("XGBoost - Mean Squared Error:", mse_xgb_best)
print("XGBoost - R2 Score:", r2_xgb_best)


XGBoost - Mean Squared Error: 1.0978795102484857
XGBoost - R2 Score: 0.9568432445552162


In [41]:
import joblib

In [42]:
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', best_xgb_model)
])

In [43]:
# Save the pipeline to a file
joblib.dump(full_pipeline, "xgb_full_pipeline.pkl")

print("Full pipeline saved as xgb_full_pipeline.pkl")

Full pipeline saved as xgb_full_pipeline.pkl


In [44]:
# Load the pipeline
loaded_pipeline = joblib.load("xgb_full_pipeline.pkl")

In [45]:
# Use the loaded pipeline to make predictions
y_pred_loaded = loaded_pipeline.predict(X_test)

print(y_pred_loaded)

[[6.6572146 3.687137 ]
 [5.796834  5.731938 ]
 [8.248131  8.219451 ]
 ...
 [7.6353245 7.4523635]
 [5.786848  4.9900026]
 [4.312643  3.917955 ]]
