In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [5]:
df=pd.read_csv("solarpowergeneration.csv",index_col=False)
df
df.isnull().sum()
df['average-wind-speed-(period)']=df['average-wind-speed-(period)'].fillna(df['average-wind-speed-(period)'].median())

def cap_outliers(df, column):
    low = df[column].quantile(0.25)
    high = df[column].quantile(0.75)
    df[column] = np.where(df[column] < low, low, df[column])
    df[column] = np.where(df[column] > high, high, df[column])
    return df

# Apply capping to features (not target)
feature_cols = df.columns.drop('power-generated')
for col in feature_cols:
    if np.issubdtype(df[col].dtype, np.number):
        df = cap_outliers(df, col)
print("Zero values:", (df['power-generated'] == 0).sum())
print("Negative values:", (df['power-generated'] < 0).sum())

# Apply log1p transformation (handles zero values safely)
df['log_power_generated'] = np.log1p(df['power-generated'])

def find_multicollinear_features(dataset, threshold=0.8):
    r = dataset.corr()  # Compute correlation matrix
    col_corr = set()  # Store highly correlated columns
    
    for i in range(len(r.columns)):
        for j in range(i):
            if abs(r.iloc[i, j]) > threshold:  # Check correlation magnitude
                col_name = r.columns[i]
                col_corr.add(col_name)  # Add column to set
                
    return col_corr 

high_corr_features = find_multicollinear_features(df, 0.6)
print("Columns to Drop:", high_corr_features)

df.drop(columns=['average-wind-speed-(period)'],inplace=True)
features=df.drop(['power-generated','log_power_generated'],axis=1)
target=df['log_power_generated']
## scaling 
feature_names =['distance-to-solar-noon','temperature','wind-direction','wind-speed','sky-cover','visibility','humidity','average-pressure-(period)']

scal=StandardScaler()

scaled_features=scal.fit_transform(features)
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(scaled_features,target,test_size=0.2, random_state=42)



# XGBoost Regressor Model
model_xgb = xgb.XGBRegressor(learning_rate=0.1,max_depth=3,n_estimators=100)
model_xgb.fit(X_train, y_train)

joblib.dump(model_xgb,"xgb_model.pkl")
# Save Scaler
joblib.dump(scal, "scaler.pkl")
# Save Feature Names
joblib.dump(feature_names, "feature_names.pkl")

print("✅ Model, Scaler & Feature Names Saved!")



Zero values: 1320
Negative values: 0
Columns to Drop: {'power-generated', 'average-wind-speed-(period)', 'log_power_generated'}
✅ Model, Scaler & Feature Names Saved!
