## Solar Prediction - Linear Regression

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the data.
# The CSV location can be overridden with the SOLAR_DATA_CSV environment variable so the
# notebook can run on another machine without editing this cell. When the variable is
# unset, the original hard-coded path is used, so existing behavior is unchanged.
DATA_CSV_PATH = os.environ.get(
    'SOLAR_DATA_CSV',
    '/Users/ben/Desktop/Senior-project/Final_Merged_Energy_and_Weather_Data.csv',
)
data = pd.read_csv(DATA_CSV_PATH)


In [2]:
# Build lagged copies of the solar output: lag k holds the 'Solar' value from k rows
# earlier. Three lags (1, 2 and 3) are created here.
solar_output = data['Solar']
data['solar_output_lag1'] = solar_output.shift(periods=1)
data['solar_output_lag2'] = solar_output.shift(periods=2)
data['solar_output_lag3'] = solar_output.shift(periods=3)

# shift() leaves NaN in the leading rows of each lag column. dropna() removes, in place,
# every row that has a missing value in ANY column (not only the lag columns).
# NOTE(review): because `data` is modified in place, re-running this cell without
# reloading the CSV re-creates the leading NaNs and drops further rows.
data.dropna(inplace=True)




In [3]:

# Define features and target (same as in your decision tree)
# Currently left out of the feature list: 'temp', 'solarradiation'
features = [
    'uvindex', 'precip', 'solarenergy',
    'solar_output_lag1', 'solar_output_lag2',
    'snow', 'cloudcover', 'humidity',
]
X = data[features]
y = data['Solar']

# Hold out 22% of the rows for testing (test_size=0.22, i.e. a 78/22 split).
# random_state pins the shuffle so the split, and every metric computed from it, is the
# same on each Restart & Run All.
# NOTE(review): shuffle=True draws test rows from all positions in the data. Since the
# features include lagged values of the target, a chronological split (shuffle=False)
# may be the more appropriate evaluation -- confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.22, shuffle=True, random_state=42
)


In [4]:

# Create an unfitted LinearRegression estimator with default settings, then fit it on
# the training features (X_train) and training target (y_train).
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=y_train)


In [None]:

# Predict solar output for the held-out test rows
y_pred_linear = linear_model.predict(X_test)

# RMSE: square root of the mean squared error between actual and predicted values
rmse_linear = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_linear))
print(f'Linear Regression RMSE: {rmse_linear}')

# Disabled plot of actual vs predicted solar output (Linear Regression).
# NOTE: it uses `mdates`, which the import cell does not import; add
# `import matplotlib.dates as mdates` before re-enabling.
# plt.figure(figsize=(50, 6))
# plt.plot(y_test.index, y_test, label='Actual Solar Output')
# plt.plot(y_test.index, y_pred_linear, label='Predicted Solar Output (Linear)', color='green')
# plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
# plt.gca().xaxis.set_major_locator(mdates.DayLocator(interval=1))  # Tick marks every day
# plt.gcf().autofmt_xdate()  # Rotate the dates for better readability
# plt.legend()
# plt.title('Linear Regression: Actual vs Predicted Solar Output (Every 5 minutes)')
# plt.xlabel('Time')
# plt.ylabel('Solar Output')
# plt.show()

# Collect actual and predicted values side by side for export.
# 'Time' is filled from y_test.index, i.e. the row labels of the test set.
# NOTE(review): whether those labels are timestamps depends on how `data` is indexed -- confirm.
results_linear = pd.DataFrame(
    {
        'Time': y_test.index,
        'Actual Solar Output': y_test,
        'Predicted Solar Output (Linear)': y_pred_linear,
    }
)

# Write the comparison table to a CSV in the working directory, without the index column
results_linear.to_csv(path_or_buf='predicted_solar_output_linear.csv', index=False)


In [117]:
# Pair each feature column name with the fitted coefficient at the same position,
# then print one "name: coefficient" line per feature.
feature_names = X.columns
fitted_weights = linear_model.coef_
coef = zip(feature_names, fitted_weights)
for feature, coeff in coef:
    print(f"{feature}: {coeff}")

uvindex: -0.6026775702565531
precip: -49.39020583092826
solarenergy: 18.14464175783743
solar_output_lag1: 0.9052876807378496
solar_output_lag2: 0.03927842879159016
snow: -1.8074430840897548e-13
cloudcover: -0.44432420385341853
humidity: -0.24035741085145113
