In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

In [114]:
# Location of the raw solar-radiation CSV (Colab sample_data folder)
csv_path = '/content/sample_data/SolarPrediction.csv'
df = pd.read_csv(csv_path)

In [115]:
# Preview the first five rows of the raw dataset
df.head(n=5)

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00


In [116]:
# Number of rows and columns in the dataset, as a (rows, columns) tuple
df.shape

(32686, 11)

In [117]:
# Summarise column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32686 entries, 0 to 32685
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                32686 non-null  int64  
 1   Data                    32686 non-null  object 
 2   Time                    32686 non-null  object 
 3   Radiation               32686 non-null  float64
 4   Temperature             32686 non-null  int64  
 5   Pressure                32686 non-null  float64
 6   Humidity                32686 non-null  int64  
 7   WindDirection(Degrees)  32686 non-null  float64
 8   Speed                   32686 non-null  float64
 9   TimeSunRise             32686 non-null  object 
 10  TimeSunSet              32686 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 2.7+ MB


In [118]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
count,32686.0,32686.0,32686.0,32686.0,32686.0,32686.0,32686.0
mean,1478047000.0,207.124697,51.103255,30.422879,75.016307,143.489821,6.243869
std,3005037.0,315.916387,6.201157,0.054673,25.990219,83.1675,3.490474
min,1472724000.0,1.11,34.0,30.19,8.0,0.09,0.0
25%,1475546000.0,1.23,46.0,30.4,56.0,82.2275,3.37
50%,1478026000.0,2.66,50.0,30.43,85.0,147.7,5.62
75%,1480480000.0,354.235,55.0,30.46,97.0,179.31,7.87
max,1483265000.0,1601.26,71.0,30.56,103.0,359.95,40.5


In [119]:
# Count missing values per column
df.isna().sum()

UNIXTime                  0
Data                      0
Time                      0
Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
TimeSunRise               0
TimeSunSet                0
dtype: int64

# **Data Cleaning and Preprocessing**

In [120]:
# Parse the 'HH:MM:SS' clock strings once, with an explicit format so the
# result does not depend on pandas' format inference. The parsed values live
# in a local series, so no scratch columns are written into df.
time_of_day = pd.to_datetime(df['Time'], format='%H:%M:%S')

# Express the time of day as seconds since midnight
df['Time In Seconds'] = (
    time_of_day.dt.hour * 3600
    + time_of_day.dt.minute * 60
    + time_of_day.dt.second
)

# The raw string column is redundant once the numeric feature exists.
# Note: re-running this cell without reloading df raises KeyError on 'Time'.
df = df.drop(columns=['Time'])

In [121]:
# Confirm 'Time' has been replaced by the numeric 'Time In Seconds' column
df.head()

Unnamed: 0,UNIXTime,Data,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Time In Seconds
0,1475229326,9/29/2016 12:00:00 AM,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,86126
1,1475229023,9/29/2016 12:00:00 AM,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,85823
2,1475228726,9/29/2016 12:00:00 AM,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,85526
3,1475228421,9/29/2016 12:00:00 AM,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,85221
4,1475228124,9/29/2016 12:00:00 AM,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,84924


In [122]:
# Parse each 'HH:MM:SS' sun-event column exactly once, with an explicit
# format, instead of round-tripping through strftime and re-parsing.
sunrise = pd.to_datetime(df['TimeSunRise'], format='%H:%M:%S')
sunset = pd.to_datetime(df['TimeSunSet'], format='%H:%M:%S')

# Convert sunrise and sunset times to seconds since midnight.
# Only hours and minutes contribute; the seconds component is ignored.
# NOTE(review): the previewed rows all end in ':00' — confirm that holds for
# the whole file, otherwise add `.dt.second` here.
df['Sun Rise In Seconds'] = sunrise.dt.hour * 3600 + sunrise.dt.minute * 60
df['Sun Set In Seconds'] = sunset.dt.hour * 3600 + sunset.dt.minute * 60

# The raw string columns are redundant once the numeric features exist.
# Note: re-running this cell without reloading df raises KeyError.
df = df.drop(columns=['TimeSunRise', 'TimeSunSet'])


In [123]:
# Preview the two engineered sun-event features
df.loc[:, ['Sun Rise In Seconds', 'Sun Set In Seconds']].head()

Unnamed: 0,Sun Rise In Seconds,Sun Set In Seconds
0,22380,65580
1,22380,65580
2,22380,65580
3,22380,65580
4,22380,65580


In [124]:
# Remove the raw timestamp columns 'UNIXTime' and 'Data' from df (mutates df in place)
df.drop(columns=['UNIXTime', 'Data'], inplace=True)

In [125]:
# Count fully duplicated rows in the dataset
df.duplicated().sum()

0

In [126]:
# First five rows of the final, preprocessed dataset
# NOTE(review): the saved output for this cell still lists 'UNIXTime' and
# 'Data', which the earlier drop cell removes — it looks stale; re-run the
# notebook from a fresh kernel to refresh it.
df.head()

Unnamed: 0,UNIXTime,Data,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Time In Seconds,Sun Rise In Seconds,Sun Set In Seconds
0,1475229326,1475107200,1.21,48,30.46,59,177.39,5.62,86126,22380,65580
1,1475229023,1475107200,1.21,48,30.46,58,176.78,3.37,85823,22380,65580
2,1475228726,1475107200,1.23,48,30.46,57,158.75,3.37,85526,22380,65580
3,1475228421,1475107200,1.21,48,30.46,60,137.71,3.37,85221,22380,65580
4,1475228124,1475107200,1.17,48,30.46,62,104.95,5.62,84924,22380,65580


In [127]:
# Print each column's dtype to verify the remaining columns are numeric
print(df.dtypes)

UNIXTime                    int64
Data                        int64
Radiation                 float64
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
Time In Seconds             int64
Sun Rise In Seconds         int64
Sun Set In Seconds          int64
dtype: object


# **Build Predictive Model**

In [None]:
# Target is the measured radiation; every other column is used as a feature
y = df['Radiation']
X = df.drop('Radiation', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# mean_squared_error and r2_score are imported in the notebook's top import cell.

# Single-step pipeline; the step name gives the 'random_forest__' prefix used
# by the parameter grid. A fixed random_state makes the fitted forests, and
# therefore the reported scores, reproducible across runs.
random_forest_pipeline = Pipeline([
    ('random_forest', RandomForestRegressor(random_state=42)),
])
random_forest_param_grid = {'random_forest__n_estimators': [50, 100, 200]}

# 5-fold cross-validated grid search over the number of trees.
# scikit-learn maximises scores, hence the negated MSE.
random_forest_grid_search = GridSearchCV(
    random_forest_pipeline,
    random_forest_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
random_forest_grid_search.fit(X_train, y_train)

# Best pipeline found by the search (refit on the full training set by default)
best_random_forest_model = random_forest_grid_search.best_estimator_

# Evaluate on the held-out test set
y_pred_rf = best_random_forest_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest:")
print(f"Mean Squared Error (MSE): {mse_rf}")
print(f"R-squared (R2): {r2_rf}")
