In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Question 1



Display data

In [2]:
# read data
p1_generate = pd.read_csv('./data/Plant_1_Generation_Data.csv')
p1_weather = pd.read_csv('./data/Plant_1_Weather_Sensor_Data.csv')

print(p1_generate.head())
print(p1_weather.head())

          DATE_TIME  PLANT_ID       SOURCE_KEY  DC_POWER  AC_POWER  \
0  15-05-2020 00:00   4135001  1BY6WEcLGh8j5v7       0.0       0.0   
1  15-05-2020 00:00   4135001  1IF53ai7Xc0U56Y       0.0       0.0   
2  15-05-2020 00:00   4135001  3PZuoBAID5Wc2HD       0.0       0.0   
3  15-05-2020 00:00   4135001  7JYdWkrLSPkdwr4       0.0       0.0   
4  15-05-2020 00:00   4135001  McdE0feGgRqW7Ca       0.0       0.0   

   DAILY_YIELD  TOTAL_YIELD  
0          0.0    6259559.0  
1          0.0    6183645.0  
2          0.0    6987759.0  
3          0.0    7602960.0  
4          0.0    7158964.0  
             DATE_TIME  PLANT_ID       SOURCE_KEY  AMBIENT_TEMPERATURE  \
0  2020-05-15 00:00:00   4135001  HmiyD2TTLFNqkNe            25.184316   
1  2020-05-15 00:15:00   4135001  HmiyD2TTLFNqkNe            25.084589   
2  2020-05-15 00:30:00   4135001  HmiyD2TTLFNqkNe            24.935753   
3  2020-05-15 00:45:00   4135001  HmiyD2TTLFNqkNe            24.846130   
4  2020-05-15 01:00:00   4135

merge data

In [3]:
p1_generate['DATE_TIME'] = pd.to_datetime(p1_generate['DATE_TIME'])
p1_weather['DATE_TIME'] = pd.to_datetime(p1_weather['DATE_TIME'])
plant1_data = pd.merge(p1_generate, p1_weather, on='DATE_TIME')
print(plant1_data.head())

   DATE_TIME  PLANT_ID_x     SOURCE_KEY_x  DC_POWER  AC_POWER  DAILY_YIELD  \
0 2020-05-15     4135001  1BY6WEcLGh8j5v7       0.0       0.0          0.0   
1 2020-05-15     4135001  1IF53ai7Xc0U56Y       0.0       0.0          0.0   
2 2020-05-15     4135001  3PZuoBAID5Wc2HD       0.0       0.0          0.0   
3 2020-05-15     4135001  7JYdWkrLSPkdwr4       0.0       0.0          0.0   
4 2020-05-15     4135001  McdE0feGgRqW7Ca       0.0       0.0          0.0   

   TOTAL_YIELD  PLANT_ID_y     SOURCE_KEY_y  AMBIENT_TEMPERATURE  \
0    6259559.0     4135001  HmiyD2TTLFNqkNe            25.184316   
1    6183645.0     4135001  HmiyD2TTLFNqkNe            25.184316   
2    6987759.0     4135001  HmiyD2TTLFNqkNe            25.184316   
3    7602960.0     4135001  HmiyD2TTLFNqkNe            25.184316   
4    7158964.0     4135001  HmiyD2TTLFNqkNe            25.184316   

   MODULE_TEMPERATURE  IRRADIATION  
0           22.857507          0.0  
1           22.857507          0.0  
2          

  p1_generate['DATE_TIME'] = pd.to_datetime(p1_generate['DATE_TIME'])


# add col to the dataset

In [4]:
def create_features(df):
    df['YEAR'] = df['DATE_TIME'].dt.year
    df['MONTH'] = df['DATE_TIME'].dt.month
    df['DAY'] = df['DATE_TIME'].dt.day
    df['HOUR'] = df['DATE_TIME'].dt.hour
    df['MINUTE'] = df['DATE_TIME'].dt.minute
    df['DAYOFWEEK'] = df['DATE_TIME'].dt.dayofweek
    df['DATE'] = df['DATE_TIME'].dt.date
    df['TIME'] = df['DATE_TIME'].dt.time
    df['TOTAL_MINUTES_PASS'] = df['MINUTE'] + df['HOUR'] * 60
    if 'AC_POWER' in df.columns:
        df['PREV_DAY_AC_POWER'] = df['AC_POWER'].shift(24)
    return df

plant1_data = create_features(plant1_data)
plant1_data = plant1_data.dropna()

# Linear Regression Model with R^2 and MSE

In [5]:
features = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'DAYOFWEEK', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION', 'PREV_DAY_AC_POWER']
target = 'AC_POWER'

# split 8:2 on the dataset
X_train, X_test, y_train, y_test = train_test_split(plant1_data[features], plant1_data[target], test_size=0.2, random_state=42)


lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

print(f'Mean Squared Error: {mse:.2f}%')
print(f'R^2 Score: {r2 * 100:.2f}%')

Mean Squared Error: 3298.51%
R^2 Score: 97.85%
