In [16]:
import os
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest

%matplotlib inline

In [17]:
for dirname, _, filenames in os.walk('./solar_power_generation_data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./solar_power_generation_data/Plant_2_Generation_Data.csv
./solar_power_generation_data/Plant_1_Generation_Data.csv
./solar_power_generation_data/Plant_2_Weather_Sensor_Data.csv
./solar_power_generation_data/Plant_1_Weather_Sensor_Data.csv


In [18]:
generation1 = pd.read_csv(os.path.join(dirname, filenames[1]))
weather1 = pd.read_csv(os.path.join(dirname, filenames[3]))
generation1['DATE_TIME'] = pd.to_datetime(generation1['DATE_TIME'], dayfirst=True)
weather1['DATE_TIME'] = pd.to_datetime(weather1['DATE_TIME'], dayfirst=True)

In [19]:
generation1

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.000,6259559.0
1,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.000,6183645.0
2,2020-05-15 00:00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.000,6987759.0
3,2020-05-15 00:00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.000,7602960.0
4,2020-05-15 00:00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.000,7158964.0
...,...,...,...,...,...,...,...
68773,2020-06-17 23:45:00,4135001,uHbuxQJl8lW7ozc,0.0,0.0,5967.000,7287002.0
68774,2020-06-17 23:45:00,4135001,wCURE6d3bPkepu2,0.0,0.0,5147.625,7028601.0
68775,2020-06-17 23:45:00,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,5819.000,7251204.0
68776,2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,5817.000,6583369.0


In [20]:
inverters = list(generation1['SOURCE_KEY'].unique())
print(f"total number of inverters {len(inverters)}")

total number of inverters 22


In [21]:
inv_1 = generation1[generation1['SOURCE_KEY']==inverters[0]]
mask = ((weather1['DATE_TIME'] >= min(inv_1["DATE_TIME"])) & (weather1['DATE_TIME'] <= max(inv_1["DATE_TIME"])))
weather_filtered = weather1.loc[mask]

In [22]:
weather_filtered.shape

(3182, 6)

In [23]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=inv_1["DATE_TIME"], y=inv_1["AC_POWER"],
                         mode='lines',
                         name='AC Power'))

fig.add_trace(go.Scatter(x=weather_filtered["DATE_TIME"], y=weather_filtered["IRRADIATION"],
                         mode='lines',
                         name='Irradiation',
                         yaxis='y2'))

fig.update_layout(title_text="Irradiation vs AC POWER",
                  yaxis1=dict(title="AC Power in kW",
                              side='left'),
                  yaxis2=dict(title="Irradiation index",
                              side='right',
                              anchor="x",
                              overlaying="y"
                              ))

fig.show()

In [24]:
df = inv_1.merge(weather_filtered, on="DATE_TIME", how='left')
df = df[['DATE_TIME', 'AC_POWER', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']]
df

Unnamed: 0,DATE_TIME,AC_POWER,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,0.0,25.184316,22.857507,0.0
1,2020-05-15 00:15:00,0.0,25.084589,22.761668,0.0
2,2020-05-15 00:30:00,0.0,24.935753,22.592306,0.0
3,2020-05-15 00:45:00,0.0,24.846130,22.360852,0.0
4,2020-05-15 01:00:00,0.0,24.621525,22.165423,0.0
...,...,...,...,...,...
3149,2020-06-17 22:45:00,0.0,22.150570,21.480377,0.0
3150,2020-06-17 23:00:00,0.0,22.129816,21.389024,0.0
3151,2020-06-17 23:15:00,0.0,22.008275,20.709211,0.0
3152,2020-06-17 23:30:00,0.0,21.969495,20.734963,0.0


In [25]:
train_prp = .6
train = df.loc[:df.shape[0]*train_prp]
test = df.loc[df.shape[0]*train_prp:]

features = ['AC_POWER', "IRRADIATION"]
clf = IsolationForest(n_estimators=1000, max_samples='auto', contamination=.03, max_features=2, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
clf.fit(train[features])
pred = clf.predict(test[features])

test['anomaly'] = pred
anomalies = test[test['anomaly'] == -1][['AC_POWER']]
anomalies = anomalies.rename(columns={'AC_POWER':'anomalies'})
test = test.merge(anomalies, left_index=True, right_index=True, how='left')
test


X does not have valid feature names, but IsolationForest was fitted with feature names



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,DATE_TIME,AC_POWER,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,anomaly,anomalies
1893,2020-06-04 20:15:00,0.0,24.101764,21.235858,0.0,1,
1894,2020-06-04 20:30:00,0.0,23.965206,21.246926,0.0,1,
1895,2020-06-04 20:45:00,0.0,23.835376,21.202568,0.0,1,
1896,2020-06-04 21:00:00,0.0,23.622412,20.969898,0.0,1,
1897,2020-06-04 21:15:00,0.0,23.477631,20.891598,0.0,1,
...,...,...,...,...,...,...,...
3149,2020-06-17 22:45:00,0.0,22.150570,21.480377,0.0,1,
3150,2020-06-17 23:00:00,0.0,22.129816,21.389024,0.0,1,
3151,2020-06-17 23:15:00,0.0,22.008275,20.709211,0.0,1,
3152,2020-06-17 23:30:00,0.0,21.969495,20.734963,0.0,1,


In [26]:
pred = clf.predict(train[features])

train['anomaly'] = pred
anomalies = train[train['anomaly'] == -1][['AC_POWER']]
anomalies = anomalies.rename(columns={'AC_POWER':'anomalies'})
train = train.merge(anomalies, left_index=True, right_index=True, how='left')
train



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,DATE_TIME,AC_POWER,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,anomaly,anomalies
0,2020-05-15 00:00:00,0.0,25.184316,22.857507,0.0,1,
1,2020-05-15 00:15:00,0.0,25.084589,22.761668,0.0,1,
2,2020-05-15 00:30:00,0.0,24.935753,22.592306,0.0,1,
3,2020-05-15 00:45:00,0.0,24.846130,22.360852,0.0,1,
4,2020-05-15 01:00:00,0.0,24.621525,22.165423,0.0,1,
...,...,...,...,...,...,...,...
1888,2020-06-04 19:00:00,0.0,24.911574,22.163503,0.0,1,
1889,2020-06-04 19:15:00,0.0,24.712389,21.731079,0.0,1,
1890,2020-06-04 19:30:00,0.0,24.546598,21.650517,0.0,1,
1891,2020-06-04 19:45:00,0.0,24.379535,21.817371,0.0,1,


In [27]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=test["DATE_TIME"], y=test["AC_POWER"],
                         mode='lines',
                         name='AC Power'))

fig.add_trace(go.Scatter(x=test["DATE_TIME"], y=test["anomalies"],
                         name='Anomaly',
                         mode='markers',
                         marker=dict(color="red",
                                     size=11,
                                     line=dict(color="red",
                                               width=2))))

fig.update_layout(title_text="Anomalies Detected using Isolation Forest",
                  yaxis1=dict(title="AC Power in kW"))

fig.show()