 **k-NN Module**


# Imports

In [None]:
# Imports
import plotly.express as px
import numpy as np
import pandas as pd
import datetime
#import time #PH - add if the code below requires it
from sklearn.model_selection import train_test_split

# Prepping Dataset

**Loading Data**

In [None]:
# Loading approach adapted from:
# https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92
# raw_flag is appended to every GitHub "blob" URL (presumably so the raw CSV is
# returned instead of the HTML page -- confirm if the repository layout changes).
url = 'https://github.com/phestvik/Amii_SolarPowerGenerationData/blob/main/'
raw_flag = '?raw=true'

csv_names = [
    'Plant_1_Generation_Data.csv',
    'Plant_1_Weather_Sensor_Data.csv',
    'Plant_2_Generation_Data.csv',
    'Plant_2_Weather_Sensor_Data.csv',
]
file1, file2, file3, file4 = [url + csv_name + raw_flag for csv_name in csv_names]

# One raw frame per file: generation + weather-sensor readings for each plant.
df_plant1_gen, df_plant1_sensor, df_plant2_gen, df_plant2_sensor = [
    pd.read_csv(csv_url) for csv_url in (file1, file2, file3, file4)
]

In [None]:
df_plant1_sensor['SOURCE_KEY'].nunique()

1

In [None]:
df_plant1_sensor['PLANT_ID'].nunique()

1

In [None]:
# In the generation files SOURCE_KEY identifies the inverter, so give it its
# true descriptor.
for gen_df in (df_plant1_gen, df_plant2_gen):
    gen_df.rename(columns={'SOURCE_KEY': 'INVERTER_ID'}, inplace=True)

# In the sensor files SOURCE_KEY and PLANT_ID are each constant for the entire
# file, so they carry no information and are dropped.
for sensor_df in (df_plant1_sensor, df_plant2_sensor):
    sensor_df.drop(columns=['SOURCE_KEY', 'PLANT_ID'], inplace=True)

**Transforming DATE_TIME from object to datetime64**

In [None]:
# errors='coerce' turns out-of-bounds dates and any non-parseable value into NaT
# instead of raising. Plant 1's generation file uses day-first timestamps without
# seconds; the other three files use ISO-style timestamps.
datetime_formats = [
    (df_plant1_gen, '%d-%m-%Y %H:%M'),
    (df_plant1_sensor, '%Y-%m-%d %H:%M:%S'),
    (df_plant2_gen, '%Y-%m-%d %H:%M:%S'),
    (df_plant2_sensor, '%Y-%m-%d %H:%M:%S'),
]
for frame, fmt in datetime_formats:
    frame["DATE_TIME"] = pd.to_datetime(frame["DATE_TIME"], errors='coerce', format=fmt)

**Combine Plant 1 and Sensor 1 | Plant 2 and Sensor 2**

In [None]:
display(df_plant1_gen)
display(df_plant1_sensor)

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.000,6259559.0
1,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.000,6183645.0
2,2020-05-15 00:00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.000,6987759.0
3,2020-05-15 00:00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.000,7602960.0
4,2020-05-15 00:00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.000,7158964.0
...,...,...,...,...,...,...,...
68773,2020-06-17 23:45:00,4135001,uHbuxQJl8lW7ozc,0.0,0.0,5967.000,7287002.0
68774,2020-06-17 23:45:00,4135001,wCURE6d3bPkepu2,0.0,0.0,5147.625,7028601.0
68775,2020-06-17 23:45:00,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,5819.000,7251204.0
68776,2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,5817.000,6583369.0


Unnamed: 0,DATE_TIME,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,25.184316,22.857507,0.0
1,2020-05-15 00:15:00,25.084589,22.761668,0.0
2,2020-05-15 00:30:00,24.935753,22.592306,0.0
3,2020-05-15 00:45:00,24.846130,22.360852,0.0
4,2020-05-15 01:00:00,24.621525,22.165423,0.0
...,...,...,...,...
3177,2020-06-17 22:45:00,22.150570,21.480377,0.0
3178,2020-06-17 23:00:00,22.129816,21.389024,0.0
3179,2020-06-17 23:15:00,22.008275,20.709211,0.0
3180,2020-06-17 23:30:00,21.969495,20.734963,0.0


In [None]:
# Attach the plant-level weather readings to every inverter row that shares the
# same timestamp. The inner join drops timestamps missing from either side, and
# sort=True orders the result by DATE_TIME.
dfplant1 = pd.merge(df_plant1_gen, df_plant1_sensor, how='inner', on='DATE_TIME', sort=True)
dfplant2 = pd.merge(df_plant2_gen, df_plant2_sensor, how='inner', on='DATE_TIME', sort=True)

In [None]:
dfplant1

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.000,6259559.0,25.184316,22.857507,0.0
1,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.000,6183645.0,25.184316,22.857507,0.0
2,2020-05-15 00:00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.000,6987759.0,25.184316,22.857507,0.0
3,2020-05-15 00:00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.000,7602960.0,25.184316,22.857507,0.0
4,2020-05-15 00:00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.000,7158964.0,25.184316,22.857507,0.0
...,...,...,...,...,...,...,...,...,...,...
68769,2020-06-17 23:45:00,4135001,uHbuxQJl8lW7ozc,0.0,0.0,5967.000,7287002.0,21.909288,20.427972,0.0
68770,2020-06-17 23:45:00,4135001,wCURE6d3bPkepu2,0.0,0.0,5147.625,7028601.0,21.909288,20.427972,0.0
68771,2020-06-17 23:45:00,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,5819.000,7251204.0,21.909288,20.427972,0.0
68772,2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,5817.000,6583369.0,21.909288,20.427972,0.0


In [None]:
# Derive the hour of day from the timestamp for both plants, stored as integers.
for merged in (dfplant1, dfplant2):
    merged['HOUR'] = merged['DATE_TIME'].dt.hour.astype('int')

# PH - a calendar DATE column (DATE_TIME.dt.date) was tried and left out; not sure
# whether the analysis needs it.

In [None]:
dfplant1

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,0.000,6259559.0,25.184316,22.857507,0.0,0
1,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,0.000,6183645.0,25.184316,22.857507,0.0,0
2,2020-05-15 00:00:00,4135001,3PZuoBAID5Wc2HD,0.0,0.0,0.000,6987759.0,25.184316,22.857507,0.0,0
3,2020-05-15 00:00:00,4135001,7JYdWkrLSPkdwr4,0.0,0.0,0.000,7602960.0,25.184316,22.857507,0.0,0
4,2020-05-15 00:00:00,4135001,McdE0feGgRqW7Ca,0.0,0.0,0.000,7158964.0,25.184316,22.857507,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
68769,2020-06-17 23:45:00,4135001,uHbuxQJl8lW7ozc,0.0,0.0,5967.000,7287002.0,21.909288,20.427972,0.0,23
68770,2020-06-17 23:45:00,4135001,wCURE6d3bPkepu2,0.0,0.0,5147.625,7028601.0,21.909288,20.427972,0.0,23
68771,2020-06-17 23:45:00,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,5819.000,7251204.0,21.909288,20.427972,0.0,23
68772,2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,5817.000,6583369.0,21.909288,20.427972,0.0,23


# Feature Engineering

## Create inverter loading ratio (ILR)

In [None]:
# Ratio of DC to AC power per row, stored in the column named 'IRL'.
# Rows where both powers are 0 produce NaN (0/0).
dfplant1['IRL'] = dfplant1['DC_POWER'].div(dfplant1['AC_POWER'])
dfplant2['IRL'] = dfplant2['DC_POWER'].div(dfplant2['AC_POWER'])

## Create Peak Power Time

In [None]:
dfplant1[dfplant1['HOUR'].isna()]

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL


In [None]:
dfplant2[dfplant2['HOUR'].isna()]

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL


In [None]:
# Flag hours that fall inside the peak-power window
def calc_time_peak_power(time):
    """Return True when `time` (hour of day) is in the window 9 <= time < 15, else False."""
    return bool(9 <= time < 15)

In [None]:
# Mark each row as inside/outside the peak-power window and store the flag as a
# categorical column.
for merged in (dfplant1, dfplant2):
    peak_flags = merged['HOUR'].map(calc_time_peak_power)
    merged['TIME_PEAK_POWER'] = peak_flags.astype('category')

In [None]:
dfplant1[dfplant1['TIME_PEAK_POWER'].isna()]

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL,TIME_PEAK_POWER


In [None]:
dfplant2[dfplant2['TIME_PEAK_POWER'].isna()]

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL,TIME_PEAK_POWER


In [None]:
dfplant2.iloc[54155]

DATE_TIME              2020-06-11 14:00:00
PLANT_ID                           4136001
INVERTER_ID                PeE6FRyGXUgsRhN
DC_POWER                           531.207
AC_POWER                             520.9
DAILY_YIELD                           2792
TOTAL_YIELD                    1.34852e+09
AMBIENT_TEMPERATURE                29.9247
MODULE_TEMPERATURE                 37.9454
IRRADIATION                       0.355198
HOUR                                    14
IRL                                1.01979
TIME_PEAK_POWER                       True
Name: 54155, dtype: object

## Transform DC_POWER to Category

In [None]:
# Bin DC_POWER into three equal-width bins per plant. The bin edges are computed
# separately for each plant from that plant's own DC_POWER range.
power_labels = ['LOW','MEDIUM','HIGH']
for merged in (dfplant1, dfplant2):
    merged['DC_POWER_CAT'] = pd.cut(merged['DC_POWER'], bins=3, labels=power_labels)

# Alternatives that were tried for Plant 1:
#   bins=5  with labels ['VERY LOW','LOW','MEDIUM','HIGH','VERY HIGH']
#   bins=10 with labels ['1','2','3','4','5','6','7','8','9','10']

In [None]:
# Only the last expression of a cell is rendered automatically, so the class
# counts are shown explicitly with display() before the histogram.
display(dfplant1['DC_POWER_CAT'].value_counts())
px.histogram(dfplant1, x='DC_POWER_CAT')

# Additional EDA



Calculate the DC power at 12pm (peak power) for the entire plant.
Remember that DC_POWER is in kW.

In [None]:
# Sum every numeric column over all Plant 1 inverters at 2020-05-15 12:00.
# numeric_only=True keeps the timestamp, string and categorical columns out of
# the sum; summing them is meaningless (INVERTER_ID just gets concatenated) and
# raises on recent pandas versions.
# (A whole-day range filter on DATE_TIME was tried earlier; only the noon
# timestamp is needed here.)
noon_2020_05_15 = datetime.datetime(2020,5,15,12,0,0)
dfplant1.query(f'(DATE_TIME == "{noon_2020_05_15}")').sum(numeric_only=True)

PLANT_ID                                                        90970022
INVERTER_ID            1BY6WEcLGh8j5v71IF53ai7Xc0U56Y3PZuoBAID5Wc2HD7...
DC_POWER                                                          155822
AC_POWER                                                         15250.8
DAILY_YIELD                                                      59620.9
TOTAL_YIELD                                                  1.50821e+08
AMBIENT_TEMPERATURE                                              688.718
MODULE_TEMPERATURE                                               1082.04
IRRADIATION                                                      11.3292
HOUR                                                                 264
IRL                                                              224.766
dtype: object

So Plant 1's peak power is ~156MW.

In [None]:
dfplant2.query(f'(DATE_TIME == "{datetime.datetime(2020,5,15,12,0,0)}")').DC_POWER.sum()

16969.051904761905

So Plant 2's peak power is ~17MW.

Check that MODULE_TEMPERATURE and IRRADIATION are the same for all INVERTER_IDs at a given timestamp

In [None]:
dfplant1.query(f'(DATE_TIME == "{datetime.datetime(2020,5,15,12,0,0)}")')

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL,TIME_PEAK_POWER,DC_POWER_CAT
1036,2020-05-15 12:00:00,4135001,1BY6WEcLGh8j5v7,8232.5,805.1875,2382.875,6261941.875,31.305375,49.183584,0.514963,12,10.224327,True,MEDIUM
1037,2020-05-15 12:00:00,4135001,1IF53ai7Xc0U56Y,7013.142857,686.114286,2752.428571,6186397.429,31.305375,49.183584,0.514963,12,10.221537,True,MEDIUM
1038,2020-05-15 12:00:00,4135001,3PZuoBAID5Wc2HD,6402.571429,626.871429,2676.285714,6990435.286,31.305375,49.183584,0.514963,12,10.213532,True,MEDIUM
1039,2020-05-15 12:00:00,4135001,7JYdWkrLSPkdwr4,6910.714286,676.542857,2612.857143,7605572.857,31.305375,49.183584,0.514963,12,10.214747,True,MEDIUM
1040,2020-05-15 12:00:00,4135001,McdE0feGgRqW7Ca,6201.714286,607.442857,2888.714286,7161852.714,31.305375,49.183584,0.514963,12,10.209544,True,MEDIUM
1041,2020-05-15 12:00:00,4135001,VHMLBKoKgIrUVDU,7050.857143,690.171429,2644.0,7209052.0,31.305375,49.183584,0.514963,12,10.216095,True,MEDIUM
1042,2020-05-15 12:00:00,4135001,WRmjgnKYAwPKWDb,6951.428571,680.642857,2605.714286,7031278.714,31.305375,49.183584,0.514963,12,10.213034,True,MEDIUM
1043,2020-05-15 12:00:00,4135001,YxYtjZvoooNbGkE,8245.625,806.375,2745.625,7182711.625,31.305375,49.183584,0.514963,12,10.225546,True,MEDIUM
1044,2020-05-15 12:00:00,4135001,ZnxXDlPa8U1GXgE,7996.428571,782.142857,2688.285714,6524860.286,31.305375,49.183584,0.514963,12,10.223744,True,MEDIUM
1045,2020-05-15 12:00:00,4135001,ZoEaEvLYb1n2sOq,7931.5,775.775,2701.0,7100800.0,31.305375,49.183584,0.514963,12,10.22397,True,MEDIUM


In [None]:
#@title
dfplant1.query(f'(DATE_TIME == "{datetime.datetime(2020,5,15,12,0,0)}")').MODULE_TEMPERATURE.mean()

49.18358353333335

**Create a DF for Peak Power**

In [None]:
#@title
# PLANT 1
# Aggregate once per timestamp with a single groupby instead of running four
# full-frame query() scans for every unique DATE_TIME.
# sort=False keeps the timestamps in order of first appearance, which is the
# order DATE_TIME.unique() would return.
# TODO: add a feature for the number of inverters reporting at each timestamp
#       (a histogram or line plot afterwards would illustrate the concept).
plant1_by_time = dfplant1.groupby('DATE_TIME', sort=False).agg(
    PEAK_POWER=('DC_POWER', 'sum'),              # total DC power over all inverters
    MODULE_TEMP=('MODULE_TEMPERATURE', 'mean'),
    IRRADIATION=('IRRADIATION', 'mean'),
    IRL=('IRL', 'mean'),                         # NaN ratios are skipped by mean
)

# Keep the original list/array names so the cell that builds `plant1` still works.
uniqueDate_Time = plant1_by_time.index.to_numpy()
listPeak_Power = plant1_by_time['PEAK_POWER'].tolist()
listModule_Temp = plant1_by_time['MODULE_TEMP'].tolist()
listIrradiation = plant1_by_time['IRRADIATION'].tolist()
listIRL = plant1_by_time['IRL'].tolist()

In [None]:
# One row per timestamp for Plant 1, tagged with the plant label for plotting.
plant1 = pd.DataFrame({
    'DATE_TIME': list(uniqueDate_Time),
    'PEAK_POWER': listPeak_Power,
    'MODULE_TEMP': listModule_Temp,
    'IRRADIATION': listIrradiation,
    'IRL': listIRL,
}).assign(PLANT='1')

In [None]:
#@title
#PLANT 2
# Aggregate once per timestamp with a single groupby instead of running four
# full-frame query() scans for every unique DATE_TIME.
# sort=False keeps the timestamps in order of first appearance, which is the
# order DATE_TIME.unique() would return.
plant2_by_time = dfplant2.groupby('DATE_TIME', sort=False).agg(
    PEAK_POWER=('DC_POWER', 'sum'),              # total DC power over all inverters
    MODULE_TEMP=('MODULE_TEMPERATURE', 'mean'),
    IRRADIATION=('IRRADIATION', 'mean'),
    IRL=('IRL', 'mean'),                         # NaN ratios are skipped by mean
)

# Keep the original list/array names so the cell that builds `plant2` still works.
uniqueDate_Time = plant2_by_time.index.to_numpy()
listPeak_Power = plant2_by_time['PEAK_POWER'].tolist()
listModule_Temp = plant2_by_time['MODULE_TEMP'].tolist()
listIrradiation = plant2_by_time['IRRADIATION'].tolist()
listIRL = plant2_by_time['IRL'].tolist()

In [None]:
# One row per timestamp for Plant 2, tagged with the plant label for plotting.
plant2 = pd.DataFrame({
    'DATE_TIME': list(uniqueDate_Time),
    'PEAK_POWER': listPeak_Power,
    'MODULE_TEMP': listModule_Temp,
    'IRRADIATION': listIrradiation,
    'IRL': listIRL,
}).assign(PLANT='2')

In [None]:
# Stack the two per-plant frames for plotting.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
plant = pd.concat([plant1, plant2], ignore_index=True)

In [None]:
# One time-series line chart per aggregated feature, with one line per plant.
numerical_features = ['PEAK_POWER', 'MODULE_TEMP', 'IRRADIATION','IRL']
for feature in numerical_features:
    fig = px.line(data_frame=plant, x="DATE_TIME", y=feature, color='PLANT')
    fig.show()

# Assignment

**Notes**

Plant 1 and Plant 2 have different DC_POWER and IRL values. Therefore we will not be combining the datasets and will run two different models.

## Feature Scaling

### 1. Check extreme values:

a. Define what an extreme value is for each feature, e.g. using boxplots

In [None]:
# Plant 1
# Box plot of every numeric feature, used to eyeball extreme values.
# This cell previously plotted dfplant2 under a "Plant 2" title, duplicating the
# next cell; it now plots Plant 1 as its heading says.
label = 'INVERTER_ID'
numerical_features = np.array(['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION','IRL'])
for feature in numerical_features:
  fig = px.box(dfplant1, y=feature, width=640)
  fig.update_layout(
    title_font_color="green",
    title={
        'text':f'Plant 1 - {feature}',
        'y':.925,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
  fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Plant 2
# Box plot of every numeric feature, used to eyeball extreme values.
for feature in numerical_features:
    title_spec = dict(
        text=f'Plant 2 - {feature}',
        y=.925,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
    fig = px.box(dfplant2, y=feature, width=640)
    fig.update_layout(title_font_color="green", title=title_spec)
    fig.show()

Output hidden; open in https://colab.research.google.com to view.

b. Do any of your features contain extreme values? List the features & the number of extreme values for each feature.


In [None]:
# Reference: https://towardsdatascience.com/detecting-and-treating-outliers-in-python-part-1-4ece5098b755

#Tukey's method
def tukeys_method(df, variable, whisker=1.5):
    """Return Tukey's fences for one column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    variable : str
        Name of the column to analyse.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond the quartiles.
        1.5 gives the inner fences (the default); 3 gives the outer fences.

    Returns
    -------
    tuple
        (lower_fence, upper_fence) = (Q1 - whisker*IQR, Q3 + whisker*IQR),
        with the quartiles taken from Series.quantile.
    """
    q1 = df[variable].quantile(0.25)
    q3 = df[variable].quantile(0.75)
    reach = whisker * (q3 - q1)

    return (q1 - reach, q3 + reach)

In [None]:
dfplant1.describe()

Unnamed: 0,PLANT_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL
count,68774.0,68774.0,68774.0,68774.0,68774.0,68774.0,68774.0,68774.0,68774.0,36823.0
mean,4135001.0,3147.17745,307.778375,3295.834644,6978728.0,25.558521,31.244997,0.232305,11.567075,10.233666
std,0.0,4036.441826,394.394865,3145.220597,416270.7,3.3613,12.308283,0.301948,6.862651,0.048176
min,4135001.0,0.0,0.0,0.0,6183645.0,20.398505,18.140415,0.0,0.0,9.38155
25%,4135001.0,0.0,0.0,0.0,6512007.0,22.724491,21.123944,0.0,6.0,10.202579
50%,4135001.0,428.571429,41.45,2658.473214,7146685.0,24.670178,24.818984,0.03162,12.0,10.220237
75%,4135001.0,6365.46875,623.561161,6274.0,7268751.0,27.960429,41.693659,0.45488,17.0,10.248117
max,4135001.0,14471.125,1410.95,9163.0,7846821.0,35.252486,65.545714,1.221652,23.0,10.465522


In [None]:
# Count the Plant 1 rows that fall outside Tukey's inner fences, per feature.
# Note these calculations use the Tukey's method while Plotly Box Plot uses the linear method
# https://plotly.com/python/box-plots/#modifying-the-algorithm-for-computing-quartiles
# count() ignores NaN values, whereas the denominator is the full row count.
print("Plant 1")
for feature in numerical_features:
    lower_inner_fence, upper_inner_fence = tukeys_method(dfplant1, feature)
    fence_checks = (
        ('<', lower_inner_fence, 'below the Lower Inner Fence'),
        ('>', upper_inner_fence, 'above the Upper Inner Fence'),
    )
    for comparison, fence, side in fence_checks:
        flagged = dfplant1.query(f'{feature} {comparison} {fence}')[feature]
        num_examples = flagged.count()
        if num_examples > 0:
            print(f'{feature} - # of examples {side} ({fence:.4f}): {num_examples}/{dfplant1[feature].shape[0]} ({flagged.nunique()} unique values)')

Plant 1
IRRADIATION - # of examples above the Upper Inner Fence (1.1372): 44/68774 (2 unique values)
IRL - # of examples below the Lower Inner Fence (10.1343): 19/68774 (19 unique values)
IRL - # of examples above the Upper Inner Fence (10.3164): 3697/68774 (3299 unique values)


In [None]:
# Count the Plant 2 rows that fall outside Tukey's inner fences, per feature.
# Note these calculations use the Tukey's method while Plotly Box Plot uses the linear method
# https://plotly.com/python/box-plots/#modifying-the-algorithm-for-computing-quartiles
# The fences AND the rows being counted must both come from dfplant2. The rows
# were previously taken from dfplant1, so Plant 1 values were being compared
# against Plant 2 fences.
# count() ignores NaN values, whereas the denominator is the full row count.
print("Plant 2")
for feature in numerical_features:
  lower_inner_fence, upper_inner_fence = tukeys_method(dfplant2,feature)
  LIN_query = f'{feature} < {lower_inner_fence}'
  LIN_query_df = dfplant2.query(LIN_query)
  LIF_num_examples = LIN_query_df[feature].count()
  LIF_num_examples_unique = LIN_query_df[feature].nunique()
  if LIF_num_examples > 0:
    print(f'{feature} - # of examples below the Lower Inner Fence ({lower_inner_fence:.4f}): {LIF_num_examples}/{dfplant2[feature].shape[0]} ({LIF_num_examples_unique} unique values)')

  UIN_query = f'{feature} > {upper_inner_fence}'
  UIN_query_df = dfplant2.query(UIN_query)
  UIF_num_examples = UIN_query_df[feature].count()
  UIF_num_examples_unique = UIN_query_df[feature].nunique()
  if UIF_num_examples > 0:
    print(f'{feature} - # of examples above the Upper Inner Fence ({upper_inner_fence:.4f}): {UIF_num_examples}/{dfplant2[feature].shape[0]} ({UIF_num_examples_unique} unique values)')

Plant 2
DC_POWER - # of examples above the Upper Inner Fence (1116.4792): 31736/67698 (28764 unique values)
AC_POWER - # of examples above the Upper Inner Fence (1095.5375): 3638/67698 (3428 unique values)
MODULE_TEMPERATURE - # of examples above the Upper Inner Fence (64.5191): 22/67698 (1 unique values)
IRRADIATION - # of examples above the Upper Inner Fence (1.0776): 132/67698 (6 unique values)
IRL - # of examples above the Upper Inner Fence (1.0338): 36823/67698 (36122 unique values)


Note: For MODULE_TEMPERATURE, IRRADIATION, these counts are not unique. Each example (ie. for each inverter) will have the same value.

c. Do the number of extreme values make-up a substantial portion of that feature? (e.g. 5%) Or are they so few that you could consider dropping those samples?

For Plant 1:
*   The extreme values in IRRADIATION are a very small subset of the data
*   The extreme values in IRL are ~5% of the data. We will keep them in the analysis but we may disregard this feature in the model.

For Plant 2:
*   The extreme values in MODULE_TEMPERATURE and IRRADIATION are a very small subset
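*   The extreme values in DC_POWER and IRL are ~50% of the data; however, from the EDA and specifically the line plots, the values appear to be valid. (Caveat: the Plant 2 counts printed above were obtained by querying `dfplant1` rows against Plant 2's fences, so the Plant 2 percentages in this list should be re-checked against `dfplant2`.)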
*   The extreme values in AC_POWER are ~5% of the data. We will keep them in the analysis but we may disregard this feature in the model.

The IRL of a solar power plant should be ~1.25. 
Plant 1's IRL values (~10) are far too high, so we are unsure of the validity of its DC_POWER values.



##2. Min-max Scale

The data must be split before we start

In [None]:
from sklearn.preprocessing import MinMaxScaler

# 80/20 train/test split per plant; the fixed random_state makes the split
# reproducible.
split_options = dict(test_size=0.2, random_state=42)
plant1_train, plant1_test = train_test_split(dfplant1, **split_options)
plant2_train, plant2_test = train_test_split(dfplant2, **split_options)

In [None]:
minmax_scaler = MinMaxScaler()

X_cols_num = ['AC_POWER','DAILY_YIELD','TOTAL_YIELD','AMBIENT_TEMPERATURE','MODULE_TEMPERATURE','IRRADIATION','HOUR']
X_cols_cat = ['TIME_PEAK_POWER']
y_cols = ['DC_POWER_CAT']

# PH - restricting plant1_train to X_cols_num + X_cols_cat is not necessary here.

# Min-max scale the numeric columns only.
# fit_transform returns a bare array, so the new frame must reuse the training
# index explicitly. Without index=..., it gets a fresh 0..n-1 index and the
# categorical column below (assigned by index alignment) lands on the wrong
# rows or becomes NaN, because plant2_train keeps its shuffled original index.
X_plant2_train = pd.DataFrame(
    minmax_scaler.fit_transform(plant2_train[X_cols_num]),
    columns=X_cols_num,
    index=plant2_train.index,
)
X_plant2_train[X_cols_cat] = plant2_train[X_cols_cat]

#saving the normalized data to a variable per PM3
X_plant2_train_mms = X_plant2_train
y_plant2_train_mms = plant2_train[y_cols]

In [None]:
#PH - trying something else out
X_cols_num = ['AC_POWER','DAILY_YIELD','TOTAL_YIELD','AMBIENT_TEMPERATURE','MODULE_TEMPERATURE','IRRADIATION','HOUR']
X_cols_cat = ['TIME_PEAK_POWER']
y_cols = ['DC_POWER_CAT']

X_train, X_test, y_train, y_test = \
  train_test_split(dfplant2[X_cols_num + X_cols_cat], dfplant2[y_cols], test_size=0.2, random_state=42)

# Work on explicit copies so the assignments below do not trigger
# SettingWithCopyWarning on the frames returned by train_test_split.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training data only, then apply that same scaling to the
# test data. Re-fitting on the test set would map test values with different
# min/max than the training values the model sees.
minmax_scaler = MinMaxScaler()
X_train[X_cols_num] = minmax_scaler.fit_transform(X_train[X_cols_num])
X_test[X_cols_num] = minmax_scaler.transform(X_test[X_cols_num])

X_plant2_train_mms = X_train
y_plant2_train_mms = y_train

##3. Standardize

The data must be split before we start

In [None]:
# Re-create the 80/20 splits (same random_state, so the same rows as before) and
# select feature / target columns for each plant.
plant1_train, plant1_test = train_test_split(dfplant1, test_size=0.2, random_state=42)
plant2_train, plant2_test = train_test_split(dfplant2, test_size=0.2, random_state=42)

plant1_feature_cols = ['AC_POWER','DAILY_YIELD','TOTAL_YIELD','AMBIENT_TEMPERATURE','MODULE_TEMPERATURE','IRRADIATION','HOUR','TIME_PEAK_POWER']
plant2_feature_cols = ['AMBIENT_TEMPERATURE','MODULE_TEMPERATURE','IRRADIATION','HOUR']
target_cols = ['DC_POWER_CAT']

X_plant1_train, y_plant1_train = plant1_train[plant1_feature_cols], plant1_train[target_cols]
X_plant1_test, y_plant1_test = plant1_test[plant1_feature_cols], plant1_test[target_cols]

# Plant 2 uses the weather/time features only.
X_plant2_train, y_plant2_train = plant2_train[plant2_feature_cols], plant2_train[target_cols]
X_plant2_test, y_plant2_test = plant2_test[plant2_feature_cols], plant2_test[target_cols]

In [None]:
from sklearn.preprocessing import StandardScaler

z_score_scaler = StandardScaler()

# Explicit copies avoid the SettingWithCopyWarning raised when writing into
# frames that were sliced out of the train/test split.
X_plant2_train = X_plant2_train.copy()
X_plant2_test = X_plant2_test.copy()

# Every column except the last one is standardized (the last column is left
# untouched, as before).
scaled_cols = X_plant2_train.columns[:-1]

# Fit on the training data only ...
X_plant2_train_num = X_plant2_train[scaled_cols]
X_plant2_train[scaled_cols] = z_score_scaler.fit_transform(X_plant2_train_num)
X_plant2_train_standard = X_plant2_train[scaled_cols]

# ... and reuse the training mean/std for the test data. Re-fitting on the test
# set would standardize it with its own statistics, so train and test would no
# longer be on the same scale.
X_plant2_test_num = X_plant2_test[scaled_cols]
X_plant2_test[scaled_cols] = z_score_scaler.transform(X_plant2_test_num)
X_plant2_test_standard = X_plant2_test[scaled_cols]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
X_plant2_train_standard.describe()

Unnamed: 0,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
count,54158.0,54158.0,54158.0
mean,-3.311809e-16,-1.51378e-16,4.866917e-16
std,1.000009,1.000009,1.000009
min,-1.750417,-1.100157,-0.7419058
25%,-0.850383,-0.7956495,-0.7419058
50%,-0.2667801,-0.4619789,-0.6800694
75%,0.7256863,0.6657112,0.6519996
max,2.778286,3.027962,2.807649


## 4. Visual & Numeric Comparison 1

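Picking features: MODULE_TEMPERATURE and IRRADIATION. (HOUR was also intended, but it is not a column of `X_plant2_train_standard`, so the comparison below covers only these two.)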

In [None]:
# Compare mean and standard deviation of the selected features under the two
# scaling schemes (min-max vs. z-score).
cols_3features = ['MODULE_TEMPERATURE','IRRADIATION']

for heading, statistic in (('Mean', 'mean'), ('Standard Deviation', 'std')):
    print(heading)
    for feature in cols_3features:
        mms_value = getattr(X_plant2_train_mms[feature], statistic)()
        standard_value = getattr(X_plant2_train_standard[feature], statistic)()
        print(f'{feature} - Min-max Scaling: {mms_value:.04f} | Standardize: {standard_value:.04f}   ')
  

Mean
MODULE_TEMPERATURE - Min-max Scaling: 0.2665 | Standardize: -0.0000   
IRRADIATION - Min-max Scaling: 0.2090 | Standardize: 0.0000   
Standard Deviation
MODULE_TEMPERATURE - Min-max Scaling: 0.2422 | Standardize: 1.0000   
IRRADIATION - Min-max Scaling: 0.2817 | Standardize: 1.0000   


## 5. Visual & Numeric Comparison 2

In [None]:
# For each selected feature, draw three box plots in a row: raw values, min-max
# scaled values and standardized values.
scaling_variants = (
    (plant2_train, 'Raw'),
    (X_plant2_train_mms, 'Min Max Scaled'),
    (X_plant2_train_standard, 'Standardized'),
)
for feature in cols_3features:
    for variant_frame, variant_name in scaling_variants:
        fig = px.box(variant_frame, y=feature, width=640)
        fig.update_layout(
            title_font_color="green",
            title=dict(
                text=f'{feature} - {variant_name}',
                y=.925,
                x=0.5,
                xanchor='center',
                yanchor='top',
            ),
        )
        fig.show()


Output hidden; open in https://colab.research.google.com to view.

## k-NN

### Exploring k-NN Construction

In [None]:
from sklearn import neighbors

# Candidate classifiers: different neighbourhood sizes, plus uniform vs.
# distance-weighted voting for k=5.
KNN = neighbors.KNeighborsClassifier
knn = KNN(n_neighbors=3)
knn5_u = KNN(n_neighbors=5, weights='uniform')
knn5_d = KNN(n_neighbors=5, weights='distance')
knn11 = KNN(n_neighbors=11)
knn15 = KNN(n_neighbors=15)

In [None]:
X_plant2_train_standard.describe()

Unnamed: 0,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
count,54158.0,54158.0,54158.0
mean,-3.311809e-16,-1.51378e-16,4.866917e-16
std,1.000009,1.000009,1.000009
min,-1.750417,-1.100157,-0.7419058
25%,-0.850383,-0.7956495,-0.7419058
50%,-0.2667801,-0.4619789,-0.6800694
75%,0.7256863,0.6657112,0.6519996
max,2.778286,3.027962,2.807649


In [None]:
# Train the k=3 model on the standardized Plant 2 features.
# The one-column label frame is flattened to 1-D for fit().
X_train, y_train = X_plant2_train_standard, y_plant2_train
knn.fit(X_train, y_train.to_numpy().ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the same data the model was fitted on (training accuracy).
yhat_train = knn.predict(X_train)
accuracy_score(y_true=y_train, y_pred=yhat_train)

0.9119982274086931

In [None]:
# Fit the remaining candidates on the standardized Plant 2 training data.
# (The min-max scaled alternative would be X_plant2_train_mms / y_plant2_train_mms.)
plant2_labels = y_plant2_train.values.ravel()
for model in (knn5_u, knn5_d, knn11, knn15):
    model.fit(X_plant2_train_standard, plant2_labels)
knn15

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')

### k-NN Evaluation

#### Decision Boundaries

In [None]:
# select two features for plotting decision boundaries
X2_train = X_plant2_train_standard[['MODULE_TEMPERATURE','IRRADIATION']].values
X2_test = X_plant2_test_standard[['MODULE_TEMPERATURE','IRRADIATION']].values
#X_train_db=X_plant2_train_standard.values

In [None]:
y_test

In [None]:
# Temporarily encode the string class labels as integers (LOW=1, MEDIUM=2,
# HIGH=3) for visualization purposes.
# NOTE(review): y_train was last set to y_plant2_train, but y_test still comes
# from the train_test_split in the min-max scaling cell -- confirm both refer
# to the same split before relying on y_test.
levels = ['LOW','MEDIUM','HIGH']
numeric_levels = [1,2,3]
y_train=y_train.replace(levels,numeric_levels)
y_test=y_test.replace(levels,numeric_levels)


In [None]:
# Refit the uniform k=5 model on the two plotting features only; this replaces
# its earlier fit on the full standardized feature set.
knn5_u.fit(X2_train, y_train.to_numpy().ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Per-feature bounds for the decision-boundary grid.
# They must be taken from X2_train (column 0 = MODULE_TEMPERATURE, column 1 =
# IRRADIATION), the two features the plotted model was fitted on. Taking them
# from X_plant2_train_standard made index 0/1 refer to AMBIENT_TEMPERATURE and
# MODULE_TEMPERATURE instead, so the grid and axis ranges did not match the
# plotted features.
x_mins = np.min(X2_train, axis=0)
x_maxs = np.max(X2_train, axis=0)



In [None]:
# Dense grid (step 0.02, padded by 0.1 on each side) covering both feature axes.
# Every grid point becomes one row of X_vis so the classifier can label it.
grid_pad, grid_step = 0.1, 0.02
x0_vis_range = np.arange(x_mins[0] - grid_pad, x_maxs[0] + grid_pad, grid_step)
x1_vis_range = np.arange(x_mins[1] - grid_pad, x_maxs[1] + grid_pad, grid_step)
XX0_vis, XX1_vis = np.meshgrid(x0_vis_range, x1_vis_range)
X_vis = np.column_stack([XX0_vis.ravel(), XX1_vis.ravel()])



In [None]:
yhat_vis = knn5_u.predict(X_vis)


In [None]:
# Fold the flat grid predictions back into the mesh shape for the heatmap, and
# predict the held-out points that are overlaid on it.
YYhat_vis = np.reshape(yhat_vis, XX0_vis.shape)
yhat_test = knn5_u.predict(X2_test)

In [None]:
import plotly.graph_objects as go

# Background: predicted class at every grid point (the decision regions).
decision_surface = go.Heatmap(z=YYhat_vis,
                              x=x0_vis_range,
                              y=x1_vis_range,
                              showscale=False)
# Training points, coloured by their true (numeric) label, thin outline.
train_points = go.Scatter(x=X2_train[:, 0],
                          y=X2_train[:, 1],
                          mode='markers',
                          marker_color=y_train,
                          marker_line_width=1)
# Test points, coloured by their predicted label, thick outline.
test_points = go.Scatter(x=X2_test[:, 0],
                         y=X2_test[:, 1],
                         mode='markers',
                         marker_color=yhat_test,
                         marker_line_width=3)

fig = go.Figure(data=[decision_surface, train_points, test_points])
fig.update_layout(showlegend=False)

x_axis_range = [x_mins[0] - 0.1, x_maxs[0] + 0.1]
y_axis_range = [x_mins[1] - 0.1, x_maxs[1] + 0.1]
fig.update_xaxes(range=x_axis_range, title='MODULE_TEMPERATURE')
fig.update_yaxes(range=y_axis_range, title='IRRADIATION')

fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Undo the temporary integer encoding (1/2/3) used for the decision-boundary
# plot, restoring the LOW/MEDIUM/HIGH string labels on y_train and y_test.
# NOTE(review): knn5_u was fitted on the numeric labels, so its predictions are
# still 1/2/3 -- confirm before comparing them with these string labels.
numeric_levels = [1,2,3]
levels = ['LOW','MEDIUM','HIGH']
y_train=y_train.replace(numeric_levels,levels)
y_test=y_test.replace(numeric_levels,levels)

#### Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Preview the merged Plant 2 frame (pandas truncates the rendering to the first and last rows).
dfplant2

Unnamed: 0,DATE_TIME,PLANT_ID,INVERTER_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL,TIME_PEAK_POWER,DC_POWER_CAT
0,2020-05-15 00:00:00,4136001,4UPUqMRk7TRMgml,0.0,0.0,9425.000000,2.429011e+06,27.004764,25.060789,0.0,0,,False,LOW
1,2020-05-15 00:00:00,4136001,81aHJ1q11NBPMrL,0.0,0.0,0.000000,1.215279e+09,27.004764,25.060789,0.0,0,,False,LOW
2,2020-05-15 00:00:00,4136001,9kRcWv60rDACzjR,0.0,0.0,3075.333333,2.247720e+09,27.004764,25.060789,0.0,0,,False,LOW
3,2020-05-15 00:00:00,4136001,Et9kgGMDl729KT4,0.0,0.0,269.933333,1.704250e+06,27.004764,25.060789,0.0,0,,False,LOW
4,2020-05-15 00:00:00,4136001,IQ2d7wF4YD8zU1Q,0.0,0.0,3177.000000,1.994153e+07,27.004764,25.060789,0.0,0,,False,LOW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67693,2020-06-17 23:45:00,4136001,q49J1IKaHRwDQnt,0.0,0.0,4157.000000,5.207580e+05,23.202871,22.535908,0.0,23,,False,LOW
67694,2020-06-17 23:45:00,4136001,rrq4fwE8jgrTyWY,0.0,0.0,3931.000000,1.211314e+08,23.202871,22.535908,0.0,23,,False,LOW
67695,2020-06-17 23:45:00,4136001,vOuJvMaM2sgwLmb,0.0,0.0,4322.000000,2.427691e+06,23.202871,22.535908,0.0,23,,False,LOW
67696,2020-06-17 23:45:00,4136001,xMbIugepa2P7lBB,0.0,0.0,4218.000000,1.068964e+08,23.202871,22.535908,0.0,23,,False,LOW


In [None]:
# Summary statistics of the numeric columns.
dfplant2.describe()

Unnamed: 0,PLANT_ID,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR,IRL
count,67698.0,67698.0,67698.0,67698.0,67698.0,67698.0,67698.0,67698.0,67698.0,32036.0
mean,4136001.0,246.701961,241.277825,3294.890295,658944800.0,27.986756,32.607233,0.229204,11.530208,1.023772
std,0.0,370.569597,362.112118,2919.448386,729667800.0,4.021294,11.226446,0.309365,6.922127,0.005292
min,4136001.0,0.0,0.0,0.0,0.0,20.942385,20.265123,0.0,0.0,0.991749
25%,4136001.0,0.0,0.0,272.75,19964940.0,24.570349,23.685627,0.0,6.0,1.020151
50%,4136001.0,0.0,0.0,2911.0,282627600.0,26.910352,27.433723,0.018554,12.0,1.022043
75%,4136001.0,446.591667,438.215,5534.0,1348495000.0,30.912601,40.019036,0.431027,18.0,1.025626
max,4136001.0,1420.933333,1385.42,9873.0,2247916000.0,39.181638,66.635953,1.098766,23.0,1.095543


In [None]:
# Keep only the numeric measurements plus the DC_POWER_CAT target: drop the
# timestamp, the identifier columns, TIME_PEAK_POWER and IRL.
dfplant22 = dfplant2.drop(
    labels=["DATE_TIME", "PLANT_ID", "INVERTER_ID", "TIME_PEAK_POWER", "IRL"],
    axis="columns",
)

In [None]:
# Check the remaining column types; DC_POWER_CAT is the categorical target.
dfplant22.dtypes

DC_POWER                float64
AC_POWER                float64
DAILY_YIELD             float64
TOTAL_YIELD             float64
AMBIENT_TEMPERATURE     float64
MODULE_TEMPERATURE      float64
IRRADIATION             float64
HOUR                      int64
DC_POWER_CAT           category
dtype: object

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore every score computed from it, reproducible across
# Restart & Run All.
df_plant2_train, df_plant2_test = train_test_split(
    dfplant22, test_size=0.2, random_state=42
)
# Features are every column except the target; selecting by name keeps this
# correct even if the column order changes.
# NOTE(review): the features include DC_POWER and AC_POWER; if DC_POWER_CAT is
# binned from DC_POWER this leaks the target into the features -- confirm.
# NOTE(review): the features are not scaled here, so large-magnitude columns
# such as TOTAL_YIELD will dominate the k-NN distance -- confirm intent.
X_dfplant22_train = df_plant2_train.drop(columns="DC_POWER_CAT")
y_dfplant22_train = df_plant2_train["DC_POWER_CAT"]
X_dfplant22_train

Unnamed: 0,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOUR
35623,0.000000,0.000000,3880.000000,1.215399e+09,27.327333,25.931245,0.000000,19
5409,66.546667,65.053333,2063.933333,1.784237e+06,36.103419,59.100530,0.688748,13
2976,1052.080000,1028.420000,2025.066667,2.826041e+08,32.422923,56.885681,0.821055,10
64953,173.950000,170.228571,5082.071429,1.795112e+09,29.002872,31.513063,0.117213,16
42892,11.953333,11.526667,0.466667,8.385822e+08,23.302659,22.519347,0.011218,6
...,...,...,...,...,...,...,...,...
22670,0.000000,0.000000,0.000000,1.795017e+09,25.632463,24.421985,0.000000,4
52210,425.014286,417.392857,5238.357143,2.623937e+06,28.792319,31.002017,0.275930,16
58095,0.000000,0.000000,0.000000,0.000000e+00,28.721466,45.337588,0.503415,10
8755,0.000000,0.000000,0.000000,2.091601e+08,25.132742,23.860702,0.000000,3


In [None]:
# Test-set counterpart of the training split: every column but the last as
# features, DC_POWER_CAT as the label.
X_dfplant22_test, y_dfplant22_test = (
    df_plant2_test.iloc[:, :-1],
    df_plant2_test["DC_POWER_CAT"],
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 11-nearest-neighbour classifier trained on all remaining Plant 2 features.
knn_dfplant22 = KNeighborsClassifier(n_neighbors=11)
knn_dfplant22.fit(X=X_dfplant22_train, y=y_dfplant22_train)

In [None]:
# Accuracy on the data the model was fitted on.
Yhat_dfplant2_train = knn_dfplant22.predict(X=X_dfplant22_train)
accuracy_score(y_true=y_dfplant22_train, y_pred=Yhat_dfplant2_train)

0.8989992244913032

In [None]:
# Accuracy on the held-out 20%.
Yhat_dfplant2_test = knn_dfplant22.predict(X=X_dfplant22_test)
accuracy_score(y_true=y_dfplant22_test, y_pred=Yhat_dfplant2_test)

0.8665435745937962

#### Evaluation Metrics

**Precision Score**

In [None]:
from sklearn.metrics import precision_score


In [None]:
# Micro-averaged precision on the train split, then the test split.
# With exactly one label per sample the micro average pools every decision,
# so it equals plain accuracy -- hence the same numbers as the accuracy cells.
display(
    precision_score(y_true=y_dfplant22_train, y_pred=Yhat_dfplant2_train, average='micro'),
    precision_score(y_true=y_dfplant22_test, y_pred=Yhat_dfplant2_test, average='micro'),
)

0.8989992244913032

0.8665435745937962

**Recall Score**

In [None]:
from sklearn.metrics import recall_score

In [None]:
# Micro-averaged recall on the train split, then the test split.
# For single-label multiclass data this is numerically the same as accuracy.
display(
    recall_score(y_true=y_dfplant22_train, y_pred=Yhat_dfplant2_train, average='micro'),
    recall_score(y_true=y_dfplant22_test, y_pred=Yhat_dfplant2_test, average='micro'),
)

0.8989992244913032

0.8665435745937962

**F1 Score**

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Micro-averaged F1 on the train split, then the test split.
# Micro precision and micro recall coincide here, so F1 matches them as well.
display(
    f1_score(y_true=y_dfplant22_train, y_pred=Yhat_dfplant2_train, average='micro'),
    f1_score(y_true=y_dfplant22_test, y_pred=Yhat_dfplant2_test, average='micro'),
)

0.8989992244913032

0.8665435745937962

#### Learning Curve

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
# Pick the scaled training set that feeds the learning curve.
# Standardized data (disabled alternative):
#   X_train = X_plant2_train_standard
#   y_train = y_plant2_train
# Min-max scaled data (active):
# NOTE(review): this rebinds y_train, which earlier cells use for the
# decision-surface labels; re-running those cells after this one will see
# different data.
X_train = X_plant2_train_mms
y_train = y_plant2_train_mms

# 10-fold cross-validated accuracy at 51 training-set fractions from 1% to 100%.
# Each returned score array has one row per training size and one column per fold.
data_sizes, training_scores, validation_scores = learning_curve(
    knn,
    X_train,
    y_train.values.ravel(),  # flatten the labels to the 1-D target shape
    cv=10,
    scoring='accuracy',
    train_sizes=np.linspace(0.01, 1.0, 51),
    # scikit-learn only consults random_state when shuffle=True, so with the
    # default shuffle=False this argument has no effect on the result.
    random_state=42,
)

In [None]:
# Training-set sizes returned by learning_curve (first element of its result).
display(data_sizes)

In [None]:
# Per-fold training accuracy, followed by the array's shape.
display(training_scores, training_scores.shape)

In [None]:
# Per-fold validation accuracy, followed by the array's shape.
display(validation_scores, validation_scores.shape)

In [None]:
# Collapse the fold axis: one mean and one standard deviation per training size.
training_mean = np.mean(training_scores, axis=1)
training_standard_deviation = np.std(training_scores, axis=1)

In [None]:
# Same fold-wise aggregation for the validation scores.
validation_mean = np.mean(validation_scores, axis=1)
validation_standard_deviation = np.std(validation_scores, axis=1)

In [None]:
import plotly.graph_objects as go

In [None]:
def _band_traces(label, center, spread, line_color, fill_rgba):
    """Build the three traces for one score series of the learning curve.

    Returns, in drawing order: the mean line, an invisible lower-bound line
    (center - spread) and an invisible upper-bound line (center + spread)
    that is filled down to the trace before it, producing the shaded band.
    """
    invisible_edge = dict(width=0, color=line_color)
    return [
        go.Scatter(x=data_sizes,
                   y=center,
                   mode='lines',
                   name=label,
                   line=dict(color=line_color)),
        go.Scatter(x=data_sizes,
                   y=center - spread,
                   mode='lines',
                   name=label + ' lower bound',
                   line=invisible_edge,
                   showlegend=False),
        go.Scatter(x=data_sizes,
                   y=center + spread,
                   mode='lines',
                   name=label + ' upper bound',
                   line=invisible_edge,
                   fill='tonexty',  # fill to the previous trace (the lower bound)
                   fillcolor=fill_rgba,
                   showlegend=False),
    ]


# Mean accuracy with a +/- one standard deviation band: training in red,
# validation in blue. Trace order matters because 'tonexty' fills towards the
# trace added immediately before.
fig = go.Figure()
fig.add_traces(_band_traces('Training',
                            training_mean,
                            training_standard_deviation,
                            'red',
                            'rgba(255, 0, 0, 0.3)'))
fig.add_traces(_band_traces('Validation',
                            validation_mean,
                            validation_standard_deviation,
                            'blue',
                            'rgba(0, 0, 255, 0.3)'))

fig.update_layout(title='Learning curve',
                  xaxis_title='Dataset size',
                  yaxis_title='Accuracy')
fig.show()