# Process data, and split training, validation and test sets

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from fastbook import *
from fastai.tabular.all import *
from dtreeviz.trees import * 
 
pd.options.display.max_rows = 20
pd.options.display.max_columns = 8



In [3]:
path = Path('../../..')
Path.BASE_PATH = path


In [4]:
wdf_2016 = pd.read_csv(path/'data/processed/Onsite-MetMast-SCADA-data-2016.csv', low_memory = False)
wdf_2017 = pd.read_csv(path/'data/processed/Onsite-MetMast-SCADA-data-2017.csv', low_memory=False)
tdf_2016 = pd.read_csv(path/'data/processed/Wind-Turbine-SCADA-signals-2016.csv',low_memory = False)
tdf_2017 = pd.read_csv(path/'data/processed/Wind-Turbine-SCADA-signals-2017_0.csv', low_memory=False)

In [5]:
df_2016 = pd.merge(wdf_2016, tdf_2016, on='Timestamp', how='outer')
df_2017 = pd.merge(wdf_2017, tdf_2017, on='Timestamp', how='outer')


In [6]:
# Using pandas boolean indexing to create new dataframe where Turbine ID = T07
df_2016_T07 = df_2016[df_2016['Turbine_ID']=='T07'].copy().reset_index(drop=True)
df_2017_T07 = df_2017[df_2017['Turbine_ID']=='T07'].copy().reset_index(drop=True)

In [7]:
df_2016_T07, df_2017_T07 = add_datepart(df_2016_T07, 'Timestamp', drop=False), add_datepart(df_2017_T07, 'Timestamp',drop=False)



In [8]:
df_2016_T07

Unnamed: 0,Unnamed: 0_x,Timestamp,Min_Windspeed1,Max_Windspeed1,...,TimestampIs_quarter_start,TimestampIs_year_end,TimestampIs_year_start,TimestampElapsed
0,19960.0,2016-01-01 00:00:00+00:00,3.7,6.0,...,True,False,True,1.451606e+09
1,47293.0,2016-01-01 00:10:00+00:00,4.1,6.0,...,True,False,True,1.451607e+09
2,47291.0,2016-01-01 00:20:00+00:00,4.5,6.7,...,True,False,True,1.451608e+09
3,11435.0,2016-01-01 00:30:00+00:00,5.1,7.0,...,True,False,True,1.451608e+09
4,47287.0,2016-01-01 00:40:00+00:00,4.7,7.3,...,True,False,True,1.451609e+09
...,...,...,...,...,...,...,...,...,...
52440,19915.0,2016-12-31 23:10:00+00:00,4.3,8.4,...,False,True,False,1.483226e+09
52441,16704.0,2016-12-31 23:20:00+00:00,4.2,9.3,...,False,True,False,1.483226e+09
52442,50209.0,2016-12-31 23:30:00+00:00,3.7,8.6,...,False,True,False,1.483227e+09
52443,27903.0,2016-12-31 23:40:00+00:00,4.5,9.5,...,False,True,False,1.483228e+09


In [9]:
dep_var = "Gen_Bear_Temp_Avg"

In [10]:
procs = [Categorify, FillMissing]

In [11]:
training_percentage = 0.66 # split is 33% training data, 17% validation data and 50% test data like in Olivers rapport. 66% training here because we only use 2016 dataset for test/validation
split_index = int(len(df_2016_T07) * training_percentage)

# split dataframe
train = df_2016_T07.iloc[:split_index].index
validation = df_2016_T07.iloc[split_index:].index

splits = (list(train), list(validation))

In [12]:
split_index

34613

In [13]:
cont,cat = cont_cat_split(df_2016_T07,1,dep_var=dep_var)

In [14]:
to = TabularPandas(df_2016_T07, procs, cat, cont, y_names=dep_var, splits=splits)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

In [15]:
save_pickle(path/'models/splits.pkl',to)