# Data cleaning and feature engineering

In [1]:
# import libraries
import numpy as np
import pandas as pd

In [2]:
# read the dataframe
data_load_path = '../data/raw/' 
data_load_name = '50.861781_4.457452_Solcast_PT60M.csv'
df = pd.read_csv(data_load_path + data_load_name)

In [3]:
# printout the column names
df.columns

Index(['PeriodEnd', 'PeriodStart', 'Period', 'AirTemp', 'Azimuth',
       'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni', 'Ebh', 'Ghi',
       'GtiFixedTilt', 'GtiTracking', 'PrecipitableWater', 'RelativeHumidity',
       'SnowWater', 'SurfacePressure', 'WindDirection10m', 'WindSpeed10m',
       'Zenith', 'AlbedoDaily'],
      dtype='object')

In [4]:
# select some features
df = df[['PeriodEnd', 'PeriodStart', 'AirTemp', 'Dhi', 'Dni', 'Ghi',
       'PrecipitableWater', 'RelativeHumidity', 'SurfacePressure',
       'WindDirection10m', 'WindSpeed10m']]

In [5]:
# randomly display 10 rows
df.sample(10)

Unnamed: 0,PeriodEnd,PeriodStart,AirTemp,Dhi,Dni,Ghi,PrecipitableWater,RelativeHumidity,SurfacePressure,WindDirection10m,WindSpeed10m
1331,2019-02-24T13:00:00Z,2019-02-24T12:00:00Z,13.5,68,874,486,9.7,33.8,1026.5,151,2.1
10921,2020-03-30T03:00:00Z,2020-03-30T02:00:00Z,-1.2,0,0,0,4.0,65.3,1026.4,27,5.0
10838,2020-03-26T16:00:00Z,2020-03-26T15:00:00Z,10.1,93,689,360,5.7,40.5,1012.8,66,5.6
16150,2020-11-03T00:00:00Z,2020-11-02T23:00:00Z,9.8,0,0,0,10.9,77.6,1012.1,238,6.1
6255,2019-09-17T17:00:00Z,2019-09-17T16:00:00Z,15.5,75,191,117,19.1,62.3,1016.3,349,5.1
11412,2020-04-19T14:00:00Z,2020-04-19T13:00:00Z,14.8,211,0,211,17.6,70.4,1010.1,60,5.6
19562,2021-03-25T04:00:00Z,2021-03-25T03:00:00Z,6.3,0,0,0,16.0,74.2,1013.5,241,2.3
1382,2019-02-26T16:00:00Z,2019-02-26T15:00:00Z,15.0,49,695,221,6.4,51.4,1021.6,115,2.4
1839,2019-03-17T17:00:00Z,2019-03-17T16:00:00Z,6.8,99,113,121,7.6,58.9,997.0,266,6.4
2725,2019-04-23T15:00:00Z,2019-04-23T14:00:00Z,22.6,281,353,504,17.6,39.3,985.5,106,4.1


In [6]:
old_names = ['PeriodEnd', 'PeriodStart', 'AirTemp', 'Dhi', 'Dni', 'Ghi','PrecipitableWater', 'RelativeHumidity',
             'SurfacePressure', 'WindDirection10m', 'WindSpeed10m'] 
new_names = ['PeriodEnd','PeriodStart','Temperature', 'DHI', 'DNI', 'Radiation', 'Precipitation', 'Humidity', 
             'Pressure', 'WindDirection', 'WindSpeed']
df.rename(columns = dict(zip(old_names, new_names)), inplace = True)

In [7]:
# display dataframe
df.head()

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed
0,2018-12-31T02:00:00Z,2018-12-31T01:00:00Z,8.2,0,0,0,15.5,98.3,1023.9,283,3.4
1,2018-12-31T03:00:00Z,2018-12-31T02:00:00Z,8.0,0,0,0,15.0,98.1,1024.0,284,3.6
2,2018-12-31T04:00:00Z,2018-12-31T03:00:00Z,7.9,0,0,0,14.5,98.0,1024.0,286,3.6
3,2018-12-31T05:00:00Z,2018-12-31T04:00:00Z,7.8,0,0,0,14.0,98.0,1024.0,288,3.5
4,2018-12-31T06:00:00Z,2018-12-31T05:00:00Z,7.8,0,0,0,13.6,98.1,1024.0,291,3.4


In [8]:
# show a summary of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25989 entries, 0 to 25988
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PeriodEnd      25989 non-null  object 
 1   PeriodStart    25989 non-null  object 
 2   Temperature    25989 non-null  float64
 3   DHI            25989 non-null  int64  
 4   DNI            25989 non-null  int64  
 5   Radiation      25989 non-null  int64  
 6   Precipitation  25989 non-null  float64
 7   Humidity       25989 non-null  float64
 8   Pressure       25989 non-null  float64
 9   WindDirection  25989 non-null  int64  
 10  WindSpeed      25989 non-null  float64
dtypes: float64(5), int64(4), object(2)
memory usage: 2.2+ MB


In [9]:
 # interpret columns as appropriate data types to ensure compatibility
df['Radiation']     = df['Radiation'].astype(float)
df['Temperature']   = df['Temperature'].astype(float) # or int
df['Pressure']      = df['Pressure'].astype(float)
df['Humidity']      = df['Humidity'].astype(int) # or int
df['WindDirection'] = df['WindDirection'].astype(float)
df['WindSpeed']     = df['WindSpeed'].astype(float)
df['PeriodStart'] = pd.to_datetime(df['PeriodStart']).dt.to_period('T').dt.to_timestamp()
df['PeriodEnd'] = pd.to_datetime(df['PeriodEnd']).dt.to_period('T').dt.to_timestamp()



In [10]:
# display dataframe summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25989 entries, 0 to 25988
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      25989 non-null  datetime64[ns]
 1   PeriodStart    25989 non-null  datetime64[ns]
 2   Temperature    25989 non-null  float64       
 3   DHI            25989 non-null  int64         
 4   DNI            25989 non-null  int64         
 5   Radiation      25989 non-null  float64       
 6   Precipitation  25989 non-null  float64       
 7   Humidity       25989 non-null  int32         
 8   Pressure       25989 non-null  float64       
 9   WindDirection  25989 non-null  float64       
 10  WindSpeed      25989 non-null  float64       
dtypes: datetime64[ns](2), float64(6), int32(1), int64(2)
memory usage: 2.1 MB


In [11]:
# display top 6 rows
df.head(6)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed
0,2018-12-31 02:00:00,2018-12-31 01:00:00,8.2,0,0,0.0,15.5,98,1023.9,283.0,3.4
1,2018-12-31 03:00:00,2018-12-31 02:00:00,8.0,0,0,0.0,15.0,98,1024.0,284.0,3.6
2,2018-12-31 04:00:00,2018-12-31 03:00:00,7.9,0,0,0.0,14.5,98,1024.0,286.0,3.6
3,2018-12-31 05:00:00,2018-12-31 04:00:00,7.8,0,0,0.0,14.0,98,1024.0,288.0,3.5
4,2018-12-31 06:00:00,2018-12-31 05:00:00,7.8,0,0,0.0,13.6,98,1024.0,291.0,3.4
5,2018-12-31 07:00:00,2018-12-31 06:00:00,7.7,0,0,0.0,13.1,98,1024.2,293.0,3.2


In [12]:
# Create new features(year, month, day) from start and end time of data collection  
df['YearPS'] = pd.to_datetime(df['PeriodStart']).dt.year
df['YearPE'] = pd.to_datetime(df['PeriodEnd']).dt.year

df['MonthPS'] = pd.to_datetime(df['PeriodStart']).dt.month
df['MonthPE'] = pd.to_datetime(df['PeriodEnd']).dt.month

df['HourPS'] = pd.to_datetime(df['PeriodStart']).dt.hour
df['HourPE'] = pd.to_datetime(df['PeriodEnd']).dt.hour

In [13]:
df.head(5)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE
0,2018-12-31 02:00:00,2018-12-31 01:00:00,8.2,0,0,0.0,15.5,98,1023.9,283.0,3.4,2018,2018,12,12,1,2
1,2018-12-31 03:00:00,2018-12-31 02:00:00,8.0,0,0,0.0,15.0,98,1024.0,284.0,3.6,2018,2018,12,12,2,3
2,2018-12-31 04:00:00,2018-12-31 03:00:00,7.9,0,0,0.0,14.5,98,1024.0,286.0,3.6,2018,2018,12,12,3,4
3,2018-12-31 05:00:00,2018-12-31 04:00:00,7.8,0,0,0.0,14.0,98,1024.0,288.0,3.5,2018,2018,12,12,4,5
4,2018-12-31 06:00:00,2018-12-31 05:00:00,7.8,0,0,0.0,13.6,98,1024.0,291.0,3.4,2018,2018,12,12,5,6


In [14]:
# create a new feature for sunrise using the function below; fill time for sunrise
def time_to_sunrise(row):
    """
    A function for identifying time of sunrise.
    """
    value= 0
    if row['HourPS'] == 4 and row['HourPE'] == 5 and row['Radiation'] > 0:
        value= '05:00:00'
    elif row['HourPS'] == 5 and row['HourPE'] == 6 and row['Radiation'] > 0:
        value= '06:00:00'
    return value

In [15]:
#Create a new column using the function above
df['Sunrise'] = df.apply(time_to_sunrise, axis=1)
df.head(5)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise
0,2018-12-31 02:00:00,2018-12-31 01:00:00,8.2,0,0,0.0,15.5,98,1023.9,283.0,3.4,2018,2018,12,12,1,2,0
1,2018-12-31 03:00:00,2018-12-31 02:00:00,8.0,0,0,0.0,15.0,98,1024.0,284.0,3.6,2018,2018,12,12,2,3,0
2,2018-12-31 04:00:00,2018-12-31 03:00:00,7.9,0,0,0.0,14.5,98,1024.0,286.0,3.6,2018,2018,12,12,3,4,0
3,2018-12-31 05:00:00,2018-12-31 04:00:00,7.8,0,0,0.0,14.0,98,1024.0,288.0,3.5,2018,2018,12,12,4,5,0
4,2018-12-31 06:00:00,2018-12-31 05:00:00,7.8,0,0,0.0,13.6,98,1024.0,291.0,3.4,2018,2018,12,12,5,6,0


In [16]:
# create a new feature for sunset using the function below; fill time for sunset
def time_to_sunset(row):
    """
    A function for identifying time of sunset.
    """
    value= 0
    if row['HourPS'] == 17 and row['HourPE'] == 18 and row['Radiation']> 0:
        value= '18:00:00'
    elif row['HourPS'] == 18 and row['HourPE']== 19 and row['Radiation']> 0:
        value= '19:00:00'
    return value

In [17]:
#Create a new column using the function above
df['Sunset'] = df.apply(time_to_sunset, axis=1)

In [18]:
df.Sunset.unique()

array([0, '18:00:00', '19:00:00'], dtype=object)

In [19]:
# show summary of our dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25989 entries, 0 to 25988
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      25989 non-null  datetime64[ns]
 1   PeriodStart    25989 non-null  datetime64[ns]
 2   Temperature    25989 non-null  float64       
 3   DHI            25989 non-null  int64         
 4   DNI            25989 non-null  int64         
 5   Radiation      25989 non-null  float64       
 6   Precipitation  25989 non-null  float64       
 7   Humidity       25989 non-null  int32         
 8   Pressure       25989 non-null  float64       
 9   WindDirection  25989 non-null  float64       
 10  WindSpeed      25989 non-null  float64       
 11  YearPS         25989 non-null  int64         
 12  YearPE         25989 non-null  int64         
 13  MonthPS        25989 non-null  int64         
 14  MonthPE        25989 non-null  int64         
 15  HourPS         2598

In [20]:
# changing integer values to datetime format
df_new = df.loc[~((df['Radiation'] == 0) & (df['DNI'] == 0) & (df['DHI'] == 0)),:]

In [21]:
# show summary of our dataframe
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13915 entries, 7 to 25982
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      13915 non-null  datetime64[ns]
 1   PeriodStart    13915 non-null  datetime64[ns]
 2   Temperature    13915 non-null  float64       
 3   DHI            13915 non-null  int64         
 4   DNI            13915 non-null  int64         
 5   Radiation      13915 non-null  float64       
 6   Precipitation  13915 non-null  float64       
 7   Humidity       13915 non-null  int32         
 8   Pressure       13915 non-null  float64       
 9   WindDirection  13915 non-null  float64       
 10  WindSpeed      13915 non-null  float64       
 11  YearPS         13915 non-null  int64         
 12  YearPE         13915 non-null  int64         
 13  MonthPS        13915 non-null  int64         
 14  MonthPE        13915 non-null  int64         
 15  HourPS         1391

In [22]:
# display 10 random samples of the dataframe
df_new.sample(10)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise,Sunset
10639,2020-03-18 09:00:00,2020-03-18 08:00:00,10.7,87,694,357.0,13.8,75,1020.9,225.0,5.5,2020,2020,3,3,8,9,0,0
3540,2019-05-27 14:00:00,2019-05-27 13:00:00,18.1,480,15,492.0,19.1,49,999.5,270.0,5.2,2019,2019,5,5,13,14,0,0
11050,2020-04-04 12:00:00,2020-04-04 11:00:00,13.2,382,251,558.0,10.8,49,1016.4,160.0,3.1,2020,2020,4,4,11,12,0,0
127,2019-01-05 09:00:00,2019-01-05 08:00:00,4.7,13,0,13.0,12.6,94,1023.8,316.0,5.1,2019,2019,1,1,8,9,0,0
8530,2019-12-21 12:00:00,2019-12-21 11:00:00,8.3,123,213,180.0,13.7,78,982.3,196.0,7.4,2019,2019,12,12,11,12,0,0
13763,2020-07-26 13:00:00,2020-07-26 12:00:00,22.2,330,83,398.0,20.5,51,1006.0,265.0,5.5,2020,2020,7,7,12,13,0,0
11609,2020-04-27 19:00:00,2020-04-27 18:00:00,15.4,20,12,21.0,22.3,67,999.5,354.0,1.9,2020,2020,4,4,18,19,0,19:00:00
13693,2020-07-23 15:00:00,2020-07-23 14:00:00,24.6,210,597,636.0,27.0,39,1008.6,233.0,3.2,2020,2020,7,7,14,15,0,0
5825,2019-08-30 19:00:00,2019-08-30 18:00:00,19.0,6,48,10.0,17.9,45,1014.2,150.0,0.9,2019,2019,8,8,18,19,0,19:00:00
12135,2020-05-19 17:00:00,2020-05-19 16:00:00,20.7,89,675,383.0,25.4,58,1015.2,325.0,4.4,2020,2020,5,5,16,17,0,0


In [23]:
# convert datetime column of endperiod to just date
df_new['Date'] = pd.to_datetime(df_new['PeriodEnd']).dt.normalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Date'] = pd.to_datetime(df_new['PeriodEnd']).dt.normalize()


In [24]:
# convert the column (it's a string) to datetime type
dt_series = pd.to_datetime(df_new['Date'])

# create datetime index passing the datetime series
dt_index = pd.DatetimeIndex(dt_series.values)

# 
df_new1 = df_new.set_index(dt_index)

In [25]:
# display first 5 rows
df_new1.head()

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise,Sunset,Date
2018-12-31,2018-12-31 09:00:00,2018-12-31 08:00:00,7.5,10,0,10.0,12.1,98,1025.0,293.0,2.7,2018,2018,12,12,8,9,0,0,2018-12-31
2018-12-31,2018-12-31 10:00:00,2018-12-31 09:00:00,7.8,41,0,41.0,11.6,97,1025.1,296.0,2.6,2018,2018,12,12,9,10,0,0,2018-12-31
2018-12-31,2018-12-31 11:00:00,2018-12-31 10:00:00,8.4,61,0,61.0,11.2,96,1025.0,300.0,2.9,2018,2018,12,12,10,11,0,0,2018-12-31
2018-12-31,2018-12-31 12:00:00,2018-12-31 11:00:00,8.9,49,0,49.0,10.9,94,1024.8,304.0,3.2,2018,2018,12,12,11,12,0,0,2018-12-31
2018-12-31,2018-12-31 13:00:00,2018-12-31 12:00:00,8.9,37,0,37.0,10.5,93,1024.7,302.0,3.0,2018,2018,12,12,12,13,0,0,2018-12-31


In [26]:
# 
df_new1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13915 entries, 2018-12-31 to 2021-12-17
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      13915 non-null  datetime64[ns]
 1   PeriodStart    13915 non-null  datetime64[ns]
 2   Temperature    13915 non-null  float64       
 3   DHI            13915 non-null  int64         
 4   DNI            13915 non-null  int64         
 5   Radiation      13915 non-null  float64       
 6   Precipitation  13915 non-null  float64       
 7   Humidity       13915 non-null  int32         
 8   Pressure       13915 non-null  float64       
 9   WindDirection  13915 non-null  float64       
 10  WindSpeed      13915 non-null  float64       
 11  YearPS         13915 non-null  int64         
 12  YearPE         13915 non-null  int64         
 13  MonthPS        13915 non-null  int64         
 14  MonthPE        13915 non-null  int64         
 15  Ho

In [27]:
# sample daily average for the solar and weather features
df_new1['Daily_radiation'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Radiation'].mean()
df_new1['Daily_DNI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DNI'].mean()
df_new1['Daily_DHI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DHI'].mean()
df_new1['Daily_Temp'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Temperature'].mean()
df_new1['Daily_Precip'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Precipitation'].mean()
df_new1['Daily_Humidity'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Humidity'].mean()
df_new1['Daily_Pressure'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Pressure'].mean()
df_new1['Daily_WindDir'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['WindDirection'].mean()
df_new1['Daily_WindSpeed'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['WindSpeed'].mean()

In [28]:
# display a summary of our dataframe
df_new1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13915 entries, 2018-12-31 to 2021-12-17
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PeriodEnd        13915 non-null  datetime64[ns]
 1   PeriodStart      13915 non-null  datetime64[ns]
 2   Temperature      13915 non-null  float64       
 3   DHI              13915 non-null  int64         
 4   DNI              13915 non-null  int64         
 5   Radiation        13915 non-null  float64       
 6   Precipitation    13915 non-null  float64       
 7   Humidity         13915 non-null  int32         
 8   Pressure         13915 non-null  float64       
 9   WindDirection    13915 non-null  float64       
 10  WindSpeed        13915 non-null  float64       
 11  YearPS           13915 non-null  int64         
 12  YearPE           13915 non-null  int64         
 13  MonthPS          13915 non-null  int64         
 14  MonthPE          1391

In [29]:
# drop irrelevant features
new_df= df_new1.drop(['PeriodStart', 'YearPS', 'HourPS', 'HourPE', 'Sunrise', 'Sunset', 'MonthPS', 'YearPE'], axis= 1)

In [30]:
# display first 10 rows
new_df.head(10)

Unnamed: 0,PeriodEnd,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,...,Date,Daily_radiation,Daily_DNI,Daily_DHI,Daily_Temp,Daily_Precip,Daily_Humidity,Daily_Pressure,Daily_WindDir,Daily_WindSpeed
2018-12-31,2018-12-31 09:00:00,7.5,10,0,10.0,12.1,98,1025.0,293.0,2.7,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 10:00:00,7.8,41,0,41.0,11.6,97,1025.1,296.0,2.6,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 11:00:00,8.4,61,0,61.0,11.2,96,1025.0,300.0,2.9,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 12:00:00,8.9,49,0,49.0,10.9,94,1024.8,304.0,3.2,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 13:00:00,8.9,37,0,37.0,10.5,93,1024.7,302.0,3.0,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 14:00:00,8.2,43,0,43.0,10.1,94,1024.5,294.0,2.4,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 15:00:00,7.6,15,0,15.0,9.7,95,1024.4,282.0,1.9,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2018-12-31,2018-12-31 16:00:00,7.0,6,0,6.0,9.5,95,1024.3,269.0,1.9,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2019-01-01,2019-01-01 09:00:00,6.2,13,0,13.0,12.7,94,1021.2,274.0,5.1,...,2019-01-01,37.0,9.75,36.125,6.8625,12.2125,94.25,1020.6875,290.875,5.4375
2019-01-01,2019-01-01 10:00:00,6.4,32,0,32.0,13.0,94,1021.1,276.0,5.2,...,2019-01-01,37.0,9.75,36.125,6.8625,12.2125,94.25,1020.6875,290.875,5.4375


In [31]:
# show a summary of the dataframe
new_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 13915 entries, 2018-12-31 to 2021-12-17
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PeriodEnd        13915 non-null  datetime64[ns]
 1   Temperature      13915 non-null  float64       
 2   DHI              13915 non-null  int64         
 3   DNI              13915 non-null  int64         
 4   Radiation        13915 non-null  float64       
 5   Precipitation    13915 non-null  float64       
 6   Humidity         13915 non-null  int32         
 7   Pressure         13915 non-null  float64       
 8   WindDirection    13915 non-null  float64       
 9   WindSpeed        13915 non-null  float64       
 10  MonthPE          13915 non-null  int64         
 11  Date             13915 non-null  datetime64[ns]
 12  Daily_radiation  13915 non-null  float64       
 13  Daily_DNI        13915 non-null  float64       
 14  Daily_DHI        1391

In [32]:
# dropping ALL duplicate values exceept the last value
new_df = new_df[~new_df.Date.duplicated(keep = 'last')]

In [33]:
# show the number of rows and columns available
new_df.shape

(1083, 21)

In [34]:
# 
new_df.head(10)

Unnamed: 0,PeriodEnd,Temperature,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,...,Date,Daily_radiation,Daily_DNI,Daily_DHI,Daily_Temp,Daily_Precip,Daily_Humidity,Daily_Pressure,Daily_WindDir,Daily_WindSpeed
2018-12-31,2018-12-31 16:00:00,7.0,6,0,6.0,9.5,95,1024.3,269.0,1.9,...,2018-12-31,32.75,0.0,32.75,8.0375,10.7,95.25,1024.725,292.5,2.575
2019-01-01,2019-01-01 16:00:00,6.8,9,59,14.0,10.6,93,1020.7,315.0,5.8,...,2019-01-01,37.0,9.75,36.125,6.8625,12.2125,94.25,1020.6875,290.875,5.4375
2019-01-02,2019-01-02 16:00:00,3.2,10,33,13.0,7.8,72,1028.6,351.0,3.6,...,2019-01-02,48.555556,8.333333,47.666667,4.188889,7.6,73.444444,1027.588889,345.111111,4.544444
2019-01-03,2019-01-03 16:00:00,4.1,9,0,9.0,9.3,79,1029.6,352.0,2.6,...,2019-01-03,50.555556,3.333333,50.333333,3.555556,8.977778,80.333333,1029.488889,342.888889,2.533333
2019-01-04,2019-01-04 16:00:00,3.5,6,0,6.0,9.3,79,1027.2,303.0,4.6,...,2019-01-04,53.25,2.75,52.5,3.1875,9.025,83.25,1028.2375,314.625,4.05
2019-01-05,2019-01-05 16:00:00,4.0,10,0,10.0,8.8,92,1023.9,311.0,5.3,...,2019-01-05,32.5,0.0,32.5,4.95,10.425,93.375,1023.825,315.0,5.575
2019-01-06,2019-01-06 16:00:00,3.7,5,1,5.0,10.0,87,1024.2,305.0,1.6,...,2019-01-06,37.25,6.125,36.625,4.225,9.6,89.0,1024.2,306.375,2.7
2019-01-07,2019-01-07 16:00:00,6.6,5,0,5.0,19.7,92,1019.5,237.0,6.0,...,2019-01-07,42.25,0.0,42.25,5.3875,16.975,93.875,1021.95,242.0,4.725
2019-01-08,2019-01-08 16:00:00,4.8,8,0,8.0,10.1,83,1010.3,317.0,8.9,...,2019-01-08,62.0,16.5,57.5,5.8125,10.4375,82.375,1010.2875,310.0,9.0
2019-01-09,2019-01-09 16:00:00,3.2,15,48,19.0,6.7,76,1012.1,354.0,6.2,...,2019-01-09,115.25,124.375,85.0,3.975,7.25,80.875,1011.2375,350.5,6.625


In [35]:
# list out the column names
new_df.columns

Index(['PeriodEnd', 'Temperature', 'DHI', 'DNI', 'Radiation', 'Precipitation',
       'Humidity', 'Pressure', 'WindDirection', 'WindSpeed', 'MonthPE', 'Date',
       'Daily_radiation', 'Daily_DNI', 'Daily_DHI', 'Daily_Temp',
       'Daily_Precip', 'Daily_Humidity', 'Daily_Pressure', 'Daily_WindDir',
       'Daily_WindSpeed'],
      dtype='object')

In [36]:
# select column features
final_df= new_df[['MonthPE', 'Date','Daily_Temp','Daily_Precip', 'Daily_Humidity', 'Daily_Pressure',\
                  'Daily_WindDir','Daily_WindSpeed','Daily_DNI', 'Daily_DHI','Daily_radiation']]

In [37]:
# print the first 5 rows
final_df.head()

Unnamed: 0,MonthPE,Date,Daily_Temp,Daily_Precip,Daily_Humidity,Daily_Pressure,Daily_WindDir,Daily_WindSpeed,Daily_DNI,Daily_DHI,Daily_radiation
2018-12-31,12,2018-12-31,8.0375,10.7,95.25,1024.725,292.5,2.575,0.0,32.75,32.75
2019-01-01,1,2019-01-01,6.8625,12.2125,94.25,1020.6875,290.875,5.4375,9.75,36.125,37.0
2019-01-02,1,2019-01-02,4.188889,7.6,73.444444,1027.588889,345.111111,4.544444,8.333333,47.666667,48.555556
2019-01-03,1,2019-01-03,3.555556,8.977778,80.333333,1029.488889,342.888889,2.533333,3.333333,50.333333,50.555556
2019-01-04,1,2019-01-04,3.1875,9.025,83.25,1028.2375,314.625,4.05,2.75,52.5,53.25


In [38]:
# show the number of rows and column
final_df.shape

(1083, 11)

In [39]:
# save cleaned dataset
data_save_path = '../data/clean/'
data_save_name = 'cleaned_solar_irradiation.csv'

final_df.to_csv(data_save_path + data_save_name)