# Data cleaning and feature engineering

In [78]:
# import libraries
import numpy as np
import pandas as pd

In [79]:
# read the dataframe
df = pd.read_csv('/home/josephitopa/Documents/personal_files/ufuoma/data/raw_data/5.866302_7.509482_Solcast_PT60M.csv')

In [80]:
# printout the column names
df.columns

Index(['PeriodEnd', 'PeriodStart', 'Period', 'AirTemp', 'Azimuth',
       'CloudOpacity', 'DewpointTemp', 'Dhi', 'Dni', 'Ghi',
       'PrecipitableWater', 'RelativeHumidity', 'SnowWater', 'SurfacePressure',
       'WindDirection10m', 'WindSpeed10m', 'Zenith'],
      dtype='object')

In [81]:
# select some features
df = df[['PeriodEnd', 'PeriodStart', 'AirTemp', 'CloudOpacity', 'Dhi', 'Dni', 'Ghi',
       'PrecipitableWater', 'RelativeHumidity', 'SurfacePressure',
       'WindDirection10m', 'WindSpeed10m']]

In [82]:
# randomly display 10 rows
df.sample(10)

Unnamed: 0,PeriodEnd,PeriodStart,AirTemp,CloudOpacity,Dhi,Dni,Ghi,PrecipitableWater,RelativeHumidity,SurfacePressure,WindDirection10m,WindSpeed10m
78142,2015-12-01T00:00:00Z,2015-11-30T23:00:00Z,24.6,20.9,0,0,0,49.1,94.7,996.7,216,2.4
78336,2015-12-09T02:00:00Z,2015-12-09T01:00:00Z,22.3,0.0,0,0,0,33.3,81.6,996.0,144,1.5
29217,2010-05-02T11:00:00Z,2010-05-02T10:00:00Z,27.9,6.8,319,566,862,60.0,85.4,995.6,235,1.8
96196,2017-12-22T06:00:00Z,2017-12-22T05:00:00Z,24.2,49.7,4,0,4,46.0,92.5,994.9,229,2.6
113588,2019-12-16T22:00:00Z,2019-12-16T21:00:00Z,25.0,0.0,0,0,0,28.5,36.7,995.1,210,1.6
10348,2008-03-07T06:00:00Z,2008-03-07T05:00:00Z,23.7,42.8,3,0,3,50.8,96.0,991.6,202,2.6
10863,2008-03-28T17:00:00Z,2008-03-28T16:00:00Z,27.4,40.4,104,0,104,51.4,76.9,989.7,218,2.2
15932,2008-10-25T22:00:00Z,2008-10-25T21:00:00Z,25.0,0.0,0,0,0,42.2,91.0,995.9,192,3.0
82320,2016-05-23T02:00:00Z,2016-05-23T01:00:00Z,23.5,96.4,0,0,0,62.0,97.4,998.6,256,2.4
34937,2010-12-26T19:00:00Z,2010-12-26T18:00:00Z,27.4,0.0,0,0,0,40.9,68.8,991.9,215,2.2


In [83]:
old_names = ['PeriodEnd', 'PeriodStart', 'AirTemp', 'CloudOpacity', 'Dhi', 'Dni', 'Ghi','PrecipitableWater', 'RelativeHumidity',
             'SurfacePressure', 'WindDirection10m', 'WindSpeed10m'] 
new_names = ['PeriodEnd','PeriodStart','Temperature', 'CloudOpacity', 'DHI', 'DNI', 'Radiation', 'Precipitation', 'Humidity', 
             'Pressure', 'WindDirection', 'WindSpeed']
df.rename(columns = dict(zip(old_names, new_names)), inplace = True)

In [84]:
# display dataframe
df.head()

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed
0,2007-01-01T02:00:00Z,2007-01-01T01:00:00Z,21.0,0.0,0,0,0,24.5,52.5,997.0,41,1.8
1,2007-01-01T03:00:00Z,2007-01-01T02:00:00Z,20.5,0.0,0,0,0,24.1,49.5,996.7,36,1.9
2,2007-01-01T04:00:00Z,2007-01-01T03:00:00Z,20.2,0.0,0,0,0,23.7,46.1,996.8,33,2.0
3,2007-01-01T05:00:00Z,2007-01-01T04:00:00Z,20.0,0.0,0,0,0,23.4,42.3,997.2,32,2.4
4,2007-01-01T06:00:00Z,2007-01-01T05:00:00Z,19.9,0.0,3,0,3,23.2,38.8,997.5,32,2.7


In [85]:
# show a summary of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133558 entries, 0 to 133557
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   PeriodEnd      133558 non-null  object 
 1   PeriodStart    133558 non-null  object 
 2   Temperature    133558 non-null  float64
 3   CloudOpacity   133558 non-null  float64
 4   DHI            133558 non-null  int64  
 5   DNI            133558 non-null  int64  
 6   Radiation      133558 non-null  int64  
 7   Precipitation  133558 non-null  float64
 8   Humidity       133558 non-null  float64
 9   Pressure       133558 non-null  float64
 10  WindDirection  133558 non-null  int64  
 11  WindSpeed      133558 non-null  float64
dtypes: float64(6), int64(4), object(2)
memory usage: 12.2+ MB


In [86]:
 # interpret columns as appropriate data types to ensure compatibility
df['Radiation']     = df['Radiation'].astype(float)
df['Temperature']   = df['Temperature'].astype(float) # or int
df['Pressure']      = df['Pressure'].astype(float)
df['CloudOpacity']      = df['CloudOpacity'].astype(float)
df['Humidity']      = df['Humidity'].astype(int) # or int
df['WindDirection'] = df['WindDirection'].astype(float)
df['WindSpeed']     = df['WindSpeed'].astype(float)
df['PeriodStart'] = pd.to_datetime(df['PeriodStart']).dt.to_period('T').dt.to_timestamp()
df['PeriodEnd'] = pd.to_datetime(df['PeriodEnd']).dt.to_period('T').dt.to_timestamp()



In [87]:
# display dataframe summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133558 entries, 0 to 133557
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   PeriodEnd      133558 non-null  datetime64[ns]
 1   PeriodStart    133558 non-null  datetime64[ns]
 2   Temperature    133558 non-null  float64       
 3   CloudOpacity   133558 non-null  float64       
 4   DHI            133558 non-null  int64         
 5   DNI            133558 non-null  int64         
 6   Radiation      133558 non-null  float64       
 7   Precipitation  133558 non-null  float64       
 8   Humidity       133558 non-null  int64         
 9   Pressure       133558 non-null  float64       
 10  WindDirection  133558 non-null  float64       
 11  WindSpeed      133558 non-null  float64       
dtypes: datetime64[ns](2), float64(7), int64(3)
memory usage: 12.2 MB


In [88]:
# display top 6 rows
df.head()

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed
0,2007-01-01 02:00:00,2007-01-01 01:00:00,21.0,0.0,0,0,0.0,24.5,52,997.0,41.0,1.8
1,2007-01-01 03:00:00,2007-01-01 02:00:00,20.5,0.0,0,0,0.0,24.1,49,996.7,36.0,1.9
2,2007-01-01 04:00:00,2007-01-01 03:00:00,20.2,0.0,0,0,0.0,23.7,46,996.8,33.0,2.0
3,2007-01-01 05:00:00,2007-01-01 04:00:00,20.0,0.0,0,0,0.0,23.4,42,997.2,32.0,2.4
4,2007-01-01 06:00:00,2007-01-01 05:00:00,19.9,0.0,3,0,3.0,23.2,38,997.5,32.0,2.7


In [89]:
# Create new features(year, month, day) from start and end time of data collection  
df['YearPS'] = pd.to_datetime(df['PeriodStart']).dt.year
df['YearPE'] = pd.to_datetime(df['PeriodEnd']).dt.year

df['MonthPS'] = pd.to_datetime(df['PeriodStart']).dt.month
df['MonthPE'] = pd.to_datetime(df['PeriodEnd']).dt.month

df['HourPS'] = pd.to_datetime(df['PeriodStart']).dt.hour
df['HourPE'] = pd.to_datetime(df['PeriodEnd']).dt.hour

In [90]:
df.head(5)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE
0,2007-01-01 02:00:00,2007-01-01 01:00:00,21.0,0.0,0,0,0.0,24.5,52,997.0,41.0,1.8,2007,2007,1,1,1,2
1,2007-01-01 03:00:00,2007-01-01 02:00:00,20.5,0.0,0,0,0.0,24.1,49,996.7,36.0,1.9,2007,2007,1,1,2,3
2,2007-01-01 04:00:00,2007-01-01 03:00:00,20.2,0.0,0,0,0.0,23.7,46,996.8,33.0,2.0,2007,2007,1,1,3,4
3,2007-01-01 05:00:00,2007-01-01 04:00:00,20.0,0.0,0,0,0.0,23.4,42,997.2,32.0,2.4,2007,2007,1,1,4,5
4,2007-01-01 06:00:00,2007-01-01 05:00:00,19.9,0.0,3,0,3.0,23.2,38,997.5,32.0,2.7,2007,2007,1,1,5,6


In [91]:
# create a new feature for sunrise using the function below; fill time for sunrise
def time_to_sunrise(row):
    """
    A function for identifying time of sunrise.
    """
    value= 0
    if row['HourPS'] == 4 and row['HourPE'] == 5 and row['Radiation'] > 0:
        value= '05:00:00'
    elif row['HourPS'] == 5 and row['HourPE'] == 6 and row['Radiation'] > 0:
        value= '06:00:00'
    return value

In [92]:
#Create a new column using the function above
df['Sunrise'] = df.apply(time_to_sunrise, axis = 1)
df.head(5)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise
0,2007-01-01 02:00:00,2007-01-01 01:00:00,21.0,0.0,0,0,0.0,24.5,52,997.0,41.0,1.8,2007,2007,1,1,1,2,0
1,2007-01-01 03:00:00,2007-01-01 02:00:00,20.5,0.0,0,0,0.0,24.1,49,996.7,36.0,1.9,2007,2007,1,1,2,3,0
2,2007-01-01 04:00:00,2007-01-01 03:00:00,20.2,0.0,0,0,0.0,23.7,46,996.8,33.0,2.0,2007,2007,1,1,3,4,0
3,2007-01-01 05:00:00,2007-01-01 04:00:00,20.0,0.0,0,0,0.0,23.4,42,997.2,32.0,2.4,2007,2007,1,1,4,5,0
4,2007-01-01 06:00:00,2007-01-01 05:00:00,19.9,0.0,3,0,3.0,23.2,38,997.5,32.0,2.7,2007,2007,1,1,5,6,06:00:00


In [93]:
df.Sunrise.unique()

array([0, '06:00:00'], dtype=object)

In [94]:
# create a new feature for sunset using the function below; fill time for sunset
def time_to_sunset(row):
    """
    A function for identifying time of sunset.
    """
    value= 0
    if row['HourPS'] == 17 and row['HourPE'] == 18 and row['Radiation']> 0:
        value= '18:00:00'
    elif row['HourPS'] == 18 and row['HourPE']== 19 and row['Radiation']> 0:
        value= '19:00:00'
    return value

In [95]:
#Create a new column using the function above
df['Sunset'] = df.apply(time_to_sunset, axis = 1)

In [96]:
df.Sunset.unique()

array([0, '18:00:00'], dtype=object)

In [97]:
# show summary of our dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133558 entries, 0 to 133557
Data columns (total 20 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   PeriodEnd      133558 non-null  datetime64[ns]
 1   PeriodStart    133558 non-null  datetime64[ns]
 2   Temperature    133558 non-null  float64       
 3   CloudOpacity   133558 non-null  float64       
 4   DHI            133558 non-null  int64         
 5   DNI            133558 non-null  int64         
 6   Radiation      133558 non-null  float64       
 7   Precipitation  133558 non-null  float64       
 8   Humidity       133558 non-null  int64         
 9   Pressure       133558 non-null  float64       
 10  WindDirection  133558 non-null  float64       
 11  WindSpeed      133558 non-null  float64       
 12  YearPS         133558 non-null  int64         
 13  YearPE         133558 non-null  int64         
 14  MonthPS        133558 non-null  int64         
 15  

In [98]:
# changing integer values to datetime format
df_new = df.loc[~((df['Radiation'] == 0) & (df['DNI'] == 0) & (df['DHI'] == 0)),:]

In [99]:
# show summary of our dataframe
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72037 entries, 4 to 133552
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      72037 non-null  datetime64[ns]
 1   PeriodStart    72037 non-null  datetime64[ns]
 2   Temperature    72037 non-null  float64       
 3   CloudOpacity   72037 non-null  float64       
 4   DHI            72037 non-null  int64         
 5   DNI            72037 non-null  int64         
 6   Radiation      72037 non-null  float64       
 7   Precipitation  72037 non-null  float64       
 8   Humidity       72037 non-null  int64         
 9   Pressure       72037 non-null  float64       
 10  WindDirection  72037 non-null  float64       
 11  WindSpeed      72037 non-null  float64       
 12  YearPS         72037 non-null  int64         
 13  YearPE         72037 non-null  int64         
 14  MonthPS        72037 non-null  int64         
 15  MonthPE        720

In [100]:
# display 10 random samples of the dataframe
df_new.sample(10)

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise,Sunset
111849,2019-10-05 11:00:00,2019-10-05 10:00:00,25.0,9.2,337,554,872.0,54.5,89,997.4,197.0,1.4,2019,2019,10,10,10,11,0,0
97804,2018-02-27 06:00:00,2018-02-27 05:00:00,23.4,84.5,1,0,1.0,46.6,92,995.2,188.0,2.3,2018,2018,2,2,5,6,06:00:00,0
112354,2019-10-26 12:00:00,2019-10-26 11:00:00,28.0,34.5,557,74,627.0,58.0,78,996.7,32.0,2.0,2019,2019,10,10,11,12,0,0
29624,2010-05-19 10:00:00,2010-05-19 09:00:00,27.4,4.1,237,638,778.0,56.5,82,993.5,201.0,2.8,2010,2010,5,5,9,10,0,0
40613,2011-08-20 07:00:00,2011-08-20 06:00:00,22.6,9.5,98,215,166.0,50.8,97,996.5,256.0,1.1,2011,2011,8,8,6,7,0,0
44795,2012-02-10 13:00:00,2012-02-10 12:00:00,28.6,30.5,493,0,493.0,49.5,76,992.6,204.0,0.4,2012,2012,2,2,12,13,0,0
103493,2018-10-22 07:00:00,2018-10-22 06:00:00,24.9,22.1,156,1,156.0,52.4,94,992.9,222.0,1.8,2018,2018,10,10,6,7,0,0
9925,2008-02-18 15:00:00,2008-02-18 14:00:00,36.6,0.0,382,191,518.0,15.2,9,990.6,28.0,2.1,2008,2008,2,2,14,15,0,0
126614,2021-06-11 16:00:00,2021-06-11 15:00:00,27.6,50.5,199,0,199.0,53.5,74,993.6,228.0,2.5,2021,2021,6,6,15,16,0,0
6223,2007-09-17 09:00:00,2007-09-17 08:00:00,24.2,33.0,429,39,456.0,53.5,93,996.1,202.0,1.7,2007,2007,9,9,8,9,0,0


In [101]:
# convert datetime column of endperiod to just date
df_new['Date'] = pd.to_datetime(df_new['PeriodEnd']).dt.normalize()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['Date'] = pd.to_datetime(df_new['PeriodEnd']).dt.normalize()


In [102]:
# convert the column (it's a string) to datetime type
dt_series = pd.to_datetime(df_new['Date'])

# create datetime index passing the datetime series
dt_index = pd.DatetimeIndex(dt_series.values)

# 
df_new1 = df_new.set_index(dt_index)

In [103]:
# display first 5 rows
df_new1.head()

Unnamed: 0,PeriodEnd,PeriodStart,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,...,WindSpeed,YearPS,YearPE,MonthPS,MonthPE,HourPS,HourPE,Sunrise,Sunset,Date
2007-01-01,2007-01-01 06:00:00,2007-01-01 05:00:00,19.9,0.0,3,0,3.0,23.2,38,997.5,...,2.7,2007,2007,1,1,5,6,06:00:00,0,2007-01-01
2007-01-01,2007-01-01 07:00:00,2007-01-01 06:00:00,20.8,0.0,67,0,67.0,23.1,35,997.9,...,3.0,2007,2007,1,1,6,7,0,0,2007-01-01
2007-01-01,2007-01-01 08:00:00,2007-01-01 07:00:00,22.6,0.0,193,25,205.0,23.3,32,998.2,...,3.3,2007,2007,1,1,7,8,0,0,2007-01-01
2007-01-01,2007-01-01 09:00:00,2007-01-01 08:00:00,24.5,0.0,320,114,390.0,23.4,29,998.5,...,3.7,2007,2007,1,1,8,9,0,0,2007-01-01
2007-01-01,2007-01-01 10:00:00,2007-01-01 09:00:00,26.5,0.0,396,253,586.0,23.8,25,998.3,...,3.7,2007,2007,1,1,9,10,0,0,2007-01-01


In [104]:
# 
df_new1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 72037 entries, 2007-01-01 to 2022-03-27
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PeriodEnd      72037 non-null  datetime64[ns]
 1   PeriodStart    72037 non-null  datetime64[ns]
 2   Temperature    72037 non-null  float64       
 3   CloudOpacity   72037 non-null  float64       
 4   DHI            72037 non-null  int64         
 5   DNI            72037 non-null  int64         
 6   Radiation      72037 non-null  float64       
 7   Precipitation  72037 non-null  float64       
 8   Humidity       72037 non-null  int64         
 9   Pressure       72037 non-null  float64       
 10  WindDirection  72037 non-null  float64       
 11  WindSpeed      72037 non-null  float64       
 12  YearPS         72037 non-null  int64         
 13  YearPE         72037 non-null  int64         
 14  MonthPS        72037 non-null  int64         
 15  Mo

In [106]:
df_new1['Date'] = pd.to_datetime(df_new1['PeriodEnd']).dt.date

for col in ['Sunrise', 'Sunset']:
    df_new1[col] = df_new1[col].astype(str)
    
agg_dict = {
    'Temperature':'mean',
    'CloudOpacity':'mean',
    'DHI':'mean',
    'DNI':'mean',
    'Radiation':'mean',
    'Precipitation':'mean',
    'Humidity':'mean',
    'Pressure':'mean',
    'WindDirection':'mean',
    'WindSpeed':'mean',
    'Sunrise':'max',
    'Sunset':'max',
}

df_grouped = df_new1.groupby(['Date']).agg(agg_dict)
df_grouped.reset_index(inplace = True)
df_grouped

Unnamed: 0,Date,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,Sunrise,Sunset
0,2007-01-01,27.607692,0.969231,264.384615,130.307692,363.076923,24.838462,24.923077,996.446154,43.538462,2.384615,06:00:00,18:00:00
1,2007-01-02,27.753846,0.000000,264.153846,131.923077,363.076923,22.800000,24.384615,996.369231,84.615385,2.323077,06:00:00,18:00:00
2,2007-01-03,27.330769,0.000000,238.923077,228.615385,404.769231,21.838462,27.461538,995.376923,85.230769,1.592308,06:00:00,18:00:00
3,2007-01-04,27.946154,0.107692,242.769231,229.461538,406.384615,22.338462,37.769231,994.684615,73.923077,1.715385,06:00:00,18:00:00
4,2007-01-05,28.692308,0.000000,211.692308,311.692308,435.461538,21.953846,28.769231,994.038462,77.538462,2.161538,06:00:00,18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5560,2022-03-23,31.761538,45.961538,304.307692,8.153846,312.076923,47.423077,56.307692,994.484615,213.307692,2.507692,06:00:00,18:00:00
5561,2022-03-24,31.892308,23.030769,318.615385,110.153846,408.000000,49.730769,55.230769,993.707692,208.076923,1.961538,06:00:00,18:00:00
5562,2022-03-25,33.161538,49.815385,243.230769,0.000000,243.230769,46.707692,50.846154,993.076923,170.307692,1.638462,06:00:00,18:00:00
5563,2022-03-26,32.907692,12.561538,323.769231,131.153846,424.153846,43.646154,47.846154,993.323077,203.384615,2.123077,06:00:00,18:00:00


In [111]:
df_grouped['Month'] = pd.to_datetime(df_grouped['Date']).dt.month

In [112]:
"""
# sample daily average for the solar and weather features
df_new1['Daily_radiation'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Radiation'].mean()
df_new1['Daily_DNI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DNI'].mean()
df_new1['Daily_DHI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DHI'].mean()
df_new1['Daily_Temp'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Temperature'].mean()
df_new1['Daily_Precip'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Precipitation'].mean()
df_new1['Daily_Humidity'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Humidity'].mean()
df_new1['Daily_Pressure'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Pressure'].mean()
df_new1['Daily_WindDir'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['WindDirection'].mean()
df_new1['Daily_WindSpeed'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['WindSpeed'].mean()

df_grouped['Sunrise'] = df_grouped['Sunrise'].astype(int) 
df_grouped['Sunset'] = df_grouped['Sunset'].astype(int) 
"""

"\n# sample daily average for the solar and weather features\ndf_new1['Daily_radiation'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Radiation'].mean()\ndf_new1['Daily_DNI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DNI'].mean()\ndf_new1['Daily_DHI'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['DHI'].mean()\ndf_new1['Daily_Temp'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Temperature'].mean()\ndf_new1['Daily_Precip'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Precipitation'].mean()\ndf_new1['Daily_Humidity'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Humidity'].mean()\ndf_new1['Daily_Pressure'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['Pressure'].mean()\ndf_new1['Daily_WindDir'] = df_new1.reset_index().groupby(pd.Grouper(key='Date', freq='1D'))['WindDirection'].mean()\ndf_new1['Daily_WindSpeed'] = df_new1.reset_ind

In [113]:
# display a summary of our dataframe
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5565 entries, 0 to 5564
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5565 non-null   object 
 1   Temperature    5565 non-null   float64
 2   CloudOpacity   5565 non-null   float64
 3   DHI            5565 non-null   float64
 4   DNI            5565 non-null   float64
 5   Radiation      5565 non-null   float64
 6   Precipitation  5565 non-null   float64
 7   Humidity       5565 non-null   float64
 8   Pressure       5565 non-null   float64
 9   WindDirection  5565 non-null   float64
 10  WindSpeed      5565 non-null   float64
 11  Sunrise        5565 non-null   object 
 12  Sunset         5565 non-null   object 
 13  Month          5565 non-null   int64  
dtypes: float64(10), int64(1), object(3)
memory usage: 608.8+ KB


In [115]:
# drop irrelevant features
new_df = df_grouped.drop(['Sunrise', 'Sunset'], axis = 1)

In [116]:
# display first 10 rows
new_df.head(10)

Unnamed: 0,Date,Temperature,CloudOpacity,DHI,DNI,Radiation,Precipitation,Humidity,Pressure,WindDirection,WindSpeed,Month
0,2007-01-01,27.607692,0.969231,264.384615,130.307692,363.076923,24.838462,24.923077,996.446154,43.538462,2.384615,1
1,2007-01-02,27.753846,0.0,264.153846,131.923077,363.076923,22.8,24.384615,996.369231,84.615385,2.323077,1
2,2007-01-03,27.330769,0.0,238.923077,228.615385,404.769231,21.838462,27.461538,995.376923,85.230769,1.592308,1
3,2007-01-04,27.946154,0.107692,242.769231,229.461538,406.384615,22.338462,37.769231,994.684615,73.923077,1.715385,1
4,2007-01-05,28.692308,0.0,211.692308,311.692308,435.461538,21.953846,28.769231,994.038462,77.538462,2.161538,1
5,2007-01-06,28.707692,0.046154,234.384615,245.384615,410.769231,20.915385,32.0,994.561538,25.384615,1.946154,1
6,2007-01-07,28.653846,0.269231,276.230769,89.615385,346.769231,21.107692,27.461538,994.046154,68.538462,1.753846,1
7,2007-01-08,28.623077,0.261538,263.923077,147.846154,372.692308,22.692308,34.846154,993.415385,19.692308,2.315385,1
8,2007-01-09,28.984615,0.0,275.538462,113.846154,359.846154,21.184615,24.769231,993.223077,33.615385,1.715385,1
9,2007-01-10,29.061538,0.069231,239.0,246.769231,418.0,18.130769,24.230769,993.784615,39.923077,2.192308,1


In [117]:
# show a summary of the dataframe
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5565 entries, 0 to 5564
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           5565 non-null   object 
 1   Temperature    5565 non-null   float64
 2   CloudOpacity   5565 non-null   float64
 3   DHI            5565 non-null   float64
 4   DNI            5565 non-null   float64
 5   Radiation      5565 non-null   float64
 6   Precipitation  5565 non-null   float64
 7   Humidity       5565 non-null   float64
 8   Pressure       5565 non-null   float64
 9   WindDirection  5565 non-null   float64
 10  WindSpeed      5565 non-null   float64
 11  Month          5565 non-null   int64  
dtypes: float64(10), int64(1), object(1)
memory usage: 521.8+ KB


In [118]:
# dropping ALL duplicate values exceept the last value
new_df = new_df[~new_df.Date.duplicated(keep = 'last')]

In [119]:
# show the number of rows and columns available
new_df.shape

(5565, 12)

In [121]:
# list out the column names
new_df.columns

Index(['Date', 'Temperature', 'CloudOpacity', 'DHI', 'DNI', 'Radiation',
       'Precipitation', 'Humidity', 'Pressure', 'WindDirection', 'WindSpeed',
       'Month'],
      dtype='object')

In [123]:
# save cleaned dataset
new_df.to_csv('/home/josephitopa/Documents/personal_files/ufuoma/data/processed/processed_solar_data.csv', index = False)
new_df.to_csv('/home/josephitopa/Documents/personal_files/ufuoma/data/cleaned_data/cleaned_solar_data.csv', index = False)