In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Read the raw solar measurements, then hold out 20% of the rows as a test
# set; random_state pins the split so it is reproducible across runs.
# NOTE(review): the split is row-level and happens before the per-hour
# aggregation visible in the processed output, so rows from the same
# Day/Month/Hour group can presumably end up in both splits — confirm this
# is intended.
data = pd.read_csv(filepath_or_buffer='../data/raw_data/SolarPrediction.csv')
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Preview the first rows of the raw (unprocessed) training split.
train_data.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
4152,1473879005,9/14/2016 12:00:00 AM,08:50:05,680.04,57,30.45,68,26.7,4.5,06:10:00,18:26:00
13047,1476293121,10/12/2016 12:00:00 AM,07:25:21,277.37,50,30.47,96,144.96,10.12,06:16:00,18:02:00
7420,1477993220,10/31/2016 12:00:00 AM,23:40:20,1.29,47,30.48,56,119.52,3.37,06:23:00,17:49:00
6508,1473013505,9/4/2016 12:00:00 AM,08:25:05,544.75,57,30.47,93,38.61,2.25,06:08:00,18:35:00
29110,1481885434,12/16/2016 12:00:00 AM,00:50:34,1.22,41,30.23,103,177.55,2.25,06:50:00,17:46:00


# Creating pipeline for data preprocessing

The preprocessing classes are implemented in the `preprocessing.py` file.

In [4]:
from preprocessing import (
    DateTimeTransformer,
    FilterByDaylight,
    GroupByAggregator,
)

# Chain the project's custom transformers into one scikit-learn Pipeline.
# Steps run in the order listed; the step names are the keys used to look a
# step up on the pipeline later.
# NOTE(review): judging by the class names and the processed output, the steps
# presumably derive date/time features, aggregate per hour, and keep daylight
# hours only — see preprocessing.py to confirm.
pipeline = Pipeline(
    steps=[
        ('datetime_transformer', DateTimeTransformer()),
        ('aggregator', GroupByAggregator()),
        ('daylight_filter', FilterByDaylight()),
    ],
)

In [5]:
# Fit the preprocessing pipeline on the training split and replace train_data
# with the transformed frame.
# NOTE(review): train_data is overwritten with its own processed version, so
# this cell is not re-runnable on its own — re-run the train/test split cell
# first, otherwise already-processed data is fed back into the pipeline.
train_data = pipeline.fit_transform(train_data)

In [6]:
# Display the processed training frame (the notebook shows a truncated view).
train_data

Unnamed: 0,Day,Month,Hour,Temperature_mean,Temperature_min,Temperature_max,Temperature_std,Pressure_mean,Pressure_min,Pressure_max,...,WindDirection_max,WindDirection_std,Speed_mean,Speed_min,Speed_max,Speed_std,Radiation_mean,SunDurationMinutes_mean,TimeSunRise_first,TimeSunSet_first
7,1,9,7,50.400000,50,51,0.547723,30.432000,30.42,30.44,...,153.19,49.428734,5.398000,2.25,9.00,2.686740,114.644000,751.0,6.116667,18.633333
8,1,9,8,52.800000,51,54,1.303840,30.450000,30.45,30.45,...,204.79,74.958827,6.748000,3.37,13.50,4.134195,412.658000,751.0,6.116667,18.633333
9,1,9,9,54.875000,54,57,1.125992,30.461250,30.46,30.47,...,181.52,42.221196,7.871250,3.37,12.37,3.558015,743.030000,751.0,6.116667,18.633333
10,1,9,10,57.625000,57,58,0.517549,30.470000,30.47,30.47,...,335.52,95.179420,7.170000,4.50,13.50,3.121058,928.140000,751.0,6.116667,18.633333
11,1,9,11,59.333333,59,60,0.516398,30.461667,30.46,30.47,...,110.80,32.799143,9.558333,3.37,16.87,4.486292,1019.690000,751.0,6.116667,18.633333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2763,31,12,13,53.000000,53,53,0.000000,30.296000,30.29,30.30,...,350.10,14.196042,9.560000,5.62,14.62,3.147768,314.482000,657.0,6.950000,17.900000
2764,31,12,14,52.625000,52,53,0.517549,30.285000,30.28,30.29,...,337.33,10.100144,13.920000,11.25,18.00,2.325541,327.210000,657.0,6.950000,17.900000
2765,31,12,15,53.125000,52,54,0.640870,30.282500,30.28,30.29,...,332.81,10.896043,13.075000,10.12,14.62,1.465898,396.330000,657.0,6.950000,17.900000
2766,31,12,16,52.000000,50,54,1.309307,30.292500,30.28,30.30,...,339.17,9.100989,9.137500,5.62,12.37,1.942596,150.055000,657.0,6.950000,17.900000


In [7]:
# Run the held-out split through the pipeline with transform() only, so no
# step is re-fitted on test data.
# NOTE(review): test_data is overwritten with its processed version; re-run
# the train/test split cell before executing this cell a second time.
test_data = pipeline.transform(test_data)

In [10]:
# Persist the processed splits for the downstream notebooks/scripts.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a no-op when it is already there).
processed_dir = Path('../data/processed_data')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the CSV files.
train_data.to_csv(processed_dir / 'train_data_processed.csv', index=False)
test_data.to_csv(processed_dir / 'test_data_processed.csv', index=False)