In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw measurements, then hold out a reproducible 20% of the rows
# (the fixed random_state makes the split identical on every run).
# NOTE(review): this split is row-wise and happens before the pipeline further
# down aggregates rows (its output is keyed by Day/Month/Hour), so rows from the
# same hour presumably end up in both splits — confirm this is intended.
raw_csv_path = 'data/SolarPrediction.csv'
data = pd.read_csv(raw_csv_path)
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Preview the first rows of the raw (not yet preprocessed) training split.
train_data.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
4152,1473879005,9/14/2016 12:00:00 AM,08:50:05,680.04,57,30.45,68,26.7,4.5,06:10:00,18:26:00
13047,1476293121,10/12/2016 12:00:00 AM,07:25:21,277.37,50,30.47,96,144.96,10.12,06:16:00,18:02:00
7420,1477993220,10/31/2016 12:00:00 AM,23:40:20,1.29,47,30.48,56,119.52,3.37,06:23:00,17:49:00
6508,1473013505,9/4/2016 12:00:00 AM,08:25:05,544.75,57,30.47,93,38.61,2.25,06:08:00,18:35:00
29110,1481885434,12/16/2016 12:00:00 AM,00:50:34,1.22,41,30.23,103,177.55,2.25,06:50:00,17:46:00


# Creating pipeline for data preprocessing

The preprocessing classes are implemented in the `preprocessing.py` file.

In [4]:
from preprocessing import DateTimeTransformer, GroupByAggregator

# Build the preprocessing pipeline from two project-local transformers, run in
# this order. Step names are kept as-is because they are part of the pipeline's
# public parameter names.
# NOTE(review): judging by the processed frame displayed later in the notebook,
# the first step derives Day/Month/Hour fields and the second aggregates the
# measurements per group — confirm against preprocessing.py.
datetime_step = ('datetime_transformer', DateTimeTransformer())
aggregation_step = ('aggregator', GroupByAggregator())

pipeline = Pipeline(steps=[datetime_step, aggregation_step])

In [5]:
# Fit the preprocessing pipeline on the training split and transform it in one
# call; the result replaces the raw training frame under the same name.
# NOTE(review): because `train_data` is rebound to the pipeline output, this
# cell is not idempotent — re-running it on its own feeds already-processed
# data back into the pipeline. Re-run the load/split cell first.
train_data = pipeline.fit_transform(train_data)

In [6]:
# Display the processed training frame (pandas elides the middle rows/columns).
train_data

Unnamed: 0,Day,Month,Hour,Temperature_mean,Temperature_min,Temperature_max,Temperature_std,Pressure_mean,Pressure_min,Pressure_max,...,WindDirection_max,WindDirection_std,Speed_mean,Speed_min,Speed_max,Speed_std,Radiation_mean,SunDurationMinutes_mean,TimeSunRise_first,TimeSunSet_first
0,1,9,0,51.142857,51,52,0.377964,30.430000,30.43,30.43,...,156.58,43.444683,8.998571,1.12,18.00,5.396701,2.307143,751.0,6.116667,18.633333
1,1,9,1,51.666667,51,52,0.500000,30.416667,30.41,30.43,...,270.48,72.718703,5.371111,3.37,10.12,2.303565,2.922222,751.0,6.116667,18.633333
2,1,9,2,51.000000,51,51,0.000000,30.404000,30.40,30.41,...,166.75,22.371790,9.222000,2.25,14.62,4.267494,2.722000,751.0,6.116667,18.633333
3,1,9,3,50.900000,50,51,0.316228,30.400000,30.40,30.40,...,168.14,30.789741,5.173000,1.12,9.00,2.499351,2.367000,751.0,6.116667,18.633333
4,1,9,4,49.090909,49,50,0.301511,30.407273,30.40,30.41,...,150.52,25.455849,8.588182,2.25,16.87,3.772908,2.601818,751.0,6.116667,18.633333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,31,12,19,46.111111,46,47,0.333333,30.327778,30.32,30.33,...,299.28,9.141183,6.372222,4.50,7.87,1.124586,1.218889,657.0,6.950000,17.900000
2770,31,12,20,44.400000,43,46,1.173788,30.337000,30.33,30.34,...,270.00,25.939323,5.173000,2.25,9.00,2.499351,1.219000,657.0,6.950000,17.900000
2771,31,12,21,42.000000,41,43,0.707107,30.344444,30.34,30.35,...,224.93,6.325224,7.747778,5.62,9.00,1.429710,1.225556,657.0,6.950000,17.900000
2772,31,12,22,40.875000,40,41,0.353553,30.346250,30.34,30.35,...,226.35,7.124417,8.576250,6.75,10.12,1.030547,1.207500,657.0,6.950000,17.900000


In [7]:
# Apply the pipeline to the held-out split with transform() only, so the
# pipeline fitted on the training data is reused rather than refitted.
# NOTE(review): `test_data` is rebound to the transformed frame, so re-running
# this cell on its own would transform already-processed data.
test_data = pipeline.transform(test_data)

In [9]:
# Persist both processed splits as CSV; index=False leaves the row index out
# of the written files.
for processed_frame, csv_path in (
    (train_data, 'data/train_data_processed.csv'),
    (test_data, 'data/test_data_processed.csv'),
):
    processed_frame.to_csv(csv_path, index=False)