In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.base.column_name import RentDataCN
from src.base.regression_model_base import RegressionModelBase
from src.repository.rent_data_loader import RentDataLoader
from src.repository.weather_data_loader import WeatherDataLoader
from src.transform.sampling.random_sampling import RandomSampling
from src.transform.transformer.column_renamer import ColumnRenamer
from src.transform.transformer.data_concater import DataConcater
from src.transform.transformer.datetime_to_category import DatetimeToCategory
from src.transform.transformer.location_column_extender import LocationColumnExtender
from src.transform.transformer.weather_column_extender import WeatherColumnExtender
from src.transform.transformer.simple_datetime_aggregator import SimpleDatetimeAggregator
from src.transform.transformer.string_to_datetime_converter import StringToDatetimeConverter
from src.transform.transformer.weather_data_preprocessor import WeatherDataPreprocessor

In [2]:
data_loader = RentDataLoader()
weather_data_loader = WeatherDataLoader()

weather_pipline = Pipeline([
    ('data_concatenate', DataConcater(data_category='weather')),
    ('renamer', ColumnRenamer()),
    ('str2datetime', StringToDatetimeConverter(data_category='weather')),
    ('preprocessor', WeatherDataPreprocessor())
])

weather_data = weather_pipline.fit_transform(weather_data_loader.all_data)

pipline = Pipeline([
    ('data_concatenate', DataConcater()),
    ('renamer', ColumnRenamer()),
    ('str2datatime', StringToDatetimeConverter()),
    ('location_extender', LocationColumnExtender(year="2021", only_rent_location=True)),
    ('aggregate', SimpleDatetimeAggregator()),
    ('weather_extender', WeatherColumnExtender(preprocessed_data=weather_data)),
    ('datetime2category', DatetimeToCategory()),
])

In [3]:
cat_attribs = ['month', 'hour', 'weekday']

processed_data = pipline.fit_transform(data_loader.all_data)

# print(self.__processed_data.columns)

for column in cat_attribs:
    processed_data = pd.concat([processed_data, pd.get_dummies(processed_data[column])], axis=1)

processed_data.drop(cat_attribs, axis=1, inplace=True)
processed_data.columns = processed_data.columns.astype(str)

In [14]:
processed_data

Unnamed: 0,rent_station,rent_count,temperature,precipitation,sunshine_duration,day,1,2,3,4,...,21,22,23,0,1.1,2.1,3.1,4.1,5,6
0,46.0,1,-3.6,0.0,0.0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,152.0,1,-3.6,0.0,0.0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,133.0,2,-3.6,0.0,0.0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,39.0,1,-3.6,0.0,0.0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,92.0,2,-3.6,0.0,0.0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2259985,124.0,1,25.1,0.0,0.1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2259986,11.0,1,25.1,0.0,0.1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2259987,145.0,1,24.4,0.0,0.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2259988,168.0,1,23.8,0.0,0.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
