In [1]:
import time
import warnings
from math import sqrt

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session. A narrower filter that was
# tried instead: warnings.filterwarnings(action="ignore", message="Mean of empty slice")
warnings.simplefilter("ignore")

# Plot styling and pandas display / behaviour options.
plt.style.use("ggplot")
pd.options.display.max_rows = 30
pd.set_option("mode.chained_assignment", None)  # disables the chained-assignment (SettingWithCopy) check

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# Make the project's local modules importable. The relative entry is resolved
# against the kernel's working directory. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

from data_preprocessing import split, label_encode

In [5]:
# Load the training features and targets. 'reportts' is parsed as a timestamp
# on read and is exposed under the column name 'datetime' from here on.
X_train = pd.read_csv(
    '../data/X_train.csv', parse_dates=['reportts']
).rename(columns={"reportts": "datetime"})
y_train = pd.read_csv(
    '../data/y_train.csv', parse_dates=['reportts']
).rename(columns={"reportts": "datetime"})

# Test features are currently not loaded:
# X_test = pd.read_csv('../data/X_test.csv', parse_dates=['reportts']).rename(columns={"reportts": "datetime"})

# Inner-join features with targets on the 'acnum', 'pos' and 'datetime' keys,
# then keep only the rows whose 'egtm' value is present.
dataset = pd.merge(X_train, y_train, on=['acnum', 'pos', 'datetime']).dropna(subset=['egtm'])

# Copy of the dataset ordered by 'datetime' with a fresh 0..n-1 index.
# NOTE(review): the pipeline cells visible in this notebook are fed `dataset`,
# not this sorted frame — confirm whether `split` expects time-ordered rows.
dataset_time_sorted = dataset.sort_values('datetime', ignore_index=True)

In [6]:
pipeline_BGU = Pipeline([
    ('drop_nan_columns', FunctionTransformer(lambda X: X.dropna(axis=1, how='all'))),
#     ('custom_feature_engineering', FunctionTransformer(custom_feature_engineering)),
    ('filter_data', FunctionTransformer(lambda X: X[X['acnum'] == 'VQ-BGU'].reset_index(drop=True))),
    ('label_encode', FunctionTransformer(label_encode)),
    ('train_test_split', FunctionTransformer(split))
])


In [7]:
pipeline_BDU = Pipeline([
    ('drop_nan_columns', FunctionTransformer(lambda X: X.dropna(axis=1, how='all'))),
#     ('custom_feature_engineering', FunctionTransformer(custom_feature_engineering)),
    ('filter_data', FunctionTransformer(lambda X: X[X['acnum'] == 'VQ-BDU'].reset_index(drop=True))),
    ('label_encode', FunctionTransformer(label_encode)),
    ('train_test_split', FunctionTransformer(split))
])

In [8]:
# Run the VQ-BGU pipeline on the merged dataset. Its last step wraps `split`,
# whose result is unpacked into four objects here (presumably the
# train/validation features and targets, as the names suggest).
(
    X_train_BGU,
    X_val_BGU,
    y_train_BGU,
    y_val_BGU,
) = pipeline_BGU.fit_transform(dataset)

# Disabled sketch: fill remaining NaNs after the train/validation split.
# It is not runnable as written: SimpleImputer is not imported in the cells
# shown here, and it targets X_train / X_test rather than the per-aircraft frames.
# imputer = SimpleImputer(strategy='constant', fill_value=0)
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)


In [9]:
# Run the VQ-BDU pipeline on the same merged dataset and unpack the four
# objects produced by its final `split` step.
(
    X_train_BDU,
    X_val_BDU,
    y_train_BDU,
    y_val_BDU,
) = pipeline_BDU.fit_transform(dataset)

In [11]:
# Save X_train and X_val to CSV files
X_train_BGU.to_csv('../tmp_data/X_train_BGU.csv', index=False)
X_val_BGU.to_csv('../tmp_data/X_val_BGU.csv', index=False)

# Save y_train and y_val to CSV files
y_train_BGU.to_csv('../tmp_data/y_train_BGU.csv', index=False, header=True)
y_val_BGU.to_csv('../tmp_data/y_val_BGU.csv', index=False, header=True)