In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.preprocessing import MinMaxScaler
import pickle
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.ndimage import gaussian_filter1d

In [2]:
# Run the companion notebook so that the names it defines become available here.
# NOTE(review): `normalization`, `calc_kurtosis` and `calc_skewness` are used by
# `preprocessing` below but are not defined or imported in this notebook, so they
# are presumably provided by utils.ipynb -- confirm.
%run utils.ipynb

In [3]:
# Ride-count columns handled by the log and smoothing steps.
_RIDE_COLUMNS = ['rides_canceled', 'rides_accepted', 'rides_rejections', 'rides_requested', 'rides_completed']

# Ride-count columns that the ratio step divides by 'rides_requested'.
_RATIO_COLUMNS = ['rides_canceled', 'rides_accepted', 'rides_rejections', 'rides_completed']

# Lag (in rows) used for the seasonal difference of each column when `seasonality` is on.
# The same columns are skipped by the plain differencing step in that case.
_SEASONAL_LAGS = {
    'avg_current_error_ride_distance': 90,
    'avg_speed_max_speed': 90,
    'avg_speed_kmh': 90,
    'avg_remaining_distance_covered': 90,
    'avg_surge_multiplier': 300,
    'moving_drivers': 900,
}

# Columns that the plain rolling-mean aggregation leaves untouched.
_NOT_AGGREGATED = ['avg_ratio_cust_driv', 'avg_dynamic_greediness']


def _rolling_statistics(df, window):
    """Return rolling mean of every column plus median/std/q25/q75/kurtosis/skewness columns."""
    base = df.astype(float).round(5)
    # The mean replaces the original column; the extra statistics are appended after
    # all original columns, grouped per source column.
    pieces = [base.rolling(window = window).mean()]
    for column in base.columns:
        rolled = base[column].rolling(window = window)
        rolled_4 = base[column].round(4).rolling(window = window)
        pieces.append(rolled.median().rename(column + '_median'))
        pieces.append(rolled.std().rename(column + '_std'))
        pieces.append(rolled.quantile(0.25).rename(column + '_q25'))
        pieces.append(rolled.quantile(0.75).rename(column + '_q75'))
        # calc_kurtosis / calc_skewness are not defined in this cell; they must already
        # exist in the notebook namespace.
        pieces.append(rolled_4.apply(calc_kurtosis, raw = True).rename(column + '_kurtosis'))
        pieces.append(rolled_4.apply(calc_skewness, raw = True).rename(column + '_skewness'))
    # Single concat instead of one concat per column.
    return pd.concat(pieces, axis = 1)


def preprocessing(df,
                  columns_to_remove = None,
                  ratio = False,
                  smoothing = False,
                  collinearity = False, 
                  log = False,
                  aggregation = 0,
                  statistics = False,
                  differencing = 0, 
                  seasonality = False, 
                  normalize = True,
                  load_scaler = False,
                  save = True,
                  continual = False,
                  warmup_rows = 5400,
                  cooldown_rows = 1800):
    """Apply the configured preprocessing steps to a time-series DataFrame.

    The input frame is never modified; all steps run on a private copy.

    Steps, in order (each one optional):
      1. drop `columns_to_remove` (names missing from `df` are ignored);
      2. `collinearity`: save a correlation heatmap to 'features_corr_matrix.png'
         and drop every feature whose VIF is greater than 10;
      3. `ratio`: divide the ride-count columns by 'rides_requested'
         (result is 0 where 'rides_requested' is 0);
      4. `log`: natural log of every non-zero ride-count value (zeros are kept);
      5. `seasonality`: seasonal differencing with a fixed lag per column;
      6. `aggregation` (window size, 0 disables): rolling mean, or, when
         `statistics` is true, rolling mean plus median/std/q25/q75/kurtosis/skewness;
      7. `differencing` (lag, 0 disables): difference every column, except the
         seasonally differenced ones when `seasonality` is on;
      8. `smoothing`: Gaussian filter (sigma = 2) on the ride-count columns;
      9. trim the start and the end of the series (`warmup_rows`, `cooldown_rows`);
     10. `normalize`: scale with a MinMaxScaler that is either fitted and pickled,
         or loaded from disk when `load_scaler` is true;
     11. `save`: write the result as CSV.

    Rows containing NaN are dropped (and the index reset) between the steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw observations, one row per time step.
    columns_to_remove : list of str, optional
        Columns to drop before any other step.
    ratio, smoothing, collinearity, log, statistics, seasonality, normalize : bool
        Switches for the steps described above.
    aggregation : int
        Rolling window size; 0 disables aggregation.
    differencing : int
        Differencing lag; 0 disables differencing.
    load_scaler : bool
        Load a previously pickled scaler instead of fitting a new one.
    save : bool
        Write the processed frame to 'datasets/proc/' (or 'datasets/proc/continual/').
    continual : bool
        Use the 'continual' variants of the scaler and dataset paths.
    warmup_rows : int
        Number of rows, counted on the ORIGINAL frame, discarded at the start.
        Rows already lost to differencing / rolling windows count towards it.
    cooldown_rows : int
        Number of rows discarded at the end.
        NOTE: `warmup_rows` / `cooldown_rows` are not part of the generated file
        names, so non-default values reuse the same scaler / dataset files.

    Returns
    -------
    (DataFrame, scaler) when `normalize` is true, otherwise the DataFrame alone.
    The returned frame keeps the positional index left by the trimming step.

    Raises
    ------
    KeyError
        If `ratio` is requested but 'rides_requested' is not available.
    """
    if columns_to_remove is None:
        columns_to_remove = []
    LENGTH = len(df)

    # Processed dataset path: encodes every option that shapes the output
    options = (len(columns_to_remove), ratio, smoothing, collinearity, log,
               aggregation, statistics, differencing, seasonality, normalize)
    df_path = 'df_' + '_'.join(str(option) for option in options)

    # Work on a private copy so the caller's frame is never mutated by the
    # column assignments below
    df = df.copy()

    # Remove columns not used in training
    df = df.drop(columns = [column for column in columns_to_remove if column in df.columns])

    # Analyze collinearity to remove useless features
    if collinearity:
        correlation_matrix = df.corr()
        plt.figure(figsize = (12, 10))
        sns.heatmap(correlation_matrix, annot = True, fmt = ".2f", cmap = 'coolwarm', square = True, linewidths = .5, cbar_kws = {"shrink": .5})
        plt.savefig('features_corr_matrix.png', dpi = 300)
        plt.close()
        design = df.values
        vif_data = pd.DataFrame()
        vif_data["Feature"] = df.columns
        vif_data["VIF"] = [variance_inflation_factor(design, i) for i in range(design.shape[1])]
        # Remove features with VIF > 10 from df
        features_to_remove = vif_data[vif_data['VIF'] > 10]['Feature']
        df = df.drop(columns = features_to_remove)

    # Compute ratio between rides and requested (vectorized, independent of the index labels)
    if ratio:
        requested = df['rides_requested']
        has_requests = requested != 0
        for column in _RATIO_COLUMNS:
            if column in df.columns:
                # Division by zero yields inf/NaN, which is then replaced by 0
                df[column] = (df[column].astype(float) / requested).where(has_requests, 0.0)

    # Compute log transformation (zeros are left as they are)
    if log:
        for column in _RIDE_COLUMNS:
            if column in df.columns:
                values = df[column].to_numpy(dtype = float, copy = True)
                nonzero = values != 0
                values[nonzero] = np.log(values[nonzero])
                df[column] = values

    df = df.dropna().reset_index(drop = True)

    # Pay attention to seasonalities
    if seasonality:
        for column, lag in _SEASONAL_LAGS.items():
            if column in df.columns:
                df[column] = df[column].diff(periods = lag)

    df = df.dropna().reset_index(drop = True)

    # Aggregate consecutive observations (useful for smoothing, reducing outliers and noise)
    if aggregation != 0:
        if statistics:
            # Rolling mean plus additional statistics for each feature
            df = _rolling_statistics(df, aggregation)
        else:
            for column in df.columns:
                if column not in _NOT_AGGREGATED:
                    df[column] = np.around(df[column].astype(float), 5).rolling(window = aggregation).mean()
        # The rolling window is undefined at the start; the first `aggregation` rows are dropped
        df = df.iloc[aggregation:].reset_index(drop = True)

    df = df.dropna().reset_index(drop = True)

    # Differentiate the time series (seasonally differenced columns are skipped)
    if differencing != 0:
        skipped = set(_SEASONAL_LAGS) if seasonality else set()
        for column in df.columns:
            if column not in skipped:
                df[column] = df[column].diff(periods = differencing)

    df = df.dropna().reset_index(drop = True)

    # Smooth noisy columns
    if smoothing and not df.empty:
        for column in _RIDE_COLUMNS:
            if column in df.columns:
                # Cast to float first: the filter returns the dtype of its input, so
                # integer counts would otherwise come back truncated to integers
                df[column] = gaussian_filter1d(df[column].to_numpy(dtype = float), sigma = 2)

    df = df.dropna().reset_index(drop = True)

    # Removing stabilization time and final time.
    # Both bounds are clamped at 0 so that a negative value is never interpreted
    # as "count from the end" by the slice.
    removed = LENGTH - len(df)
    start = max(warmup_rows - removed, 0)
    stop = max(len(df) - cooldown_rows, 0)
    df = df.iloc[start:stop]

    # Normalize the values of each time series between 0 and 1
    if normalize:
        scaler_path = 'scalers/' + ('continual_' if continual else '') + df_path + '.pkl'
        if load_scaler:
            # NOTE: pickle.load can execute arbitrary code; only load scaler files
            # written by this pipeline.
            with open(scaler_path, 'rb') as file:
                scaler = pickle.load(file)
            df = normalization(scaler, df)
        else:
            scaler = MinMaxScaler()
            scaler.fit(df.values)
            df = normalization(scaler, df)
            with open(scaler_path, 'wb') as file:
                pickle.dump(scaler, file)

    # Save specific dataset and return
    if save:
        dataset_dir = 'datasets/proc/continual/' if continual else 'datasets/proc/'
        df.to_csv(dataset_dir + df_path + '.csv', index = False, header = True)
    if normalize:
        return df, scaler
    else:
        return df