In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from pandas import Series
from matplotlib import pyplot

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.stattools import acf
from statsmodels.tsa.stattools import pacf

from statsmodels.tsa.stattools import adfuller
#from pandas.tools.plotting import autocorrelation_plot
from pandas.plotting import scatter_matrix
from statsmodels.tsa.seasonal import seasonal_decompose

In [3]:
# PATH WHERE THE FILES ARE SAVED
# (alternative location used previously: r'/dbfs/FileStore/tables/')
path = './'

# IMPORT OF DATABASE
ddf = pd.read_excel(path + r'/2_cleaned_data/clean_ddf.xlsx')

# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# .sav files that were written by this project's own notebooks.
# Every file is opened in a `with` block so its handle is closed right after loading.

# IMPORT OF VARIABLES FROM PRE-PROCESSING NOTEBOOK
preprocess1 = (path + r'/3_variables/var1_pre_processing.sav')
with open(preprocess1, 'rb') as sav_file:
    num_nan, inv_values = pickle.load(sav_file)

preprocess2 = (path + r'/3_variables/var2_pre_processing.sav')
with open(preprocess2, 'rb') as sav_file:
    outlier_df = pickle.load(sav_file)

# IMPORT OF DATA FROM FRAMEWORK CONFIGURATION
frameworkconfig1 = (path + r'/3_variables/var1_framework_config.sav')
with open(frameworkconfig1, 'rb') as sav_file:
    ddf_config_var = pickle.load(sav_file)

# var2_framework_config.sav (ddf_config_kpi) is intentionally not loaded in this notebook.

frameworkconfig3 = (path + r'/3_variables/var3_framework_config.sav')
with open(frameworkconfig3, 'rb') as sav_file:
    ddf_config_par = pickle.load(sav_file)

In [4]:
# Rows of the variable configuration are selected with boolean masks so that the
# lookups do not depend on ddf_config_var having a default 0..n-1 index.
variable_usage = ddf_config_var["VariableUsage"]
variable_type = ddf_config_var["VariableType"]

# PRIMARY KEY (first variable flagged as PRIMARY; IndexError if there is none)
primary_key = ddf_config_var.loc[variable_usage == 'PRIMARY', 'VariableName'].iloc[0]

# FILTER BY COLUMN
filter_by = ddf_config_par.loc[0,'Value']

# FILTER BY VALUE
filter_value = ddf_config_par.loc[1,'Value']

# A filter is applied only when a filter column is configured. Missing values
# (NaN / None) and the literal text 'nan' all mean "no filter".
if pd.notna(filter_by) and str(filter_by) != 'nan':
    # PRIMARY KEY LIST WITH FILTER
    pk_list = ddf.loc[ddf[filter_by] == filter_value, primary_key].unique()
else:
    # PRIMARY KEY LIST
    pk_list = ddf[primary_key].unique()

# DATETIME VARIABLE (first variable typed as DATETIME)
date_column = ddf_config_var.loc[variable_type == 'DATETIME', 'VariableName'].iloc[0]

# TARGET VARIABLE (first variable flagged as TARGET)
target = ddf_config_var.loc[variable_usage == 'TARGET', 'VariableName'].iloc[0]

# EXOGENOUS VARIABLES (all variables flagged as EXOGENOUS, possibly none)
exogenous = list(ddf_config_var.loc[variable_usage == 'EXOGENOUS', 'VariableName'])

In [5]:
# Variable names grouped by their configured type. Boolean masks are used so the
# selection does not depend on ddf_config_var having a default 0..n-1 index.
variable_type = ddf_config_var["VariableType"]
time_variables = list(ddf_config_var.loc[variable_type == 'DATETIME', 'VariableName'])
numeric_variables = list(ddf_config_var.loc[variable_type == 'NUMERIC', 'VariableName'])
categorical_variables = list(ddf_config_var.loc[variable_type == 'CATEGORICAL', 'VariableName'])

# VARIABLES FORMAT
# datetime columns -> object, categorical columns -> category, numeric columns -> float
ddf[time_variables] = ddf[time_variables].astype('object')
for col in categorical_variables:
    # cast one column at a time so each column gets its own category dtype
    ddf[col] = ddf[col].astype('category')
ddf[numeric_variables] = ddf[numeric_variables].astype('float')

In [None]:
# DESCRIPTIVE STATISTICS (describe() summary), TRANSPOSED SO THAT EACH DESCRIBED COLUMN IS A ROW
data_stats_post = ddf.describe().T
data_stats_post

In [None]:
# MEAN, STANDARD DEVIATION, MINIMUM, MAXIMUM AND COEFFICIENT OF VARIATION DISAGGREGATED BY PRIMARY KEY
# Only numeric columns are aggregated: 'mean'/'std' are undefined for object and
# category columns, and recent pandas versions no longer silently drop columns
# for which an aggregation fails. The grouping key itself is excluded.
stats_columns = [
    column for column in ddf.select_dtypes(include='number').columns
    if column != primary_key
]
data_stats_by = ddf.groupby(primary_key)[stats_columns].agg([
    'mean',
    'std',
    'min',
    'max',
    # coefficient of variation in percent; reported as infinity when the group mean is zero
    lambda x: round(x.std()/abs(x.mean())*100,2) if abs(x.mean())!=0 else float('Inf'),
])
data_stats_by

In [None]:
# TABLE WITH PERCENTAGE OF MISSING VALUES, INVALID VALUES AND OUTLIERS
# All percentages are relative to the number of rows in ddf.
n_rows = len(ddf)

# PERCENTAGE OF MISSING VALUES
data_cleaning = pd.DataFrame(round(num_nan / n_rows * 100, 2))
data_cleaning.columns = ['%nan']

# PERCENTAGE OF INVALID VALUES
inv_values['invalid_values'] = inv_values['invalid_values'].astype('float')

for variable in numeric_variables:
    # first entry recorded for this variable (IndexError if there is none)
    is_variable = inv_values['numeric_variable'] == variable
    invalid_count = inv_values['invalid_values'][is_variable].values[0]
    data_cleaning.loc[variable, 'invalid_values'] = round(invalid_count / n_rows * 100, 2)

inv_perc = pd.DataFrame(data_cleaning['invalid_values'])

# PERCENTAGE OF OUTLIERS
for variable in numeric_variables:
    # outlier entries are looked up by 'outliers_' + the first three characters
    # of the variable name -- presumably the naming used when outlier_df was
    # built; verify against the pre-processing notebook
    outlier_label = 'outliers_' + variable[:3]
    is_label = outlier_df['outlier_name'] == outlier_label
    outlier_count = outlier_df['outlier_number'][is_label].values[0]
    data_cleaning.loc[variable, 'outliers'] = round(outlier_count / n_rows * 100, 2)

outlier_perc = pd.DataFrame(data_cleaning['outliers'])
data_cleaning

In [9]:
# Primary-key value analysed below: the first distinct key, in order of appearance in ddf
distinct_keys = ddf[primary_key].unique()
pk_value = distinct_keys[0]

In [10]:
# TIME SERIES OF TARGET OF PRIMARY KEY VALUE
# Rows belonging to the selected primary-key value
pk_rows = ddf[primary_key] == pk_value

# Each list holds a single element: the target Series and the datetime column(s) for that key
target_p1 = [ddf[target][pk_rows]]
date_p1 = [ddf[time_variables][pk_rows]]

In [None]:
# PLOT OF TIME SERIES
# Target values for the selected primary-key value; only the Series is passed
# to plt.plot, so the x-axis is the Series index.
pk_rows = ddf[primary_key] == pk_value
target_series = ddf[target][pk_rows]
plot_title = 'Serie de Tiempo de ' + target + ' de la Primary Key ' + primary_key + ': ' + str(pk_value)

plt.plot(target_series)
plt.title(plot_title)
plt.ylabel(target)
plt.xlabel('Time')
display()

In [None]:
# PLOT OF AUTOCORRELATION FUNCTION
# Target series of the selected primary-key value, lags 0..7, alpha=0.05 for the confidence band
acf_input = ddf[target][ddf[primary_key] == pk_value]
plot_acf(acf_input, lags=7, alpha=0.05)
display()

In [None]:
# PLOT OF PARTIAL AUTOCORRELATION FUNCTION
# Target series of the selected primary-key value; partial autocorrelations
# are estimated with method='ols'
pacf_input = ddf[target][ddf[primary_key] == pk_value]
plot_pacf(pacf_input, method='ols')
display()

In [None]:
# AUGMENTED DICKEY-FULLER TEST (STATIONARITY TEST: IF THE NULL HYPOTHESIS IS REJECTED, THEN THE SERIES IS STATIONARY)
pk_target_values = ddf[target][ddf[primary_key] == pk_value].values
ADF = adfuller(pk_target_values)

# Result fields used here: [0] test statistic, [1] p-value, [4] critical values keyed by level
adf_statistic, p_value, critical_values = ADF[0], ADF[1], ADF[4]

print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % p_value)
print('Critical Values:')
for level, threshold in critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

In [None]:
# SEASONAL DECOMPOSITION (ADDITIVE MODEL, SEASONAL PERIOD OF 6 OBSERVATIONS)
# Decomposes the target series of the selected primary-key value and plots the
# resulting components. This is a visual seasonality check, not a Breusch-Pagan
# test (which tests for heteroskedasticity).
pk_target_values = tuple(ddf[ddf[primary_key]==pk_value][target])
try:
    # current statsmodels releases name the seasonal length `period`
    seasonal_test = seasonal_decompose(pk_target_values, model='additive', period=6)
except TypeError:
    # older statsmodels releases only accept the former keyword `freq`
    seasonal_test = seasonal_decompose(pk_target_values, model='additive', freq=6)
seasonal_test.plot()
display()