In [1]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_log_error, r2_score, make_scorer
from sklearn.model_selection import learning_curve
from scipy.stats import norm
from pmdarima import auto_arima
from statsmodels.graphics.tsaplots import plot_acf
from fbprophet import Prophet
from fbprophet.plot import add_changepoints_to_plot
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics

In [2]:
def get_data():
    train = pd.read_csv('training.csv', parse_dates=['transaction_date'])
    card_group = pd.read_csv('mcc_group_definition.csv')
    transaction = pd.read_csv('transaction_types.csv')
    
    train = train.drop(['dataset_transaction','dataset_user'], axis=1)
    # merge data
    train_final = train.merge(card_group, on='mcc_group', how='left').drop('mcc_group', axis=1)

    train_final = (train_final
          .merge(transaction, left_on='transaction_type', right_on='type', suffixes=('_card', '_transaction'))
          .drop(['type','transaction_type'], axis=1))

    train_final.amount_n26_currency = np.log1p(train_final.amount_n26_currency)
    # segregating out data
    in_data, out_data= train_final[(mask:=train_final['direction'] == "In")], train_final[~mask]
    out_data = out_data.drop(['direction','explanation_card','explanation_transaction','agent'], axis=1)
    out_data = out_data.sort_values(['user_id','transaction_date']).reset_index(drop=True)

    return out_data, train_final

In [3]:
# aggregate the last 2 functions

def plot_bar(feature,data):
    g = sns.catplot(x=feature, 
                    data=data,
                    order = data[feature].value_counts().index,
                    kind='count', 
                    height=7, 
                    aspect = 3, 
                    palette="Paired", 
                    edgecolor=".6")
    g.set_xticklabels(rotation=90)

def weekly_trend(feature):
    f = train_final.groupby([feature, pd.Grouper(key='transaction_date',freq='W')]).size().reset_index()
    f['transaction_date'] = f['transaction_date'].dt.date
    f.columns = [feature,'transaction_date','count']
    sns.catplot(x="transaction_date", 
            y = 'count',
            hue = feature, 
            data=f, 
            kind='point', 
            height=7, 
            aspect= 1,
            palette='Paired')
    _ = plt.xticks(rotation=90)
    
    
def currency_trend(feature):
    f = train_final.groupby([*feature, pd.Grouper(key='transaction_date',freq='W')])['amount_n26_currency'].mean().reset_index()
    f_std = train_final.groupby([*feature, pd.Grouper(key='transaction_date',freq='W')])['amount_n26_currency'].std().reset_index()
    f['transaction_date'] = f['transaction_date'].dt.date
    
    f_std['transaction_date'] = f_std['transaction_date'].dt.date
    f.columns = [*feature,'transaction_date','mean']
    f_std.columns = [*feature,'transaction_date','std']
    
    f_final = f.merge(f_std)
    hue = feature[0]
    
    # correlating features or maybe not! 
    if feature[1]:
        g = sns.catplot(x="transaction_date", 
                y = 'mean',
                hue = feature[0],
                col = feature[1],
                data=f, 
                kind='point', 
                height = 7,
                aspect = 1.5,
                palette=sns.color_palette("bright", 10)
                )
        g.set_xticklabels(rotation=90)
    else:
        g = sns.catplot(x="transaction_date", 
        y = 'mean',
        hue = feature[0],
        col = feature[1],
        data=f, 
        kind='point', 
        height = 7,
        aspect = 1.5,
        palette=sns.color_palette("bright", 10)
        )
        g.set_xticklabels(rotation=90)
        autocorrelation_plot(f['mean'])
#     x = np.arange(len(f['mean']))
#   plt.errorbar(x,f_final['mean'],yerr=f_final['std'])


## The data of primary interest which is the amount_n26_currecy is clearly postively sckwed

In [4]:
class feature_engineering_datetime(TransformerMixin, BaseEstimator):
    

    def fit(self, data, label=None):
        return self

    def transform(self, data, label = None):
        data[f"{'transaction'}_day_week"] = data['transaction_date'].dt.dayofweek
        data[f"{'transaction'}_week"] = data['transaction_date'].dt.isocalendar().week
        data[f"{'transaction'}_week_day"] = data['transaction_date'].dt.day
        self.names = data[[x for x in data.columns if '_week' in x]].columns.tolist()
       
        return data[(elem for elem in data.columns if '_week' in elem)].values
    def feature_names(self):
        return self.names

In [5]:
# X = train2.values
# history = [X[i] for i in range(10)]
# yhat = np.mean([history[i] for i in range(10)])


In [6]:
class feature_engineering(TransformerMixin, BaseEstimator):
    

    def fit(self, data, label=None):
        return self

    def transform(self, data, label = None):
        #lags= [1,5,7,14]
        lags = [7]
        for lag in lags:
            print('in timestamp of lags')
        # lag feature
        #
            data[f'prev_amount_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).fillna(0)
#             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).mean().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().mean().fillna(0)
#             # exponential moving average
            data[f'prev_amount_ewm_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).mean().fillna(0)

            data[f'prev_amount_rolling_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).std().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().std().fillna(0)
#             # exponential moving average
            data[f'prev_amount_ewm_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).std().fillna(0)
         
            ##
#             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).min().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().min().fillna(0)
#             # exponential moving average
#             data[f'prev_amount_ewm_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).min().fillna(0)
         
        
            # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_max_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).max().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_max_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().max().fillna(0)
   
        
        self.names = data[[x for x in data.columns if 'prev_' in x]].columns.tolist()

    
        return data[[x for x in data.columns if 'prev_' in x]].values
        

    def feature_names(self):
        return self.names
    

In [7]:
# Feature transformation pipeline
# It combines all sorts of features that are transformed and combine them
# to generate an output
def generate_pipeline(**kwargs):
    

    features = ColumnTransformer(
        [('date_time_features',feature_engineering_datetime(), ['transaction_date']),
         ('lag_features',feature_engineering(), ['user_id','amount_n26_currency'])
        ],  
        remainder='drop',
        sparse_threshold=0.3,
        n_jobs=None,
        transformer_weights=None
    )      
  
    feature_transformer = Pipeline([
        ('features', features),
    ]) 
    return feature_transformer

## we can deal with this by log transforming the series


In [8]:
# # log_amont = np.log(train2.amount_n26_currency)
# train2.amount_n26_currency = np.log(train2.amount_n26_currency)
# # test the skew
# # log_amont.skew() 
# plt.hist(train2.amount_n26_currency, bins=100,rwidth=0.8,density=True)
# # plt.show()
# rng = np.arange(train2.amount_n26_currency.min(), train2.amount_n26_currency.max(), 0.1)
# plt.plot(rng,norm.pdf(rng,train2.amount_n26_currency.mean(),train2.amount_n26_currency.std()))
# plt.axvline(train2.amount_n26_currency.mean(),ls=':',lw=2, color='b', label= 'mean = '+str(train2.amount_n26_currency.mean()))
# plt.xlabel('amount_n26_currency')
# plt.ylabel('')
# plt.legend()
# plt.show()

## The log transformation has clearly showed some progress in terms of the data 

In [9]:
## add code for outlier detection

# observing weekly trends


In [14]:
# aggregate the last 2 functions

def plot_bar(feature,data):
    g = sns.catplot(x=feature, 
                    data=data,
                    order = data[feature].value_counts().index,
                    kind='count', 
                    height=7, 
                    aspect = 3, 
                    palette="Paired", 
                    edgecolor=".6")
    g.set_xticklabels(rotation=90)

def weekly_popularity_trend(feature):
    """
    Feature popularity on weekly basis
    """
    f = train_final.groupby([feature, pd.Grouper(key='transaction_date',freq='W')]).size().reset_index()
    f['transaction_date'] = f['transaction_date'].dt.date
    f.columns = [feature,'transaction_date','count']
    sns.catplot(x="transaction_date", 
            y = 'count',
            hue = feature, 
            data=f, 
            kind='point', 
            height=7, 
            aspect= 1,
            palette='Paired')
    _ = plt.xticks(rotation=90)
    
    
def weekly_currency_trend(feature):
    """
    Mean transaction amount on a weekly basis for a given feature
    """
#    weekly spending trend to get an idea about 
    f = train_final.groupby([*feature, pd.Grouper(key='transaction_date',freq='W')])['amount_n26_currency'].mean().reset_index()
    f_std = train_final.groupby([*feature, pd.Grouper(key='transaction_date',freq='W')])['amount_n26_currency'].std().reset_index()
    f['transaction_date'] = f['transaction_date'].dt.date
    
    f_std['transaction_date'] = f_std['transaction_date'].dt.date
    f.columns = [*feature,'transaction_date','mean']
    f_std.columns = [*feature,'transaction_date','std']
    
    f_final = f.merge(f_std)
    hue = feature[0]
    
    # correlating features or maybe not! 
    if feature[1]:
        g = sns.catplot(x="transaction_date", 
                y = 'mean',
                hue = feature[0],
                col = feature[1],
                data=f, 
                kind='point', 
                height = 7,
                aspect = 1.5,
                palette=sns.color_palette("bright", 10)
                )
        g.set_xticklabels(rotation=90)
    else:
        g = sns.catplot(x="transaction_date", 
        y = 'mean',
        hue = feature[0],
        col = feature[1],
        data=f, 
        kind='point', 
        height = 7,
        aspect = 1.5,
        palette=sns.color_palette("bright", 10)
        )
        g.set_xticklabels(rotation=90)
        autocorrelation_plot(f['mean'])
#     x = np.arange(len(f['mean']))
#   plt.errorbar(x,f_final['mean'],yerr=f_final['std'])


# At the end of each month major income occures as should be

## Feature Engineering date

In [16]:
class feature_engineering_datetime(TransformerMixin, BaseEstimator):
    

    def fit(self, data, label=None):
        return self

    def transform(self, data, label = None):
        data[f"{'transaction'}_weekday"] = data['transaction_date'].dt.dayofweek
        data[f"{'transaction'}_week"] = data['transaction_date'].dt.isocalendar().week
        data[f"{'transaction'}_day_week"] = data['transaction_date'].dt.day
#         data[f"{'transaction'}_week_year"] = data['transaction_date'].dt.year
        self.names = data[[x for x in data.columns if 'prev_' in x]].columns.tolist()
       
        return data[(elem for elem in data.columns if '_week' in elem)].values
    def feature_names(self):
        return self.names

In [17]:
class feature_engineering(TransformerMixin, BaseEstimator):
    

    def fit(self, data, label=None):
        return self

    def transform(self, data, label = None):
        #lags= [1,5,7,14]
        lags = [7]
        for lag in lags:
            print('in timestamp of lags')
        # lag feature
        #A moving average is commonly used with time series data to smooth out short-term fluctuations and highlight longer-term trends or cycles.
            data[f'prev_amount_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).fillna(0)
#             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).mean().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().mean().fillna(0)
#             # exponential moving average
            data[f'prev_amount_ewm_mean_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).mean().fillna(0)
         
#             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).std().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().std().fillna(0)
#             # exponential moving average
            data[f'prev_amount_ewm_std_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).std().fillna(0)
         
            ##
#             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).min().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().min().fillna(0)
#             # exponential moving average
#             data[f'prev_amount_ewm_min_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).min().fillna(0)
         
        
        #             # rolling average
            # grouping by the user id and then calculating the mean. Makes sense
            data[f'prev_amount_rolling_max_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).rolling(2).max().fillna(0)
#             # cumulative average
            data[f'prev_amount_expand_max_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).expanding().max().fillna(0)
#             # exponential moving average
#             data[f'prev_amount_ewm_max_{lag}'] = data.groupby(['user_id'])['amount_n26_currency'].shift(lag).ewm(0.1).max().fillna(0)
         
        
        self.names = data[[x for x in data.columns if 'prev_' in x]].columns.tolist()
#         if label:
#             y = df[label]
#             return data[[x for x in data.columns if 'prev_' in x]].values, y.values
    
        return data[[x for x in data.columns if 'prev_' in x]].values
        

    def feature_names(self):
        return self.names
    

## pipeline creation for the model


In [18]:
# Feature transformation pipeline
# It combines all sorts of features that are transformed and combine them
# to generate an output
def generate_pipeline(**kwargs):
    

    features = ColumnTransformer(
        [('date_time_features',feature_engineering_datetime(), ['transaction_date']),
         ('lag_features',feature_engineering(), ['user_id','amount_n26_currency'])
        ],  
        remainder='drop',
        sparse_threshold=0.3,
        n_jobs=None,
        transformer_weights=None
    )

    
#     # can use any sklearn estimator
#     clf = XGBRegressor(n_estimators=100, random_state=42, booster='gbtree')
        
  
    feature_transformer = Pipeline([
        ('features', features),
        # ('clf', clf)
    ]) 
    return feature_transformer

In [19]:
def evaluation_metric(testy,predictions):
    
    # mean squared error
    mse = mean_squared_error(testy, predictions)
    print('mean squared error is: %f' %mse)
    # mean forecast error
    forecast_errors = [testy[i]-predictions[i] for i in range(len(testy))]
    bias = sum(forecast_errors) * 1.0/len(testy)
    print('Bias: %f' % bias)
    
    predictions_dataframe = pd.DataFrame({'y_true': np.exp(testy), 'y_pred': np.exp(predictions)})
    plt.figure(figsize=(18,15))
    plt.scatter(range(predictions_dataframe.shape[0]), predictions_dataframe['y_pred'], label='predicted', linestyle='dashed')
    plt.scatter(range(predictions_dataframe.shape[0]), predictions_dataframe['y_true'], label='True', linestyle='dashed',alpha=0.3)
    plt.ylabel('amount n26 currency')
    plt.legend()
    plt.savefig('expense_prediction.png')
    plt.show()

In [56]:
def arima_forecast(train):
    model_arima = sm.tsa.arima.ARIMA(train, order=(1,1,2))
    model_fit = model_arima.fit()
    yhat = model_fit.forecast()

    return yhat,model_arima

In [64]:
def single_model(out_data,month):
    train = out_data[out_data['transaction_date'].dt.month < month].reset_index(drop=True) # from feb to june
    test = out_data[out_data['transaction_date'].dt.month == month].reset_index(drop=True) # test july

    trainy = train['amount_n26_currency']
    testy = test['amount_n26_currency']
    
    history = [x for x in train['amount_n26_currency']]
    predictions,model_arima = arima_forecast(history)

    return predictions,testy

In [65]:
out_data, train_final = get_data()

In [66]:
predictions,testy = single_model(out_data,month=7)



In [68]:
predictions

array([2.66776352])

In [None]:
evaluation_metric(testy,predictions)

In [77]:
def arima_model(out_data):
   
    predictions = list()
    # train till june and test for july
    train = out_data[out_data['transaction_date'].dt.month < 7].reset_index(drop=True) # from feb to june
    test = out_data[out_data['transaction_date'].dt.month == 7].reset_index(drop=True) # test july
    history = [x for x in train['amount_n26_currency']]
    for t in range(len(test)):
        model = sm.tsa.arima.ARIMA(history, order=(1,1,2))
        model_fit = model.fit()
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        obs = test['amount_n26_currency'][t]
        history.append(obs)
        print('predicted=%f, expected=%f' % (yhat, obs))
        
    rmse = np.sqrt(mean_squared_error(test, predictions))
    print('Test RMSE: %.3f' % rmse)
    # plot forecasts against actual outcomes
    pyplot.plot(test)
    pyplot.plot(predictions, color='red')
    pyplot.show()
    return predictions
#     print(model_fit.summary())
#     residuals = pd.DataFrame(model_fit.resid)
#     plt.figure(figsize=(15,15))
#     fig, ax = plt.subplots(1,2)
#     residuals.plot(title="Residuals", ax=ax[0])
#     residuals.plot(kind='kde', title='Density', ax=ax[1])
#     forecast_arima = model_fit.forecast()
#     plt.show()
#     return forecast_arima



In [None]:
preds = arima_model(out_data)



predicted=2.667764, expected=2.302585




predicted=2.640285, expected=2.079442
predicted=2.589339, expected=2.772589




predicted=2.627002, expected=4.442651


In [17]:
def walk_forward_validation(out_data):
    predictions = list()
    
    # change to 8 for august!
    train = out_data[out_data['transaction_date'].dt.month <= 7].reset_index(drop=True) # from feb to june
    test = out_data[out_data['transaction_date'].dt.month == 7].reset_index(drop=True) # test july
    model = generate_pipeline()
    model.fit(train)
    # should the data be transformed after each loop?
    
    history = [x for x in train['amount_n26_currency']]
    trainY = np.log1p(train['amount_n26_currency'])
    testY = np.log1p(test['amount_n26_currency'])
    history = transformed_train
    for i in range(len(transformed_test)):
        model = sm.tsa.arima.ARIMA(history, order=(1,1,2))
        model_fit = model.fit()
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        obs = test['amount_n26_currency'][t]
        history.append(obs)
        print('predicted=%f, expected=%f' % (yhat, obs))
        
    rmse = np.sqrt(mean_squared_error(test, predictions))
    print('Test RMSE: %.3f' % rmse)
    # plot forecasts against actual outcomes
    pyplot.plot(test)
    pyplot.plot(predictions, color='red')
    pyplot.show()
    return predictions,testY
        
        
        
#         test, testy = transformed_test[i],testY[i]
#         test = test.reshape(1,len(test))
#         yhat = arima_forecast(history ,trainY,test)
#         predictions.append(yhat)
# #         history.append(test)
#         history = np.vstack( [ history , transformed_test[i]] )
#         trainY = trainY.append(pd.Series(testY[i]),ignore_index= True)
#         print(testy, yhat)
        
    return predictions,testy
        
    

In [18]:
walk_forward_validation(out_data)

in timestamp of lags
in timestamp of lags
in timestamp of lags


ValueError: SARIMAX models require univariate `endog`. Got shape (336421, 14).