In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import statistics

# Native libraries
import os
import math


import warnings
warnings.filterwarnings('ignore')

In [2]:
def column_processing(_dataframe):
    _dataframe = _dataframe.drop(['Unnamed: 0','first','last','gender','lat','long',
                                  'city_pop','job','dob','unix_time','merch_lat','merch_long',
                                  'street','city','state','zip','trans_num','merchant'                      
                                  ],axis= True)
    
    _dataframe['trans_date_trans_time'] = pd.to_datetime(_dataframe['trans_date_trans_time'])

    _dataframe['day_of_week'] = _dataframe['trans_date_trans_time'].dt.dayofweek
    _dataframe['week_of_year'] = _dataframe['trans_date_trans_time'].dt.isocalendar()['week']
    _dataframe['hour_of_day'] = _dataframe['trans_date_trans_time'].dt.hour
    _dataframe['month_of_year'] = _dataframe['trans_date_trans_time'].dt.month
    _dataframe['quarter_of_year'] = _dataframe['trans_date_trans_time'].dt.quarter
    
    # Create a new feature to label transactions as weekday (0) or weekend (1)
    _dataframe['is_weekend'] = (_dataframe['day_of_week'] >= 5).astype(int)



    return _dataframe

def create_temporal_columns(_dataframe_o):
    # Convert 'Date' column to datetime format
    _dataframe_o['trans_date_trans_time'] = pd.to_datetime(_dataframe_o['trans_date_trans_time'])

    # Sort the DataFrame by 'Date'
    _dataframe_o = _dataframe_o.sort_values(by=['trans_date_trans_time'])

    cc_num_vector = _dataframe_o['cc_num'].unique()
    
    df_return = pd.DataFrame()

    for cc in cc_num_vector:

        cc_mask = _dataframe_o['cc_num'].isin([cc])
        _dataframe = _dataframe_o[cc_mask].reset_index(drop=True)


        _col_names = ['time between transactions','time between transactions categories']
        
        for c in _col_names:
            _dataframe[c] = _dataframe['trans_date_trans_time'] - _dataframe['trans_date_trans_time'].shift(1)
            _dataframe[c] = _dataframe[c].dt.total_seconds()
            _dataframe[c] = _dataframe[c].fillna(0)

        for month in _dataframe['month_of_year'].unique():
            for day in _dataframe[_dataframe['month_of_year'] == month]['day_of_week'].unique():
                n = _dataframe[(_dataframe['day_of_week'] == day) & (_dataframe['month_of_year'] == month) ].index[0]
                _dataframe['time between transactions'].loc[n] = 0

        _dataframe = _dataframe.sort_values(by=['category','trans_date_trans_time'])
        _dataframe = _dataframe.reset_index(drop= True)

        _dataframe['daily_TBT_cat'] = 0 
        _dataframe['weekly_TBT_cat'] = 0
        _dataframe['monthly_TBT_cat'] = 0

        for cat in _dataframe['category'].unique():
            _cat_index = _dataframe[_dataframe['category'] == cat].index
            n = _dataframe[_dataframe['category'] == cat].index[0]

            _dataframe['time between transactions categories'].loc[n] = 0

            _dataframe['daily_TBT_cat'].loc[_cat_index] = _dataframe[_dataframe['category'] == cat].groupby(['cc_num','month_of_year', 'day_of_week'])['time between transactions categories'].transform(lambda x: x.expanding().std())
            _dataframe['weekly_TBT_cat'].loc[_cat_index] = _dataframe[_dataframe['category'] == cat].groupby(['cc_num','week_of_year'])['time between transactions categories'].transform(lambda x: x.expanding().std())
            _dataframe['monthly_TBT_cat'].loc[_cat_index] = _dataframe[_dataframe['category'] == cat].groupby(['cc_num','month_of_year'])['time between transactions categories'].transform(lambda x: x.expanding().std())

            _dataframe['daily_TBT_cat'].loc[n] = 0
            _dataframe['weekly_TBT_cat'].loc[n] = 0
            _dataframe['monthly_TBT_cat'].loc[n] = 0


        _dataframe['log time between transactions categories'] = np.log( _dataframe['time between transactions categories'])
        _dataframe['log time between transactions'] = np.log( _dataframe['time between transactions'])
        _dataframe = _dataframe.fillna(0)
        _dataframe = _dataframe.replace(-np.inf,0)
        _dataframe = _dataframe.replace(np.inf,0)

        df_return = pd.concat([df_return, _dataframe])

        

    return df_return

def normalized_amt(_dataframe):

    _dataframe = _dataframe.reset_index(drop = True)

    _dataframe['normalized_amt']=(_dataframe['amt']-_dataframe['amt'].mean())/_dataframe['amt'].std()

    for cc in _dataframe['cc_num'].unique():
        for month in _dataframe[_dataframe['cc_num'] == cc]['month_of_year'].unique():
            for cat in _dataframe[(_dataframe['cc_num'] == cc) & (_dataframe['month_of_year'] == month)]['category'].unique():

                m = _dataframe[(_dataframe['category'] == cat)  & (_dataframe['cc_num'] == cc) & (_dataframe['is_fraud'] == False) & (_dataframe['month_of_year'] == month)]['amt'].mean()
                s = _dataframe[(_dataframe['category'] == cat)  & (_dataframe['cc_num'] == cc) & (_dataframe['is_fraud'] == False) & (_dataframe['month_of_year'] == month)]['amt'].std()
                x = _dataframe[(_dataframe['category'] == cat)  & (_dataframe['cc_num'] == cc) & (_dataframe['is_fraud'] == False) & (_dataframe['month_of_year'] == month)]['amt']

                _dataframe[(_dataframe['category'] == cat) & (_dataframe['cc_num'] == cc) & (_dataframe['month_of_year'] == month)]['normalized_amt_cat'] = (x - m)/s

    return _dataframe

def added_amt_columns(_dataframe):
        
    df_return = pd.DataFrame()
    
    for cc in _dataframe['cc_num'].unique():
        
        df_p = _dataframe[_dataframe['cc_num'] == cc].sort_values(by=['cc_num', 'trans_date_trans_time']).reset_index(drop=True)

        # Cumulative sums
        df_p['month added amt'] = df_p.groupby(['cc_num', 'month_of_year'])['amt'].cumsum()
        df_p['weekly added amt'] = df_p.groupby(['cc_num', 'month_of_year', 'week_of_year'])['amt'].cumsum()
        
        # Rolling standard deviations
        df_p['monthly_std_amt'] = df_p.groupby(['cc_num', 'month_of_year'])['amt'].transform(lambda x: x.expanding().std())
        df_p['weekly_std_amt'] = df_p.groupby(['cc_num', 'week_of_year'])['amt'].transform(lambda x: x.expanding().std())

        # Create columns for each category adding by
        categories = df_p['category'].unique()
        for cat in categories:
            df_p[f'monthly_acc_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num', 'month_of_year'])['amt'].transform(lambda x: x.cumsum().mask(x.duplicated(keep='last')))
            df_p[f'weekly_acc_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num', 'week_of_year'])['amt'].transform(lambda x: x.cumsum().mask(x.duplicated(keep='last')))
            df_p[f'daily_acc_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num','month_of_year', 'day_of_week'])['amt'].transform(lambda x: x.cumsum().mask(x.duplicated(keep='last')))

            df_p[f'monthly_std_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num', 'month_of_year'])['amt'].transform(lambda x: x.expanding().std())
            df_p[f'weekly_std_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num', 'week_of_year'])['amt'].transform(lambda x: x.expanding().std())
            df_p[f'daily_std_amt_{cat}'] = df_p[df_p['category'] == cat].groupby(['cc_num','month_of_year', 'day_of_week'])['amt'].transform(lambda x: x.expanding().std())


            # Apply forward fill to handle NaN values
            columns_to_fill = [f'monthly_acc_amt_{cat}',f'weekly_acc_amt_{cat}',f'daily_acc_amt_{cat}',f'monthly_std_amt_{cat}',f'weekly_std_amt_{cat}',f'daily_std_amt_{cat}']

            df_p[columns_to_fill] = df_p[columns_to_fill].ffill()

        df_p = df_p.fillna(0)

        df_return = pd.concat([df_return, df_p])
    
    return df_return


    
    

In [263]:
df=pd.read_csv("Credit_Card/fraudTest.csv")

In [306]:
# Define the initial vector of cc_num values
cc_num_vector = [30270432095985]#,630423337322,6538441737335434]

# Create a boolean mask to filter rows for the specified cc_num values
mask = df['cc_num'].isin(cc_num_vector)

# Create a separate DataFrame for the specified cc_num values
df_dummy = df[mask].reset_index(drop=True)

In [307]:
df_dummy = column_processing(df_dummy)

In [308]:
df_dummy_fraud = df_dummy[df_dummy['is_fraud'] == True]

In [309]:
first_fraud_month = df_dummy_fraud['month_of_year'].min()

In [310]:
df_dummy_post_fraud_transactions = df_dummy[df_dummy['month_of_year'] >= first_fraud_month]
df_dummy = df_dummy[df_dummy['month_of_year'] < first_fraud_month]

In [311]:
df_dummy = create_temporal_columns(df_dummy)
df_dummy_post_fraud_transactions = create_temporal_columns(df_dummy_post_fraud_transactions)

In [315]:
def other_added_cols(df_dummy):
    df_dummy['cat_add_amt'] = 0
    df_dummy['cat_std_amt'] = 0
    df_y = pd.DataFrame()

    for month in df_dummy['month_of_year'].unique():
        for cat in df_dummy[df_dummy['month_of_year'] == month]['category'].unique():

            df_x = df_dummy[(df_dummy['category'] == cat) & (df_dummy['month_of_year'] == month) ]
            df_x = df_x.sort_values(by=['trans_date_trans_time'])
            
            df_x['cat_std_amt'] = df_x['amt'].transform(lambda x: x.expanding().std())
            df_x['cat_add_amt'] = df_x['amt'].cumsum()

            df_y = pd.concat([df_y, df_x]).fillna(0)

        

    return df_y


In [316]:
df_dummy = other_added_cols(df_dummy)
df_dummy_post_fraud_transactions = other_added_cols(df_dummy_post_fraud_transactions)

In [317]:
df_single = df_dummy[['trans_date_trans_time','category','amt','is_fraud','day_of_week','hour_of_day',
                                                         'is_weekend','time between transactions','time between transactions categories',
                                                         'daily_TBT_cat','weekly_TBT_cat','monthly_TBT_cat','log time between transactions categories',
                                                         'log time between transactions','cat_add_amt','cat_std_amt']].reset_index(drop=True)

df_single_post_fraud = df_dummy_post_fraud_transactions[['trans_date_trans_time','category','amt','is_fraud','day_of_week','hour_of_day',
                                                         'is_weekend','time between transactions','time between transactions categories',
                                                         'daily_TBT_cat','weekly_TBT_cat','monthly_TBT_cat','log time between transactions categories',
                                                         'log time between transactions','cat_add_amt','cat_std_amt']].reset_index(drop=True)

In [318]:
y_test = df_single_post_fraud['is_fraud']

In [319]:
Cats = ['amt', 'category', 'day_of_week', 'is_weekend', 'hour_of_day','log time between transactions','weekly_TBT_cat','cat_add_amt','cat_std_amt']

X = df_single[Cats]
X_test = df_single_post_fraud[Cats]

In [320]:
X = pd.get_dummies(X, columns=['category'], prefix='cat_')
X_test = pd.get_dummies(X_test, columns=['category'], prefix='cat_')


In [321]:
X = pd.get_dummies(X, columns=['hour_of_day'], prefix='hour')
X_test = pd.get_dummies(X_test, columns=['hour_of_day'], prefix='hour')

In [322]:
X = pd.get_dummies(X, columns=['day_of_week'], prefix='day')
X_test = pd.get_dummies(X_test, columns=['day_of_week'], prefix='day')

In [323]:
X_test


Unnamed: 0,amt,is_weekend,log time between transactions,weekly_TBT_cat,cat_add_amt,cat_std_amt,cat__entertainment,cat__food_dining,cat__gas_transport,cat__grocery_net,...,hour_21,hour_22,hour_23,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,13.30,0,9.519808,0.000000,13.30,0.000000,True,False,False,False,...,True,False,False,False,False,False,False,True,False,False
1,4.60,0,6.738152,0.000000,17.90,6.151829,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
2,41.03,1,8.217169,2022.325394,58.93,19.025368,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,13.45,0,10.368604,0.000000,72.38,15.839795,True,False,False,False,...,True,False,False,False,False,False,True,False,False,False
4,34.68,1,8.858795,17541.905028,107.06,15.594447,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,5.18,0,7.375882,0.000000,5.18,0.000000,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
900,7.61,0,8.222285,0.000000,12.79,1.718269,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
901,6.74,0,8.597851,0.000000,19.53,1.231219,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
902,2.34,1,7.686621,0.000000,21.87,2.314698,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [324]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_columns = ['amt','log time between transactions','weekly_TBT_cat','cat_add_amt','cat_std_amt']
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])
X_test[numerical_columns] = scaler.fit_transform(X_test[numerical_columns])



In [325]:
from sklearn.ensemble import IsolationForest
model = IsolationForest(contamination=0.05)  # Set the contamination parameter based on your dataset characteristics
model.fit(X)


In [326]:
from sklearn.metrics import roc_auc_score
y_score = model.decision_function(X_test)
auc_score = roc_auc_score(y_test, y_score)
print(f"AUC Score: {auc_score}")


AUC Score: 0.33130858642669664
