In [1]:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split

In [2]:
# Load the pre-cleaned transaction table; every later cell works from `df`.
df = pd.read_csv(filepath_or_buffer='Cleaned_data.csv')

In [3]:
# --- Feature engineering ------------------------------------------------------
# Parse the timestamp column, then order the rows before building any
# order-dependent (expanding / shifted / cumulative) features.
# NOTE(review): 'Recnum' is the primary sort key, so 'Date' only breaks ties
# between equal 'Recnum' values -- confirm that 'Recnum' order is chronological.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Recnum', 'Date'])

# Calendar attributes derived from the transaction date.
txn_date = df['Date'].dt
df['Day_of_Week'] = txn_date.dayofweek
df['Month'] = txn_date.month
df['Weekend'] = df['Day_of_Week'].ge(5).astype(int)  # dayofweek 5 and 6 -> 1
df['Year'] = txn_date.year
df['Day'] = txn_date.day

# Per-card running statistics of 'Amount'. Each expanding window runs from the
# card's first row up to and including the current row, so the current amount
# is part of its own mean / min / max.
card_amounts = df.groupby('Cardnum')['Amount']
df['Avg_Transaction_Amount'] = card_amounts.transform(lambda s: s.expanding().mean())
df['Min_Amount'] = card_amounts.transform(lambda s: s.expanding().min())
df['Max_Amount'] = card_amounts.transform(lambda s: s.expanding().max())

# Difference between this amount and the card's running mean.
df['Amount_Deviation'] = df['Amount'] - df['Avg_Transaction_Amount']

# Row counts per (merchant | card, calendar week | calendar month) bucket.
# NOTE(review): every row receives the size of its whole bucket, so the count
# also includes rows that occur later in the same week / month.
# NOTE(review): newer pandas releases prefer 'ME' over the 'M' alias; 'M' is
# kept unchanged here -- confirm against the pandas version in use.
frequency_specs = {
    'Merchant_Frequency_Week': ('Merchnum', 'W'),
    'Merchant_Frequency_Month': ('Merchnum', 'M'),
    'Card_Frequency_Week': ('Cardnum', 'W'),
    'Card_Frequency_Month': ('Cardnum', 'M'),
}
for column, (entity_key, freq) in frequency_specs.items():
    period_bucket = pd.Grouper(key='Date', freq=freq)
    df[column] = df.groupby([entity_key, period_bucket])['Recnum'].transform('count')

# 'Fraud' value of the card's preceding row in the current sort order;
# 0 where there is no preceding value.
df['Previous_Fraud'] = df.groupby('Cardnum')['Fraud'].shift().fillna(0)

# Zero-based running row counters within each group.
# NOTE(review): despite the 'Days_Since_*' names these are cumcount() values
# (number of earlier rows in the group), not a number of days.
running_count_keys = {
    'Days_Since_Card_Seen': 'Cardnum',
    'Days_Since_Merchant_Appeared': 'Merchnum',
    'Days_Since_Card_Merch_Combination': ['Cardnum', 'Merchnum'],
}
for column, keys in running_count_keys.items():
    df[column] = df.groupby(keys).cumcount()

# Keep only the identifier, feature and target columns, in a fixed order
# ('Date' itself is not carried forward).
feature_columns = [
    'Recnum', 'Cardnum', 'Year', 'Day', 'Day_of_Week', 'Month', 'Weekend',
    'Amount', 'Avg_Transaction_Amount', 'Amount_Deviation', 'Min_Amount', 'Max_Amount',
    'Merchnum', 'Merch description', 'Merch state', 'Merch zip',
    'Merchant_Frequency_Week', 'Merchant_Frequency_Month',
    'Card_Frequency_Week', 'Card_Frequency_Month',
    'Previous_Fraud', 'Days_Since_Card_Seen',
    'Days_Since_Merchant_Appeared', 'Days_Since_Card_Merch_Combination', 'Fraud',
]
df = df[feature_columns]


In [4]:
# Persist the engineered feature table. `index` is left at its default, so the
# row index is written to the file as well.
df.to_csv(path_or_buf='Data_with_features.csv')

In [5]:
# Out-of-time (OOT) holdout: rows whose Month is greater than 10 go to `oot`,
# rows whose Month is less than 11 stay in `df`.
# Explicit .copy() makes both results independent frames; the next cell assigns
# columns into them, which on a boolean-mask slice raises SettingWithCopyWarning.
# NOTE(review): the filter looks at Month only, not Year -- this assumes the
# data covers a single calendar year; confirm.
oot = df[df['Month'] > 10].copy()
df = df[df['Month'] < 11].copy()

In [6]:
# Columns handled as categorical by the encoding step that follows.
cat_cols = [
    'Cardnum',
    'Day_of_Week',
    'Month',
    'Merchnum',
    'Merch description',
    'Merch state',
    'Merch zip',
    'Previous_Fraud',
]

# Apply the same dtype conversion to the in-time and out-of-time frames.
for frame in (df, oot):
    frame[cat_cols] = frame[cat_cols].astype("category")

In [7]:
# Binary-encode the categorical columns. The encoder is fitted on the in-time
# rows only and then applied, already fitted, to the out-of-time rows.
encoder = ce.BinaryEncoder(cols=cat_cols)
in_time_categoricals, oot_categoricals = df[cat_cols], oot[cat_cols]
df_binary = encoder.fit_transform(in_time_categoricals)
oot_binary = encoder.transform(oot_categoricals)

In [8]:
# Replace the raw categorical columns with their binary-encoded counterparts
# (column-wise concat, aligned on the row index), then fill any remaining
# missing values with 0.
df = pd.concat([df.drop(columns=cat_cols), df_binary], axis=1).fillna(0)
oot = pd.concat([oot.drop(columns=cat_cols), oot_binary], axis=1).fillna(0)

In [9]:
# Separate predictors from the 'Fraud' target for the in-time and out-of-time
# sets; 'Recnum' is dropped together with the target so it is not a predictor.
non_feature_cols = ['Fraud', 'Recnum']
X, Y = df.drop(columns=non_feature_cols), df['Fraud']
X_oot, y_oot = oot.drop(columns=non_feature_cols), oot['Fraud']

# 80/20 split of the in-time rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=125, stratify=Y
)

# Checking the shapes of the resulting arrays
shape_report = [
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("X_oot shape:", X_oot),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
    ("y_oot shape:", y_oot),
]
for label, part in shape_report:
    print(label, part.shape)

X_train shape: (66680, 82)
X_test shape: (16671, 82)
X_oot shape: (12289, 82)
y_train shape: (66680,)
y_test shape: (16671,)
y_oot shape: (12289,)


In [10]:
# Write every split to its own CSV file (row index included by default).
# NOTE(review): 'X_Train.csv' is capitalised differently from the other file
# names; it is kept unchanged because the readers of these files are not
# visible here.
exports = {
    'X_Train.csv': X_train,
    'y_train.csv': y_train,
    'X_test.csv': X_test,
    'y_test.csv': y_test,
    'X_oot.csv': X_oot,
    'y_oot.csv': y_oot,
}
for filename, split in exports.items():
    split.to_csv(filename)

In [13]:
# Display the shape of the training labels as the cell output.
y_train.shape

(66680,)