In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import  MinMaxScaler

In [12]:
# Load the raw transactions and remove every single-quote character from the
# string cells (the regex replace is applied across the whole frame).
transaction_df = pd.read_csv('original_data.csv')
transaction_df = transaction_df.replace("'", '', regex=True)
# Drop unused columns
transaction_df = transaction_df.drop(columns=['step', 'age', 'gender', 'zipcodeOri', 'zipMerchant'])
# Split data to train, test (seeded so the split is reproducible).
# train_test_split hands back row subsets of transaction_df; take explicit
# copies so the feature columns added in later cells are written to
# independent frames instead of triggering pandas' SettingWithCopyWarning.
# NOTE(review): the split is not stratified on 'fraud' -- confirm that is intended.
train, test = (subset.copy() for subset in train_test_split(transaction_df, random_state=42))
train.head()

Unnamed: 0,customer,merchant,category,amount,fraud
458422,C1060470494,M1823072687,es_transportation,40.54,0
102312,C1574217480,M1823072687,es_transportation,38.63,0
253447,C931370744,M1823072687,es_transportation,61.62,0
585032,C169274898,M85975013,es_food,41.24,0
136214,C98091448,M348934600,es_transportation,4.74,0


In [13]:
# Create merchant fraud rate: the mean of 'fraud' per merchant, computed on the
# train set only and cut into 5 equal-width bins labelled 0 - 4.
# NOTE(review): every train row's own label contributes to its merchant's rate.
mechant_fraud_rate = pd.cut(train.groupby('merchant')['fraud'].mean(), bins=5, labels=range(5))
# Vectorised lookup of each row's merchant in the bin table. The bins are cast
# to plain ints so the new column is numeric rather than categorical.
# Merchants that never occur in the train set have no bin; they fall back to
# bin 0 instead of leaking NaN into the scaled/exported features.
# NOTE(review): bin 0 as the default for unseen merchants is a modelling choice -- confirm.
train['merchant fraud rate'] = train['merchant'].map(mechant_fraud_rate.astype(int)).fillna(0).astype(int)
test['merchant fraud rate'] = test['merchant'].map(mechant_fraud_rate.astype(int)).fillna(0).astype(int)

In [14]:
# Create previous fraud per customer: the mean of 'fraud' per customer, computed
# on the train set only and cut into 5 equal-width bins labelled 0 - 4.
# NOTE(review): every train row's own label contributes to its customer's rate.
customer_previous_fraud = pd.cut(train.groupby('customer')['fraud'].mean(), bins=5, labels=range(5))
# Vectorised lookup of each row's customer in the bin table. The bins are cast
# to plain ints so the new column is numeric rather than categorical.
# Customers that never occur in the train set have no bin; they fall back to
# bin 0 instead of leaking NaN into the scaled/exported features.
# NOTE(review): bin 0 as the default for unseen customers is a modelling choice -- confirm.
train['previous fraud'] = train['customer'].map(customer_previous_fraud.astype(int)).fillna(0).astype(int)
test['previous fraud'] = test['customer'].map(customer_previous_fraud.astype(int)).fillna(0).astype(int)

In [15]:
# Create category fraud rate: the mean of 'fraud' per category, computed on the
# train set only and cut into 5 equal-width bins labelled 0 - 4.
# NOTE(review): every train row's own label contributes to its category's rate.
category_fraud_rate = pd.cut(train.groupby('category')['fraud'].mean(), bins=5, labels=range(5))
# Vectorised lookup of each row's category in the bin table. The bins are cast
# to plain ints so the new column is numeric rather than categorical.
# Categories that never occur in the train set have no bin; they fall back to
# bin 0 instead of leaking NaN into the scaled/exported features.
# NOTE(review): bin 0 as the default for unseen categories is a modelling choice -- confirm.
train['category fraud rate'] = train['category'].map(category_fraud_rate.astype(int)).fillna(0).astype(int)
test['category fraud rate'] = test['category'].map(category_fraud_rate.astype(int)).fillna(0).astype(int)

In [16]:
# The raw identifier columns have been replaced by the binned fraud-rate
# features above, so remove them from both splits (in place).
for split_frame in (train, test):
    split_frame.drop(columns=['customer', 'merchant', 'category'], inplace=True)
train.head()

Unnamed: 0,amount,fraud,merchant fraud rate,previous fraud,category fraud rate
458422,40.54,0,0,0,0
102312,38.63,0,0,0,0
253447,61.62,0,0,0,0
585032,41.24,0,0,0,0
136214,4.74,0,0,0,0


In [17]:
# Separate the 'fraud' label (y) from the remaining predictor columns (X)
# for each split; train and test themselves are left unchanged.
X_train, y_train = train.drop(columns='fraud'), train['fraud']
X_test, y_test = test.drop(columns='fraud'), test['fraud']

In [18]:
# Learn the per-column min/max from the train features only, then apply that
# same fitted scaling to both splits.
MMscaler = MinMaxScaler().fit(X_train)
X_train_transformed = MMscaler.transform(X_train)
X_test_transformed = MMscaler.transform(X_test)

In [19]:
# Wrap the scaled feature arrays in DataFrames and write them out as bare CSVs
# (no index column, no header row). Only the feature matrices are written by
# this cell; the y_train / y_test labels are not.
X_train_df, X_test_df = (
    pd.DataFrame(scaled) for scaled in (X_train_transformed, X_test_transformed)
)
X_train_df.to_csv('../active_datasets/bbb_train.csv', index=False, header=False)
X_test_df.to_csv('../active_datasets/bbb_test.csv', index=False, header=False)