# Creating a single training data source

Dataset: [Credit Card Fraud Detection – Kaggle](https://www.kaggle.com/competitions/nus-fintech-recruitment/data)

## Importing modules

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from warnings import filterwarnings

## Data paths

In [13]:
# Relative locations of the raw CSV inputs, grouped as
# (transactions) and (lookup tables).
train_path, test_path = '../Data/train.csv', '../Data/test.csv'
customer_path, terminal_path = '../Data/customer.csv', '../Data/terminal.csv'

## Checking each data source file

In [14]:
# Load the labelled transactions (includes TX_FRAUD) and preview the first rows.
train = pd.read_csv(filepath_or_buffer=train_path)
train.head(n=5)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD
0,59383,2021-08-01 00:04:37,323,217,4.6,0
1,59384,2021-08-01 00:12:10,6,429,8.61,0
2,59385,2021-08-01 00:12:34,714,1011,64.0,0
3,59386,2021-08-01 00:15:40,266,1969,12.72,0
4,59387,2021-08-01 00:16:01,890,1482,98.88,0


In [15]:
# Load the per-customer table (keyed by CUSTOMER_ID) and preview the first rows.
customer = pd.read_csv(filepath_or_buffer=customer_path)
customer.head(n=5)

Unnamed: 0,CUSTOMER_ID,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals
0,0,54.88135,71.518937,62.262521,31.13126,2.179533,"[29, 87, 144, 241, 330, 858, 996, 1028, 1067, ...",22
1,1,42.36548,64.589411,46.570785,23.285393,3.567092,"[5, 160, 242, 378, 431, 475, 571, 762, 876, 93...",20
2,2,96.366276,38.344152,80.213879,40.106939,2.11558,"[316, 406, 447, 523, 968, 1200, 1318, 1365, 16...",10
3,3,56.804456,92.559664,11.748426,5.874213,0.348517,"[65, 94, 113, 364, 401, 433, 485, 651, 672, 77...",17
4,4,2.02184,83.261985,78.924891,39.462446,3.480049,"[372, 614, 774, 1362, 1446, 1564, 1637, 1939]",8


In [16]:
# Load the per-terminal table (keyed by TERMINAL_ID) and preview the first rows.
terminal = pd.read_csv(filepath_or_buffer=terminal_path)
terminal.head(n=5)

Unnamed: 0,TERMINAL_ID,x_terminal_id,y_terminal_id
0,0,41.7022,72.032449
1,1,0.011437,30.233257
2,2,14.675589,9.233859
3,3,18.626021,34.556073
4,4,39.676747,53.881673


In [17]:
# Load the test transactions (same layout as train, without TX_FRAUD)
# and preview the first rows.
test = pd.read_csv(filepath_or_buffer=test_path)
test.head(n=5)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT
0,352590,2022-01-01 00:03:14,208,640,147.24
1,352591,2022-01-01 00:26:39,851,315,13.27
2,352592,2022-01-01 00:30:30,454,27,70.52
3,352593,2022-01-01 00:31:31,677,1573,44.75
4,352594,2022-01-01 00:56:18,322,1398,37.6


## Creating training data

### Dropping available terminals from customer data

In [18]:
# Remove the `available_terminals` column before merging.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op rather than a KeyError.
customer = customer.drop(columns=['available_terminals'], errors='ignore')
customer.head(2)

Unnamed: 0,CUSTOMER_ID,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,nb_terminals
0,0,54.88135,71.518937,62.262521,31.13126,2.179533,22
1,1,42.36548,64.589411,46.570785,23.285393,3.567092,20


### Merging train and customer data

In [20]:
# Attach the customer columns to every transaction via CUSTOMER_ID.
# how='left' keeps every transaction row. validate='many_to_one' makes pandas
# raise a MergeError if CUSTOMER_ID is duplicated in `customer`, which would
# otherwise silently duplicate transaction rows in the training set.
train = pd.merge(
    train,
    customer,
    on='CUSTOMER_ID',
    how='left',
    validate='many_to_one',
)
train.head(2)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,nb_terminals
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,19
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,16


### Merging train and terminal data

In [21]:
# Attach the terminal columns to every transaction via TERMINAL_ID.
# how='left' keeps every transaction row. validate='many_to_one' makes pandas
# raise a MergeError if TERMINAL_ID is duplicated in `terminal`, which would
# otherwise silently duplicate transaction rows in the training set.
train_df = pd.merge(
    train,
    terminal,
    on='TERMINAL_ID',
    how='left',
    validate='many_to_one',
)

## Final data set

In [23]:
# Preview the first five rows of the merged training frame.
train_df.head(n=5)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,20,62.417972,26.483666


In [25]:
# Persist the merged training frame; index=False keeps the row index out of the file.
train_df.to_csv(path_or_buf='../Data/final_train.csv', index=False)

In [26]:
# Read the saved CSV back from disk and preview it to confirm the write.
df = pd.read_csv(filepath_or_buffer='../Data/final_train.csv')
df.head(n=5)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,nb_terminals,x_terminal_id,y_terminal_id
0,59383,2021-08-01 00:04:37,323,217,4.6,0,84.515409,38.276422,7.353061,3.67653,3.324124,19,80.316334,40.239252
1,59384,2021-08-01 00:12:10,6,429,8.61,0,11.827443,63.992102,18.618562,9.309281,3.778676,16,15.172487,63.912685
2,59385,2021-08-01 00:12:34,714,1011,64.0,0,75.221083,94.991427,82.620413,41.310207,3.723765,13,74.196424,98.288079
3,59386,2021-08-01 00:15:40,266,1969,12.72,0,51.122179,8.329098,9.852171,4.926085,3.862067,15,51.950635,6.563484
4,59387,2021-08-01 00:16:01,890,1482,98.88,0,62.777887,31.40527,83.660035,41.830018,3.128315,20,62.417972,26.483666
