# Team members
- Bryce Grahn
- Michael Rolle
- Werner de jager
- Abdul Gany Osman
- Lavania Naidoo

# Introduction
The aim of this notebook is undertaking a machine learning investigation on the....

# 1. Imports and installations

In [1]:
# imports
import os
import math
import numpy as np
import scipy as sp
import pandas as pd
import sympy as sym
import seaborn as sns
import sklearn as sk
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss
from sklearn.linear_model import LinearRegression, SGDRegressor, SGDClassifier, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from itertools import chain
from scipy.stats.mstats import winsorize

# define plt settings 
plt.rcParams["font.size"] = 20 
plt.rcParams["axes.labelsize"] = 20 
plt.rcParams["xtick.labelsize"] = 20 
plt.rcParams["ytick.labelsize"] = 20 
plt.rcParams["legend.fontsize"] = 20 
plt.rcParams["figure.figsize"] = (20,10)

# define seaborn settings seaborn
sns.set(style="ticks", color_codes=True)
sns.set_palette("husl")

## 1.1 Import Dataset

In [2]:
import os
os.listdir("/kaggle/input/payments-hackathon-rojones/Payments Fraud DataSet/")

['transactions_test.csv',
 'transactions_train.csv',
 'terminals.csv',
 'merchants.csv',
 'customers.csv']

In [3]:
url = '/kaggle/input/payments-hackathon-rojones/Payments Fraud DataSet/'

customers_df = pd.read_csv(url + "customers.csv", index_col = 'CUSTOMER_ID')
terminals_df = pd.read_csv(url + "terminals.csv", index_col = 'TERMINAL_ID')
merchants_df = pd.read_csv(url + "merchants.csv", index_col = 'MERCHANT_ID')
transactions_test_df = pd.read_csv(url + "transactions_test.csv", index_col = 'TX_ID')
transactions_train_df = pd.read_csv(url + "transactions_test.csv", index_col = 'TX_ID')

## 1.2 Display datasets

In [4]:
print("Customers dataset:")
print("----------------------")
display(customers_df.info())
print("\nterminals dataset:")
print("----------------------")
display(terminals_df.info())
print("\nmerchants dataset:")
print("----------------------")
display(merchants_df.info())
print("\ntransactions_test dataset:")
print("----------------------")
display(transactions_test_df.info())
print("\ntransactions_train dataset:")
print("----------------------")
display(transactions_train_df.info())

Customers dataset:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, 7894622031164826 to 5987355644780185
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   x_customer_id  50000 non-null  float64
 1   y_customer_id  50000 non-null  float64
dtypes: float64(2)
memory usage: 1.1 MB


None


terminals dataset:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 9754224 to 14982407
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   x_terminal_id   5000 non-null   float64
 1   y_terminal__id  5000 non-null   float64
dtypes: float64(2)
memory usage: 117.2 KB


None


merchants dataset:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 30452 entries, 1b7557bb-208b-4fdc-ba91-eca98298541a to cb5826ba-6a40-4f7f-9b53-ae7f3e35e740
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   BUSINESS_TYPE                       30452 non-null  object
 1   MCC_CODE                            30452 non-null  int64 
 2   LEGAL_NAME                          30452 non-null  object
 3   FOUNDATION_DATE                     30452 non-null  object
 4   TAX_EXCEMPT_INDICATOR               30452 non-null  bool  
 5   OUTLET_TYPE                         30452 non-null  object
 6   ACTIVE_FROM                         30452 non-null  object
 7   TRADING_FROM                        30452 non-null  object
 8   ANNUAL_TURNOVER_CARD                30452 non-null  int64 
 9   ANNUAL_TURNOVER                     30452 non-null  int64 
 10  AVERAGE_TICKET

None


transactions_test dataset:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 71139 entries, 09324d812ba7915c3f791e973db293ad50db70d8 to 4af7a6f698e30cecc97ac5254315c97db971bf56
Data columns (total 20 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   TX_TS                                  71139 non-null  object 
 1   CUSTOMER_ID                            71139 non-null  int64  
 2   TERMINAL_ID                            71139 non-null  int64  
 3   TX_AMOUNT                              71139 non-null  float64
 4   TRANSACTION_GOODS_AND_SERVICES_AMOUNT  71139 non-null  float64
 5   TRANSACTION_CASHBACK_AMOUNT            71139 non-null  float64
 6   CARD_EXPIRY_DATE                       71139 non-null  object 
 7   CARD_DATA                              71139 non-null  object 
 8   CARD_BRAND                             71139 non-null  object 
 9   TRANSACTION_TYPE  

None


transactions_train dataset:
----------------------
<class 'pandas.core.frame.DataFrame'>
Index: 71139 entries, 09324d812ba7915c3f791e973db293ad50db70d8 to 4af7a6f698e30cecc97ac5254315c97db971bf56
Data columns (total 20 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   TX_TS                                  71139 non-null  object 
 1   CUSTOMER_ID                            71139 non-null  int64  
 2   TERMINAL_ID                            71139 non-null  int64  
 3   TX_AMOUNT                              71139 non-null  float64
 4   TRANSACTION_GOODS_AND_SERVICES_AMOUNT  71139 non-null  float64
 5   TRANSACTION_CASHBACK_AMOUNT            71139 non-null  float64
 6   CARD_EXPIRY_DATE                       71139 non-null  object 
 7   CARD_DATA                              71139 non-null  object 
 8   CARD_BRAND                             71139 non-null  object 
 9   TRANSACTION_TYPE 

None