# 1. Load and Inspect the Data

In [1]:
# Import the modules
import numpy as np # For numerical operations and calculations
import pandas as pd # To read and manipulate the credit card transaction data as a dataframe
from pathlib import Path # To specify the file path for reading the csv file
from sklearn.preprocessing import StandardScaler # To scale the data
import seaborn as sns # To create pairplots and heatmaps to visualize data relationships and correlations
import matplotlib.pyplot as plt # To create and display visualizations, including heatmaps and confusion matrices
from scipy import stats # To calculate the Pearson correlation coefficient
from statsmodels.stats.outliers_influence import variance_inflation_factor # To test for multicollinearity in independent variables

In [2]:
# Load the credit card transaction sample into a pandas DataFrame.
# The path is relative, so "sample1.csv" is resolved against the current working directory.
sample_path = Path("sample1.csv")
fraud_df = pd.read_csv(sample_path)

# Preview the first rows to confirm the file was parsed as expected
fraud_df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long
0,2019-10-19 20:59:57,503848303379,fraud_Dooley Inc,shopping_pos,4.94,Gregory,Wallace,M,27203 Darrell Landing,Saint James City,...,33956,26.529,-82.0916,3776,Sport and exercise psychologist,1976-09-12,32a2a8c3df8da577d4cfdd187f9e7c15,1350680397,26.431843,-82.196152
1,2019-02-24 23:53:14,345060451158891,"fraud_Cole, Hills and Jewess",home,83.69,Brian,Perez,M,78652 Scott Ports,Ashfield,...,1330,42.5232,-72.811,1506,"Production assistant, radio",1960-04-03,124de445fe369a24ef833a45a38283ea,1330127594,42.934125,-71.972246
2,2019-06-15 12:11:39,4939976756738216,fraud_Sawayn PLC,shopping_pos,41.52,Michelle,Johnston,F,3531 Hamilton Highway,Roma,...,78584,26.4215,-99.0025,18128,IT trainer,1990-11-07,93a5291588d4b24c1c9f76a3aa4bc157,1339762299,27.358196,-98.173362
3,2019-06-17 14:35:11,3533742182628021,fraud_Mohr-Bayer,shopping_net,1.35,Robert,Haynes,M,857 Aaron Circles Suite 398,Johns Island,...,29455,32.8357,-79.8217,20478,Materials engineer,1997-06-04,e037ac8a7d6b5b16c2eb9c789aab827f,1339943711,33.153224,-80.232591
4,2019-12-30 23:40:41,571465035400,fraud_Schmidt and Sons,shopping_net,1115.07,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,...,82514,43.0048,-108.8964,1645,Freight forwarder,1976-02-26,858e5361ef522f2ad556065b78db7af8,1356910841,43.850309,-108.448989


In [3]:
# Report whether any cell anywhere in the dataframe holds a missing value
has_nulls = fraud_df.isnull().values.any()
print("There are null values in the DataFrame." if has_nulls else "No null values found in the DataFrame.")

No null values found in the DataFrame.


In [4]:
# Remove the cc_num and trans_num columns: credit card numbers are randomly generated by the banks
# and have no link to whether fraud will be committed
fraud_df = fraud_df.drop(columns=['cc_num', 'trans_num'])
fraud_df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long
0,2019-10-19 20:59:57,fraud_Dooley Inc,shopping_pos,4.94,Gregory,Wallace,M,27203 Darrell Landing,Saint James City,FL,33956,26.529,-82.0916,3776,Sport and exercise psychologist,1976-09-12,1350680397,26.431843,-82.196152
1,2019-02-24 23:53:14,"fraud_Cole, Hills and Jewess",home,83.69,Brian,Perez,M,78652 Scott Ports,Ashfield,MA,1330,42.5232,-72.811,1506,"Production assistant, radio",1960-04-03,1330127594,42.934125,-71.972246
2,2019-06-15 12:11:39,fraud_Sawayn PLC,shopping_pos,41.52,Michelle,Johnston,F,3531 Hamilton Highway,Roma,TX,78584,26.4215,-99.0025,18128,IT trainer,1990-11-07,1339762299,27.358196,-98.173362
3,2019-06-17 14:35:11,fraud_Mohr-Bayer,shopping_net,1.35,Robert,Haynes,M,857 Aaron Circles Suite 398,Johns Island,SC,29455,32.8357,-79.8217,20478,Materials engineer,1997-06-04,1339943711,33.153224,-80.232591
4,2019-12-30 23:40:41,fraud_Schmidt and Sons,shopping_net,1115.07,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Freight forwarder,1976-02-26,1356910841,43.850309,-108.448989


In [5]:
# Check datatypes of each column; the object-typed (string) columns are the ones that get
# converted to numbers in the later sections of this notebook
fraud_df.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
unix_time                  int64
merch_lat                float64
merch_long               float64
dtype: object

In [6]:
# Create a placeholder is_fraud column: exactly two rows are flagged 1 (fraudulent), the rest 0.
# This is so we can target encode the sample data in order to run it through the picked model.
# NOTE(review): these labels are assigned at random, so the target-encoded values derived from
# them carry no real fraud signal -- confirm whether the encodings learned on the training data
# should be reused here instead.

# Number of placeholder fraud labels, and the seed that fixes where they land
N_FRAUD = 2
SEED = 42

# Calculate the number of rows in fraud_df
total_rows = len(fraud_df)
if total_rows < N_FRAUD:
    # Without this guard the label array would be longer than the dataframe
    raise ValueError(f"Need at least {N_FRAUD} rows to place the fraud labels, got {total_rows}")

# N_FRAUD ones followed by zeros, one entry per row
array = np.array([1] * N_FRAUD + [0] * (total_rows - N_FRAUD))

# Shuffle with a seeded generator so a fresh "Restart & Run All" yields the same labels each time
np.random.default_rng(SEED).shuffle(array)

# Assign the labels positionally. Unlike pd.concat(axis=1) this does not align on index labels
# (so a non-default index cannot introduce NaN rows), and re-running the cell overwrites the
# column instead of appending a second 'is_fraud' column.
fraud_df['is_fraud'] = array
fraud_df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-10-19 20:59:57,fraud_Dooley Inc,shopping_pos,4.94,Gregory,Wallace,M,27203 Darrell Landing,Saint James City,FL,33956,26.529,-82.0916,3776,Sport and exercise psychologist,1976-09-12,1350680397,26.431843,-82.196152,0
1,2019-02-24 23:53:14,"fraud_Cole, Hills and Jewess",home,83.69,Brian,Perez,M,78652 Scott Ports,Ashfield,MA,1330,42.5232,-72.811,1506,"Production assistant, radio",1960-04-03,1330127594,42.934125,-71.972246,0
2,2019-06-15 12:11:39,fraud_Sawayn PLC,shopping_pos,41.52,Michelle,Johnston,F,3531 Hamilton Highway,Roma,TX,78584,26.4215,-99.0025,18128,IT trainer,1990-11-07,1339762299,27.358196,-98.173362,0
3,2019-06-17 14:35:11,fraud_Mohr-Bayer,shopping_net,1.35,Robert,Haynes,M,857 Aaron Circles Suite 398,Johns Island,SC,29455,32.8357,-79.8217,20478,Materials engineer,1997-06-04,1339943711,33.153224,-80.232591,0
4,2019-12-30 23:40:41,fraud_Schmidt and Sons,shopping_net,1115.07,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Freight forwarder,1976-02-26,1356910841,43.850309,-108.448989,0


## 2. Convert date and time columns into Unix timestamps
### Columns: 'trans_date_trans_time' and 'dob'

In [7]:
# Turn both date-like text columns into integer Unix timestamps (whole seconds since 1970-01-01).
# Each column is parsed with its own explicit format and then measured against the epoch.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta('1s')
datetime_formats = {
    'trans_date_trans_time': '%Y-%m-%d %H:%M:%S',  # date plus time of day
    'dob': '%Y-%m-%d',                             # date only
}

for column_name, date_format in datetime_formats.items():
    parsed = pd.to_datetime(fraud_df[column_name], format=date_format)
    # Floor-divide the elapsed time by one second; dates before 1970 come out negative
    fraud_df[column_name] = (parsed - unix_epoch) // one_second

# Check it worked
fraud_df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,1571518797,fraud_Dooley Inc,shopping_pos,4.94,Gregory,Wallace,M,27203 Darrell Landing,Saint James City,FL,33956,26.529,-82.0916,3776,Sport and exercise psychologist,211334400,1350680397,26.431843,-82.196152,0
1,1551052394,"fraud_Cole, Hills and Jewess",home,83.69,Brian,Perez,M,78652 Scott Ports,Ashfield,MA,1330,42.5232,-72.811,1506,"Production assistant, radio",-307584000,1330127594,42.934125,-71.972246,0
2,1560600699,fraud_Sawayn PLC,shopping_pos,41.52,Michelle,Johnston,F,3531 Hamilton Highway,Roma,TX,78584,26.4215,-99.0025,18128,IT trainer,657936000,1339762299,27.358196,-98.173362,0
3,1560782111,fraud_Mohr-Bayer,shopping_net,1.35,Robert,Haynes,M,857 Aaron Circles Suite 398,Johns Island,SC,29455,32.8357,-79.8217,20478,Materials engineer,865382400,1339943711,33.153224,-80.232591,0
4,1577749241,fraud_Schmidt and Sons,shopping_net,1115.07,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,WY,82514,43.0048,-108.8964,1645,Freight forwarder,194140800,1356910841,43.850309,-108.448989,0


In [8]:
# Check datatypes again: 'trans_date_trans_time' and 'dob' should now be integer columns
fraud_df.dtypes

trans_date_trans_time      int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                        int64
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int32
dtype: object

In [9]:
# Standardise the numeric columns.
# Features measured in different units or magnitudes would otherwise not have an equal influence
# on machine learning algorithms, and scaling also helps them converge efficiently.
# NOTE(review): the scaler is fitted on this sample itself -- if the encoded file is meant for a
# model trained elsewhere, confirm whether the training-time scaler should be applied instead.

# Columns to scale (all assumed to be numeric at this point)
columns_to_scale = [
    'trans_date_trans_time',
    'amt',
    'zip',
    'lat',
    'long',
    'city_pop',
    'dob',
    'unix_time',
    'merch_lat',
    'merch_long',
]

# Learn the scaling parameters from the selected columns...
scaler = StandardScaler()
scaler.fit(fraud_df[columns_to_scale])

# ...then overwrite those columns with their scaled values
fraud_df[columns_to_scale] = scaler.transform(fraud_df[columns_to_scale])

# Check it worked
fraud_df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,-0.439028,fraud_Dooley Inc,shopping_pos,-0.439041,Gregory,Wallace,M,27203 Darrell Landing,Saint James City,FL,-0.460523,-2.487956,0.506891,-0.278382,Sport and exercise psychologist,0.214474,-0.437295,-2.462929,0.494765,0
1,-1.550205,"fraud_Cole, Hills and Jewess",home,0.033496,Brian,Perez,M,78652 Scott Ports,Ashfield,MA,-1.726776,0.904109,1.265039,-0.287463,"Production assistant, radio",-0.718534,-1.554888,0.965477,1.330373,0
2,-1.031801,fraud_Sawayn PLC,shopping_pos,-0.219544,Michelle,Johnston,F,3531 Hamilton Highway,Roma,TX,1.271542,-2.510754,-0.87459,-0.220967,IT trainer,1.017458,-1.030985,-2.270476,-0.811065,0
3,-1.021952,fraud_Mohr-Bayer,shopping_net,-0.460582,Robert,Haynes,M,857 Aaron Circles Suite 398,Johns Island,SC,-0.635212,-1.150425,0.692323,-0.211566,Materials engineer,1.390443,-1.02112,-1.066539,0.655249,0
4,-0.10076,fraud_Schmidt and Sons,shopping_net,6.222258,Louis,Fisher,M,45654 Hess Rest,Fort Washakie,WY,1.42407,1.006247,-1.68284,-0.286907,Freight forwarder,0.18356,-0.098504,1.155817,-1.6509,0


## 3. Target-encode each categorical feature against the 'is_fraud' target variable (all except the gender column)

In [10]:
# Target-encode the 'merchant' feature against the 'is_fraud' target variable

# Fraud rate per merchant: the mean of the 0/1 'is_fraud' flag over each merchant's rows
merchant_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['merchant']).mean()

# Swap every merchant name for the fraud rate of its group
fraud_df['merchant'] = fraud_df['merchant'].map(merchant_fraud_rate)

In [11]:
# Target-encode the 'category' feature against the 'is_fraud' target variable

# Fraud rate per category: the mean of the 0/1 'is_fraud' flag over each category's rows
category_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['category']).mean()

# Swap every category label for the fraud rate of its group
fraud_df['category'] = fraud_df['category'].map(category_fraud_rate)

In [12]:
# Target-encode the 'first' feature against the 'is_fraud' target variable

# Fraud rate per 'first' value: the mean of the 0/1 'is_fraud' flag over the rows sharing it
first_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['first']).mean()

# Swap every 'first' value for the fraud rate of its group
fraud_df['first'] = fraud_df['first'].map(first_fraud_rate)

In [13]:
# Target-encode the 'last' feature against the 'is_fraud' target variable

# Fraud rate per 'last' value: the mean of the 0/1 'is_fraud' flag over the rows sharing it
last_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['last']).mean()

# Swap every 'last' value for the fraud rate of its group
fraud_df['last'] = fraud_df['last'].map(last_fraud_rate)

In [14]:
# Target-encode the 'street' feature against the 'is_fraud' target variable

# Fraud rate per street: the mean of the 0/1 'is_fraud' flag over each street's rows
street_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['street']).mean()

# Swap every street value for the fraud rate of its group
fraud_df['street'] = fraud_df['street'].map(street_fraud_rate)

In [15]:
# Target-encode the 'city' feature against the 'is_fraud' target variable

# Fraud rate per city: the mean of the 0/1 'is_fraud' flag over each city's rows
city_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['city']).mean()

# Swap every city name for the fraud rate of its group
fraud_df['city'] = fraud_df['city'].map(city_fraud_rate)

In [16]:
# Target-encode the 'state' feature against the 'is_fraud' target variable

# Fraud rate per state: the mean of the 0/1 'is_fraud' flag over each state's rows
state_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['state']).mean()

# Swap every state code for the fraud rate of its group
fraud_df['state'] = fraud_df['state'].map(state_fraud_rate)

In [17]:
# Target-encode the 'job' feature against the 'is_fraud' target variable

# Fraud rate per job: the mean of the 0/1 'is_fraud' flag over each job's rows
job_fraud_rate = fraud_df['is_fraud'].groupby(fraud_df['job']).mean()

# Swap every job title for the fraud rate of its group
fraud_df['job'] = fraud_df['job'].map(job_fraud_rate)

# Display fraud_df to review all of the encoded columns
fraud_df

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,-0.439028,0.0,0.000000,-0.439041,0.0,0.0,M,0.0,0.0,0.0,-0.460523,-2.487956,0.506891,-0.278382,0.0,0.214474,-0.437295,-2.462929,0.494765,0
1,-1.550205,0.0,0.037037,0.033496,0.0,0.0,M,0.0,0.0,0.0,-1.726776,0.904109,1.265039,-0.287463,0.0,-0.718534,-1.554888,0.965477,1.330373,0
2,-1.031801,0.0,0.000000,-0.219544,0.0,0.0,F,0.0,0.0,0.0,1.271542,-2.510754,-0.874590,-0.220967,0.0,1.017458,-1.030985,-2.270476,-0.811065,0
3,-1.021952,0.0,0.000000,-0.460582,0.0,0.0,M,0.0,0.0,0.0,-0.635212,-1.150425,0.692323,-0.211566,0.0,1.390443,-1.021120,-1.066539,0.655249,0
4,-0.100760,0.0,0.000000,6.222258,0.0,0.0,M,0.0,0.0,0.0,1.424070,1.006247,-1.682840,-0.286907,0.0,0.183560,-0.098504,1.155817,-1.650900,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.855272,0.0,0.000000,0.328959,0.0,0.0,F,0.0,0.0,0.0,-1.178801,0.361267,0.695297,-0.292751,0.0,-1.536275,0.854308,0.154477,0.631290,0
296,-1.324280,0.0,0.000000,-0.455422,0.0,0.0,F,0.0,0.0,0.0,-1.178452,0.345276,0.688426,-0.283970,0.0,-0.106004,-1.323916,0.496213,0.679470,0
297,-0.475769,0.0,0.000000,-0.440301,0.0,0.0,M,0.0,0.0,0.0,-0.309431,-0.470112,0.347723,-0.191640,0.0,-0.214125,-0.474093,-0.594065,0.359936,0
298,-0.833178,0.0,0.037037,-0.408678,0.0,0.0,F,0.0,0.0,0.0,2.003404,1.171861,-2.928419,-0.292447,0.0,-0.922037,-0.832054,1.104250,-2.856561,0


## 4. Convert gender feature from categorical to numerical (male-1, female-0)

In [18]:
# Encode the "gender" column as integers: "M" -> 1, "F" -> 0.
# Series.map is used instead of Series.replace: replace only produced a numeric column by silently
# downcasting the object column, which recent pandas versions deprecate (FutureWarning); without
# that downcast the column would stay object-typed.
gender_codes = {'M': 1, 'F': 0}

# Fail loudly on anything other than "M"/"F" (missing values included) rather than letting an
# unencoded value slip into the numeric output. This also means the cell expects the raw strings:
# run it once per fresh load of the data.
known_gender = fraud_df['gender'].isin(list(gender_codes))
if not known_gender.all():
    unexpected = fraud_df.loc[~known_gender, 'gender'].unique().tolist()
    raise ValueError(f"Unexpected values in 'gender' column: {unexpected}")

# Every value is known to be mapped at this point, so the explicit integer cast is safe
fraud_df['gender'] = fraud_df['gender'].map(gender_codes).astype('int64')

# Check it worked
fraud_df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,-0.439028,0.0,0.0,-0.439041,0.0,0.0,1,0.0,0.0,0.0,-0.460523,-2.487956,0.506891,-0.278382,0.0,0.214474,-0.437295,-2.462929,0.494765,0
1,-1.550205,0.0,0.037037,0.033496,0.0,0.0,1,0.0,0.0,0.0,-1.726776,0.904109,1.265039,-0.287463,0.0,-0.718534,-1.554888,0.965477,1.330373,0
2,-1.031801,0.0,0.0,-0.219544,0.0,0.0,0,0.0,0.0,0.0,1.271542,-2.510754,-0.87459,-0.220967,0.0,1.017458,-1.030985,-2.270476,-0.811065,0
3,-1.021952,0.0,0.0,-0.460582,0.0,0.0,1,0.0,0.0,0.0,-0.635212,-1.150425,0.692323,-0.211566,0.0,1.390443,-1.02112,-1.066539,0.655249,0
4,-0.10076,0.0,0.0,6.222258,0.0,0.0,1,0.0,0.0,0.0,1.42407,1.006247,-1.68284,-0.286907,0.0,0.18356,-0.098504,1.155817,-1.6509,0


In [19]:
# Remove the 'is_fraud' column so that only the feature columns remain
fraud_df = fraud_df.drop(columns=['is_fraud'])
fraud_df

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long
0,-0.439028,0.0,0.000000,-0.439041,0.0,0.0,1,0.0,0.0,0.0,-0.460523,-2.487956,0.506891,-0.278382,0.0,0.214474,-0.437295,-2.462929,0.494765
1,-1.550205,0.0,0.037037,0.033496,0.0,0.0,1,0.0,0.0,0.0,-1.726776,0.904109,1.265039,-0.287463,0.0,-0.718534,-1.554888,0.965477,1.330373
2,-1.031801,0.0,0.000000,-0.219544,0.0,0.0,0,0.0,0.0,0.0,1.271542,-2.510754,-0.874590,-0.220967,0.0,1.017458,-1.030985,-2.270476,-0.811065
3,-1.021952,0.0,0.000000,-0.460582,0.0,0.0,1,0.0,0.0,0.0,-0.635212,-1.150425,0.692323,-0.211566,0.0,1.390443,-1.021120,-1.066539,0.655249
4,-0.100760,0.0,0.000000,6.222258,0.0,0.0,1,0.0,0.0,0.0,1.424070,1.006247,-1.682840,-0.286907,0.0,0.183560,-0.098504,1.155817,-1.650900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.855272,0.0,0.000000,0.328959,0.0,0.0,0,0.0,0.0,0.0,-1.178801,0.361267,0.695297,-0.292751,0.0,-1.536275,0.854308,0.154477,0.631290
296,-1.324280,0.0,0.000000,-0.455422,0.0,0.0,0,0.0,0.0,0.0,-1.178452,0.345276,0.688426,-0.283970,0.0,-0.106004,-1.323916,0.496213,0.679470
297,-0.475769,0.0,0.000000,-0.440301,0.0,0.0,1,0.0,0.0,0.0,-0.309431,-0.470112,0.347723,-0.191640,0.0,-0.214125,-0.474093,-0.594065,0.359936
298,-0.833178,0.0,0.037037,-0.408678,0.0,0.0,0,0.0,0.0,0.0,2.003404,1.171861,-2.928419,-0.292447,0.0,-0.922037,-0.832054,1.104250,-2.856561


In [20]:
# Destination of the encoded data (relative path, resolved against the current working directory)
file_path = "sample1_encoded.csv"

# Export the DataFrame as CSV; index=False keeps the row index out of the file
fraud_df.to_csv(path_or_buf=file_path, index=False)