## Import dependencies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Import the data and perform EDA

In [3]:
# Read the training and testing CSV files. Both are comma-separated, which is
# read_csv's default delimiter, so no explicit `sep` is passed.
train_df = pd.read_csv("fraudTrain.csv")
test_df = pd.read_csv("fraudTest.csv")

The output below shows that the training set has 1,296,675 rows and 23 columns, and the testing set has 555,719 rows and 23 columns.

In [4]:
# Report (rows, columns) for each split, training first, one tuple per line.
print(train_df.shape, test_df.shape, sep="\n")

(1296675, 23)
(555719, 23)


In [5]:
# Check that both splits have identical column labels in the same order.
# This matters because a test file sometimes ships without the target column.
train_df.columns.equals(test_df.columns)

True

In [6]:
# List the column labels of the training set.
train_df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [7]:
# List the column labels of the testing set (same labels as the training set).
test_df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [8]:
# Remove the columns listed in cols_to_drop from both splits, then separate
# each split into features (X_*) and the `is_fraud` target (y_*).

cols_to_drop = ["merch_long", "merch_lat", "Unnamed: 0", "trans_num"]

# errors="ignore" keeps this cell re-runnable: train_df/test_df are reassigned
# here, so on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
train_df = train_df.drop(columns=cols_to_drop, errors="ignore")
test_df = test_df.drop(columns=cols_to_drop, errors="ignore")

# drop() returns a new frame, so later edits to X_* do not touch train_df/test_df.
X_train = train_df.drop(columns="is_fraud")
y_train = train_df["is_fraud"]

X_test = test_df.drop(columns="is_fraud")
y_test = test_df["is_fraud"]

In [9]:
# Count how many columns contain at least one missing value
# (isna is the same check as isnull).
train_df.isna().any().sum()

0

In [10]:
# True if any row is an exact duplicate of an earlier row.
train_df.duplicated().any()

False

In [11]:
# Preview the training frame after the column drop (pandas truncates the display).
train_df

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,1325376018,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1325376044,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,1325376051,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,1325376076,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,1325376186,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,Hatch,UT,84735,37.7175,-112.4777,258,Geoscientist,1961-11-24,1371816728,0
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,Tuscarora,MD,21790,39.2667,-77.5101,100,"Production assistant, television",1979-12-11,1371816739,0
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,High Rolls Mountain Park,NM,88325,32.9396,-105.8189,899,Naval architect,1967-08-30,1371816752,0
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,Manderson,SD,57756,43.3526,-102.5411,1126,Volunteer coordinator,1980-08-18,1371816816,0


In [12]:
# Split the columns of train_df by dtype: non-numeric vs. numeric.
# NOTE(review): train_df still contains the target, so `is_fraud` ends up in
# numeric_features. Filter it out before using the list to select model inputs.
categorical_features = train_df.select_dtypes(exclude=["number"]).columns.tolist()
numeric_features = train_df.select_dtypes(include=["number"]).columns.tolist()

print("Categorical Features:", categorical_features)
print()
print("Numerical Features:", numeric_features)

Categroical Features: ['trans_date_trans_time', 'merchant', 'category', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob']

Numerical Features: ['cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop', 'unix_time', 'is_fraud']


In [13]:
## Turn the categorical variables into numerical features

from sklearn.preprocessing import OrdinalEncoder

# Take the column list and the raw values from train_df/test_df rather than
# from X_train/X_test. This cell overwrites those columns in X_train/X_test
# with numeric codes, so on a re-run X_train would have no non-numeric columns
# left and the encoder would be asked to fit zero features.
cat_cols = train_df.select_dtypes(exclude=["number"]).columns

# Values that were not seen during fit are encoded as -1 instead of raising.
# NOTE(review): high-cardinality columns such as trans_date_trans_time are
# encoded as well, and any test value absent from the training data becomes -1.
# Confirm this is the intended treatment for those columns.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Fit on the training split only, then reuse the fitted mapping for the test split.
X_train[cat_cols] = ordinal_encoder.fit_transform(train_df[cat_cols])
X_test[cat_cols] = ordinal_encoder.transform(test_df[cat_cols])

In [14]:
# Preview the feature frame after encoding; the former text columns now hold numeric codes.
X_train

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time
0,0.0,2703186189652095,514.0,8.0,4.97,162.0,18.0,0.0,568.0,526.0,27.0,28654,36.0788,-81.1781,3495,370.0,779.0,1325376018
1,1.0,630423337322,241.0,4.0,107.23,309.0,157.0,0.0,435.0,612.0,47.0,99160,48.8878,-118.2105,149,428.0,607.0,1325376044
2,2.0,38859492057661,390.0,0.0,220.11,115.0,381.0,1.0,602.0,468.0,13.0,83252,42.1808,-112.2620,4154,307.0,302.0,1325376051
3,3.0,3534093764340240,360.0,2.0,45.00,163.0,463.0,1.0,930.0,84.0,26.0,59632,46.2306,-112.1138,1939,328.0,397.0,1325376076
4,4.0,375534208663984,297.0,9.0,41.96,336.0,149.0,1.0,418.0,216.0,45.0,24433,38.4207,-79.4629,99,116.0,734.0,1325376186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1274786.0,30263540414123,499.0,0.0,15.56,121.0,332.0,1.0,154.0,330.0,44.0,84735,37.7175,-112.4777,258,215.0,298.0,1371816728
1296671,1274787.0,6011149206456997,2.0,1.0,51.70,160.0,463.0,1.0,856.0,813.0,20.0,21790,39.2667,-77.5101,100,360.0,630.0,1371816739
1296672,1274788.0,3514865930894695,599.0,1.0,105.93,74.0,67.0,1.0,158.0,346.0,32.0,88325,32.9396,-105.8189,899,308.0,412.0,1371816752
1296673,1274789.0,2720012583106919,509.0,1.0,74.90,179.0,304.0,1.0,433.0,471.0,41.0,57756,43.3526,-102.5411,1126,485.0,639.0,1371816816


In [15]:
# Summary statistics for every feature column (all are numeric after encoding).
X_train.describe()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,unix_time
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,637735.0,4.17192e+17,342.8585,6.227787,70.35104,180.2624,250.5209,0.4525513,488.0344,445.2633,26.67728,48800.67,38.53762,-90.22634,88824.44,251.1734,534.2329,1349244000.0
std,367716.4,1.308806e+18,200.9519,3.913443,160.316,97.53379,136.6741,0.4977437,280.0608,258.6001,14.33098,26893.22,5.075808,13.75908,301956.4,140.1094,267.4801,12841280.0
min,0.0,60416210000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1257.0,20.0271,-165.6723,23.0,0.0,0.0,1325376000.0
25%,319473.5,180042900000000.0,165.0,3.0,9.65,94.0,138.0,0.0,252.0,224.0,15.0,26237.0,34.6205,-96.798,743.0,131.0,317.0,1338751000.0
50%,638093.0,3521417000000000.0,346.0,6.0,47.52,183.0,252.0,0.0,485.0,439.0,28.0,48174.0,39.3543,-87.4769,2456.0,251.0,565.0,1349250000.0
75%,955484.5,4642255000000000.0,514.0,10.0,83.14,257.0,370.0,1.0,720.0,677.0,38.0,72042.0,41.9404,-80.158,20328.0,374.0,756.0,1359385000.0
max,1274790.0,4.992346e+18,692.0,13.0,28948.9,351.0,480.0,1.0,982.0,893.0,50.0,99783.0,66.6933,-67.9503,2906700.0,493.0,967.0,1371817000.0


In [16]:
# Inspect the raw unix_time column (integer values).
X_train["unix_time"]

0          1325376018
1          1325376044
2          1325376051
3          1325376076
4          1325376186
              ...    
1296670    1371816728
1296671    1371816739
1296672    1371816752
1296673    1371816816
1296674    1371816817
Name: unix_time, Length: 1296675, dtype: int64

# EDA on the raw (re-loaded) training data

In [17]:
# Reload the untouched training CSV into `df`; train_df has already had
# columns dropped, so it no longer matches the raw file.
df = pd.read_csv("fraudTrain.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [18]:
# Column dtypes and non-null counts for the raw training data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [19]:
# Summary statistics for the numeric columns of the raw training data.
df.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [20]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [22]:
# Per-column flag: True if the column holds at least one missing value.
df.isna().any()

Unnamed: 0               False
trans_date_trans_time    False
cc_num                   False
merchant                 False
category                 False
amt                      False
first                    False
last                     False
gender                   False
street                   False
city                     False
state                    False
zip                      False
lat                      False
long                     False
city_pop                 False
job                      False
dob                      False
trans_num                False
unix_time                False
merch_lat                False
merch_long               False
is_fraud                 False
dtype: bool

The two cells above show that the data contains no null values.

In [21]:
# Class balance: the proportion (0-1, not a percentage) of rows for each is_fraud value.
df["is_fraud"].value_counts(normalize=True) 

is_fraud
0    0.994211
1    0.005789
Name: proportion, dtype: float64

The cell above shows that about 99.4% of the transactions are labelled non-fraudulent and only about 0.6% are labelled fraudulent, so the classes are heavily imbalanced.