# Importing the libraries

In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
print(tf.__version__)

2.14.0


# Importing the dataset

In [3]:
# Dataset with all columns
dataset = pd.read_csv("Dataset/credit_card.csv")
dataset

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:44,"Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0
1,2019-01-01 00:00:51,Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0
2,2019-01-01 00:07:27,Kiehn Inc,grocery_pos,96.29,Grenada,CA,41.6125,-122.5258,589,Systems analyst,1945-12-21,413636e759663f264aae1819a4d4f231,41.657520,-122.230347,0
3,2019-01-01 00:09:03,Beier-Hyatt,shopping_pos,7.77,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,1967-08-30,8a6293af5ed278dea14448ded2685fea,32.863258,-106.520205,0
4,2019-01-01 00:21:32,Bruen-Yost,misc_pos,6.85,Freedom,WY,43.0172,-111.0292,471,"Education officer, museum",1967-08-02,f3c43d336e92a44fc2fb67058d5949e3,43.753735,-111.454923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339602,2020-12-31 23:57:56,Schmidt-Larkin,home,12.68,Wales,AK,64.7556,-165.6723,145,"Administrator, education",1939-11-09,a8310343c189e4a5b6316050d2d6b014,65.623593,-165.186033,0
339603,2020-12-31 23:58:04,"Pouros, Walker and Spence",kids_pets,13.02,Greenview,CA,41.5403,-122.9366,308,Call centre manager,1958-09-20,bd7071fd5c9510a5594ee196368ac80e,41.973127,-123.553032,0
339604,2020-12-31 23:59:07,Reilly and Sons,health_fitness,43.77,Luray,MO,40.4931,-91.8912,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,39.946837,-91.333331,0
339605,2020-12-31 23:59:15,Rau-Robel,kids_pets,86.88,Burbank,WA,46.1966,-118.9017,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,46.658340,-119.715054,0


# Data Exploration and Understanding

In [4]:
# Explore the data types of the columns
dataset.dtypes

trans_date_trans_time     object
merchant                  object
category                  object
amt                      float64
city                      object
state                     object
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [5]:
dataset.describe()

Unnamed: 0,amt,lat,long,city_pop,merch_lat,merch_long,is_fraud
count,339607.0,339607.0,339607.0,339607.0,339607.0,339607.0,339607.0
mean,70.577984,39.718991,-110.622605,107140.9,39.718853,-110.622383,0.005247
std,161.675242,5.094961,12.65137,293029.9,5.130894,12.663998,0.072248
min,1.0,20.0271,-165.6723,46.0,19.027422,-166.671575,0.0
25%,9.6,36.7154,-120.0936,471.0,36.817194,-119.823755,0.0
50%,46.46,39.6171,-111.0985,1645.0,39.586209,-111.036443,0.0
75%,83.35,41.71,-100.6215,35439.0,42.193072,-100.353096,0.0
max,28948.9,66.6933,-89.6287,2383912.0,67.510267,-88.629203,1.0


In [6]:
#To gets the columns that are most correlated with the class column
corr = dataset.corr(numeric_only=True)['is_fraud'].sort_values
corr


<bound method Series.sort_values of amt           0.201023
lat           0.009239
long         -0.004494
city_pop      0.002338
merch_lat     0.008848
merch_long   -0.004550
is_fraud      1.000000
Name: is_fraud, dtype: float64>

The 'amt' feature appears to have a positive correlation with 'is_fraud,' suggesting that higher transaction amounts might be associated with a higher likelihood of fraud.

The 'city_pop' feature has a very weak correlation with 'is_fraud,' suggesting it may not be a strong predictor.

In [7]:
from scipy.stats import chi2_contingency

# Select the categorical columns with 'object' data type
categorical_columns = dataset.select_dtypes(include=['object']).columns

# Create a dictionary to store chi-square statistics and p-values
chi2_results = {}

# Perform Chi-Square test for each categorical column
for col in categorical_columns:
    contingency_table = pd.crosstab(dataset[col], dataset['is_fraud'])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    chi2_results[col] = {'Chi-Square': chi2, 'p-value': p}

# Create a DataFrame from the results
chi2_results_df = pd.DataFrame(chi2_results).T

# Sort by p-value (smaller p-values indicate stronger association)
chi2_results_df.sort_values(by='p-value', ascending=True, inplace=True)

# Print the results
print(chi2_results_df)

                          Chi-Square        p-value
city                    31570.202091   0.000000e+00
job                     29197.252678   0.000000e+00
dob                     37423.425681   0.000000e+00
category                 1417.008486  3.323232e-295
merchant                 2144.122007  4.166961e-148
state                     130.571299   4.739550e-22
trans_date_trans_time  338808.743360   3.548401e-01
trans_num              339607.000000   4.991932e-01


Columns like 'city,' 'job,' 'dob,' 'category,' 'state,' and 'merchant' have very low p-values close to zero, indicating a strong association with the target variable ('isFraud').

The 'trans_date_trans_time' column has a p-value of approximately 0.354, indicating that it may not be strongly associated with the target variable.

The 'trans_num' column has a p-value of approximately 0.499, indicating no significant association with the target variable.

In [8]:
# To explore a particular column
dataset['merchant'].describe()

count          339607
unique            693
top       Kilback LLC
freq             1149
Name: merchant, dtype: object

In [9]:
# To explore a particular column
dataset['category'].describe()

count            339607
unique               14
top       gas_transport
freq              35089
Name: category, dtype: object

In [10]:
# To explore a particular column
dataset['city'].describe()

count      339607
unique        176
top       Phoenix
freq         7297
Name: city, dtype: object

In [11]:
# To explore a particular column
dataset['state'].describe()

count     339607
unique        13
top           CA
freq       80495
Name: state, dtype: object

In [12]:
# To explore a particular column
dataset['job'].describe()

count                 339607
unique                   163
top       Surveyor, minerals
freq                    6589
Name: job, dtype: object

In [13]:
# It returns the unique values in the column as an array.
pd.unique(dataset['category'].values)

array(['grocery_pos', 'entertainment', 'shopping_pos', 'misc_pos',
       'shopping_net', 'gas_transport', 'misc_net', 'grocery_net',
       'food_dining', 'health_fitness', 'kids_pets', 'home',
       'personal_care', 'travel'], dtype=object)

# Data Cleaning

### (1) Removing irrelevant columns

In [14]:
# List of columns we want to keep
desired_columns = ['trans_date_trans_time', 'merchant', 'category', 'amt', 'state', 'city', 'job', 'dob', 'is_fraud']

# Filter the DataFrame to keep only the desired columns
filtered = dataset[desired_columns]

# 'filtered' now contains only the specified columns, and the rest are removed
filtered

Unnamed: 0,trans_date_trans_time,merchant,category,amt,state,city,job,dob,is_fraud
0,2019-01-01 00:00:44,"Heller, Gutmann and Zieme",grocery_pos,107.23,WA,Orient,Special educational needs teacher,1978-06-21,0
1,2019-01-01 00:00:51,Lind-Buckridge,entertainment,220.11,ID,Malad City,Nature conservation officer,1962-01-19,0
2,2019-01-01 00:07:27,Kiehn Inc,grocery_pos,96.29,CA,Grenada,Systems analyst,1945-12-21,0
3,2019-01-01 00:09:03,Beier-Hyatt,shopping_pos,7.77,NM,High Rolls Mountain Park,Naval architect,1967-08-30,0
4,2019-01-01 00:21:32,Bruen-Yost,misc_pos,6.85,WY,Freedom,"Education officer, museum",1967-08-02,0
...,...,...,...,...,...,...,...,...,...
339602,2020-12-31 23:57:56,Schmidt-Larkin,home,12.68,AK,Wales,"Administrator, education",1939-11-09,0
339603,2020-12-31 23:58:04,"Pouros, Walker and Spence",kids_pets,13.02,CA,Greenview,Call centre manager,1958-09-20,0
339604,2020-12-31 23:59:07,Reilly and Sons,health_fitness,43.77,MO,Luray,Town planner,1966-02-13,0
339605,2020-12-31 23:59:15,Rau-Robel,kids_pets,86.88,WA,Burbank,Musician,1981-11-29,0


In [15]:
# filtered = dataset.drop(['city_pop',lat', 'long', 'trans_num', 'merch_long', 'merch_lat'], axis=1)
# filtered

### (2) Handling missing data

In [16]:
# Use the “info()” function to have an idea about null columns.
filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  339607 non-null  object 
 1   merchant               339607 non-null  object 
 2   category               339607 non-null  object 
 3   amt                    339607 non-null  float64
 4   state                  339607 non-null  object 
 5   city                   339607 non-null  object 
 6   job                    339607 non-null  object 
 7   dob                    339607 non-null  object 
 8   is_fraud               339607 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 23.3+ MB


In [17]:
# Check the null values count in the filtered dataset.
filtered.isnull().sum()

trans_date_trans_time    0
merchant                 0
category                 0
amt                      0
state                    0
city                     0
job                      0
dob                      0
is_fraud                 0
dtype: int64

### (3) Handling duplicate data 

In [18]:
# Check for duplicate rows
duplicate_rows = filtered[filtered.duplicated()]

# Print the duplicate rows
print("Duplicate Rows:")
print(duplicate_rows)

Duplicate Rows:
Empty DataFrame
Columns: [trans_date_trans_time, merchant, category, amt, state, city, job, dob, is_fraud]
Index: []


In [19]:
# Handle duplicates
filtered.drop_duplicates(inplace=False)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,state,city,job,dob,is_fraud
0,2019-01-01 00:00:44,"Heller, Gutmann and Zieme",grocery_pos,107.23,WA,Orient,Special educational needs teacher,1978-06-21,0
1,2019-01-01 00:00:51,Lind-Buckridge,entertainment,220.11,ID,Malad City,Nature conservation officer,1962-01-19,0
2,2019-01-01 00:07:27,Kiehn Inc,grocery_pos,96.29,CA,Grenada,Systems analyst,1945-12-21,0
3,2019-01-01 00:09:03,Beier-Hyatt,shopping_pos,7.77,NM,High Rolls Mountain Park,Naval architect,1967-08-30,0
4,2019-01-01 00:21:32,Bruen-Yost,misc_pos,6.85,WY,Freedom,"Education officer, museum",1967-08-02,0
...,...,...,...,...,...,...,...,...,...
339602,2020-12-31 23:57:56,Schmidt-Larkin,home,12.68,AK,Wales,"Administrator, education",1939-11-09,0
339603,2020-12-31 23:58:04,"Pouros, Walker and Spence",kids_pets,13.02,CA,Greenview,Call centre manager,1958-09-20,0
339604,2020-12-31 23:59:07,Reilly and Sons,health_fitness,43.77,MO,Luray,Town planner,1966-02-13,0
339605,2020-12-31 23:59:15,Rau-Robel,kids_pets,86.88,WA,Burbank,Musician,1981-11-29,0


# Feature Engineering

### (1) Extracting date and time from 'trans_date_trans_time'

In [20]:
# Convert the 'trans_date_trans_time_column' to a datetime data type
filtered = filtered.copy()
filtered['trans_date_trans_time'] = pd.to_datetime(filtered['trans_date_trans_time'])

In [21]:
# Split the datetime column into separate date and time columns
filtered['trans_date'] = filtered['trans_date_trans_time'].dt.date
filtered['trans_time'] = filtered['trans_date_trans_time'].dt.time

In [22]:
# Print the resulting DataFrame
print(filtered)

       trans_date_trans_time                   merchant        category  \
0        2019-01-01 00:00:44  Heller, Gutmann and Zieme     grocery_pos   
1        2019-01-01 00:00:51             Lind-Buckridge   entertainment   
2        2019-01-01 00:07:27                  Kiehn Inc     grocery_pos   
3        2019-01-01 00:09:03                Beier-Hyatt    shopping_pos   
4        2019-01-01 00:21:32                 Bruen-Yost        misc_pos   
...                      ...                        ...             ...   
339602   2020-12-31 23:57:56             Schmidt-Larkin            home   
339603   2020-12-31 23:58:04  Pouros, Walker and Spence       kids_pets   
339604   2020-12-31 23:59:07            Reilly and Sons  health_fitness   
339605   2020-12-31 23:59:15                  Rau-Robel       kids_pets   
339606   2020-12-31 23:59:24            Breitenberg LLC          travel   

           amt state                      city  \
0       107.23    WA                    Orient   

In [23]:
# List of columns we want to keep
desired_columns = ['trans_date', 'trans_time', 'amt', 'merchant', 'category', 'job', 'dob', 'state', 'city', 'is_fraud']

# Filter the DataFrame to keep only the desired columns
filtered = filtered[desired_columns]

# 'filtered_df' now contains only the specified columns, and the rest are removed
filtered

Unnamed: 0,trans_date,trans_time,amt,merchant,category,job,dob,state,city,is_fraud
0,2019-01-01,00:00:44,107.23,"Heller, Gutmann and Zieme",grocery_pos,Special educational needs teacher,1978-06-21,WA,Orient,0
1,2019-01-01,00:00:51,220.11,Lind-Buckridge,entertainment,Nature conservation officer,1962-01-19,ID,Malad City,0
2,2019-01-01,00:07:27,96.29,Kiehn Inc,grocery_pos,Systems analyst,1945-12-21,CA,Grenada,0
3,2019-01-01,00:09:03,7.77,Beier-Hyatt,shopping_pos,Naval architect,1967-08-30,NM,High Rolls Mountain Park,0
4,2019-01-01,00:21:32,6.85,Bruen-Yost,misc_pos,"Education officer, museum",1967-08-02,WY,Freedom,0
...,...,...,...,...,...,...,...,...,...,...
339602,2020-12-31,23:57:56,12.68,Schmidt-Larkin,home,"Administrator, education",1939-11-09,AK,Wales,0
339603,2020-12-31,23:58:04,13.02,"Pouros, Walker and Spence",kids_pets,Call centre manager,1958-09-20,CA,Greenview,0
339604,2020-12-31,23:59:07,43.77,Reilly and Sons,health_fitness,Town planner,1966-02-13,MO,Luray,0
339605,2020-12-31,23:59:15,86.88,Rau-Robel,kids_pets,Musician,1981-11-29,WA,Burbank,0


### (2)  Encode categorical variables

In [24]:
# Encode categorical variables (using one-hot encoding)
data = pd.get_dummies(filtered, columns=['city', 'state', 'job', 'merchant', 'category'])
data

Unnamed: 0,trans_date,trans_time,amt,dob,is_fraud,city_Albuquerque,city_Altonah,city_Alva,city_American Fork,city_Angwin,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2019-01-01,00:00:44,107.23,1978-06-21,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2019-01-01,00:00:51,220.11,1962-01-19,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-01,00:07:27,96.29,1945-12-21,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2019-01-01,00:09:03,7.77,1967-08-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2019-01-01,00:21:32,6.85,1967-08-02,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339602,2020-12-31,23:57:56,12.68,1939-11-09,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
339603,2020-12-31,23:58:04,13.02,1958-09-20,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
339604,2020-12-31,23:59:07,43.77,1966-02-13,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
339605,2020-12-31,23:59:15,86.88,1981-11-29,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


# Formatting data

In [25]:
# Create a dictionary with the mapping of old column names to new column names
column_mapping = {'amt': 'amount'}

# Use the 'rename()' method to rename the columns
data.rename(columns=column_mapping, inplace=True)

# 'data' now has the columns with the new names
data

Unnamed: 0,trans_date,trans_time,amount,dob,is_fraud,city_Albuquerque,city_Altonah,city_Alva,city_American Fork,city_Angwin,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2019-01-01,00:00:44,107.23,1978-06-21,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2019-01-01,00:00:51,220.11,1962-01-19,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019-01-01,00:07:27,96.29,1945-12-21,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,2019-01-01,00:09:03,7.77,1967-08-30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,2019-01-01,00:21:32,6.85,1967-08-02,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339602,2020-12-31,23:57:56,12.68,1939-11-09,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
339603,2020-12-31,23:58:04,13.02,1958-09-20,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
339604,2020-12-31,23:59:07,43.77,1966-02-13,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
339605,2020-12-31,23:59:15,86.88,1981-11-29,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [26]:
data.dtypes

trans_date                 object
trans_time                 object
amount                    float64
dob                        object
is_fraud                    int64
                           ...   
category_misc_pos           uint8
category_personal_care      uint8
category_shopping_net       uint8
category_shopping_pos       uint8
category_travel             uint8
Length: 1064, dtype: object

In [27]:
#Save to a csv
#data.to_csv("cleaned_dataset.csv",index=False)

# Handling Class Imbalance (using oversampling)

In [None]:
from imblearn.over_sampling import RandomOverSampler
X = data.drop(columns=['is_fraud'])
y = data['is_fraud']

oversampler = RandomOverSampler(sampling_strategy='minority')
X_resampled, y_resampled = oversampler.fit_resample(X, y)