## Data Preparation

Step 1. Load the data.

In [150]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import kagglehub
import findspark

import pyspark
from pyspark.sql import SparkSession


In [151]:
# Define path to Kagglehub dataset

path = kagglehub.dataset_download("kartik2112/fraud-detection")

In [107]:
fraud_test_csv = r"C:\Users\Jean\.cache\kagglehub\datasets\kartik2112\fraud-detection\versions\1\fraudTest.csv"
fraud_train_csv = r"C:\Users\Jean\.cache\kagglehub\datasets\kartik2112\fraud-detection\versions\1\fraudTrain.csv"

In [108]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
fraud_test_df_complete = pd.read_csv(fraud_test_csv)
fraud_train_df_complete = pd.read_csv(fraud_train_csv)


In [87]:
# Check the fraud_test_df_complete information

fraud_test_df_complete.info()
fraud_test_df_complete.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,277859.0,4.178387e+17,69.39281,48842.628015,38.543253,-90.231325,88221.89,1380679000.0,38.542798,-90.23138,0.00386
std,160422.401459,1.309837e+18,156.745941,26855.283328,5.061336,13.72178,300390.9,5201104.0,5.095829,13.733071,0.062008
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1371817000.0,19.027422,-166.671575,0.0
25%,138929.5,180042900000000.0,9.63,26292.0,34.6689,-96.798,741.0,1376029000.0,34.755302,-96.905129,0.0
50%,277859.0,3521417000000000.0,47.29,48174.0,39.3716,-87.4769,2408.0,1380762000.0,39.376593,-87.445204,0.0
75%,416788.5,4635331000000000.0,83.01,72011.0,41.8948,-80.1752,19685.0,1385867000.0,41.954163,-80.264637,0.0
max,555718.0,4.992346e+18,22768.11,99921.0,65.6899,-67.9503,2906700.0,1388534000.0,66.679297,-66.952026,1.0


In [109]:
# Check the fraud_train_df_complete information

fraud_train_df_complete.info()
fraud_train_df_complete.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [135]:
# Drop unwanted columns (also dropped merchant)
# fraud_test_df = fraud_test_df_complete[["trans_date_trans_time", "amt", "zip", "merch_lat", "merch_long", "is_fraud"]]
fraud_train_df = fraud_train_df_complete[["amt", "is_fraud"]]
# fraud_train_df = fraud_train_df_complete[["trans_date_trans_time", "amt", "zip", "merch_lat", "merch_long", "is_fraud"]]

In [130]:
# Check the fraud_test_df information

fraud_test_df.info()
fraud_test_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   amt       555719 non-null  float64
 1   is_fraud  555719 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 8.5 MB


Unnamed: 0,amt,is_fraud
count,555719.0,555719.0
mean,69.39281,0.00386
std,156.745941,0.062008
min,1.0,0.0
25%,9.63,0.0
50%,47.29,0.0
75%,83.01,0.0
max,22768.11,1.0


In [126]:
# Create time-based features
# fraud_train_df['trans_date_trans_time'] = pd.to_datetime(fraud_train_df['trans_date_trans_time'])
fraud_test_df['trans_date_trans_time'] = pd.to_datetime(fraud_test_df['trans_date_trans_time'])

# Extract useful time features
def extract_time_features(df):
    # Basic time features (keeping existing ones)
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day'] = df['trans_date_trans_time'].dt.day
    df['month'] = df['trans_date_trans_time'].dt.month
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    
    # Add new time features
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    
    # Time of day categories (0: night, 1: morning, 2: afternoon, 3: evening)
    df['time_period'] = pd.cut(df['hour'], 
                              bins=[-1, 6, 12, 18, 24], 
                              labels=[0, 1, 2, 3])
    
    # Is it business hours (9 AM to 5 PM)
    df['is_business_hours'] = df['hour'].between(9, 17).astype(int)
    
    return df

# fraud_train_df = extract_time_features(fraud_train_df)
fraud_test_df = extract_time_features(fraud_test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fraud_test_df['trans_date_trans_time'] = pd.to_datetime(fraud_test_df['trans_date_trans_time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['trans_date_trans_time'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = df['trans_date_trans_time'].dt.day
A value is tr

In [131]:
# Check the fraud_test_df information

fraud_test_df.info()
fraud_test_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   amt       555719 non-null  float64
 1   is_fraud  555719 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 8.5 MB


Unnamed: 0,amt,is_fraud
count,555719.0,555719.0
mean,69.39281,0.00386
std,156.745941,0.062008
min,1.0,0.0
25%,9.63,0.0
50%,47.29,0.0
75%,83.01,0.0
max,22768.11,1.0


In [128]:
fraud_test_df.head()


Unnamed: 0,trans_date_trans_time,amt,zip,merch_lat,merch_long,is_fraud,hour,day,month,day_of_week,is_weekend,time_period,is_business_hours
0,2020-06-21 12:14:25,2.86,29209,33.986391,-81.200714,0,12,21,6,6,1,1,1
1,2020-06-21 12:14:33,29.84,84002,39.450498,-109.960431,0,12,21,6,6,1,1,1
2,2020-06-21 12:14:53,41.28,11710,40.49581,-74.196111,0,12,21,6,6,1,1,1
3,2020-06-21 12:15:15,60.05,32780,28.812398,-80.883061,0,12,21,6,6,1,1,1
4,2020-06-21 12:15:17,3.19,49632,44.959148,-85.884734,0,12,21,6,6,1,1,1


In [136]:
# Check the fraud_train_df information

fraud_train_df.info()
fraud_train_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   amt       1296675 non-null  float64
 1   is_fraud  1296675 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 19.8 MB


Unnamed: 0,amt,is_fraud
count,1296675.0,1296675.0
mean,70.35104,0.005788652
std,160.316,0.07586269
min,1.0,0.0
25%,9.65,0.0
50%,47.52,0.0
75%,83.14,0.0
max,28948.9,1.0


Step 2. Create the labels set (y) from the “is_fraud” column, and then create the features (X) DataFrame from the remaining columns. Note: the data is already split into separate train and test datasets.

In [None]:
# Separate the data into labels and features

# Separate the y variable, the labels
#y_train = fraud_train_df['is_fraud']
#y_test = fraud_test_df['is_fraud']

# Separate the X variable, the features
#X_train = fraud_train_df.drop(columns='is_fraud')
#X_test = fraud_test_df.drop(columns='is_fraud')

In [137]:
# Testing a run to only use the fraud_train_df data to create the model.
# Separate the data into labels and features

# Separate the y variable, the labels
y = fraud_train_df['is_fraud']

# Separate the X variable, the features
X = fraud_train_df.drop(columns='is_fraud')

In [138]:
# Review the y variable Series
y.head()

0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64

In [139]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,amt
0,4.97
1,107.23
2,220.11
3,45.0
4,41.96


In [141]:
# For X Identify the columns that need to be preprocessed
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [142]:
# Pre-process the X_train data.
# Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical columns
        ('cat', OneHotEncoder(), categorical_cols)   # Encode categorical columns
    ])

# Fit and transform the data
X = preprocessor.fit_transform(X)

In [143]:
# Step 3: Split the data into training and testing datasets by using train_test_split.
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [74]:
# Review the y_train variable Series
# y_train.head()

In [None]:
# Review the y_test variable Series
# y_test.head()

0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64

In [None]:
# For X_train Identify the columns that need to be preprocessed
# categorical_cols = X_train.select_dtypes(include=['object']).columns
# numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns


In [None]:
# Pre-process the X_train data.
# Create a ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_cols),  # Scale numerical columns
#         ('cat', OneHotEncoder(), categorical_cols)   # Encode categorical columns
#     ])

# Fit and transform the data
# X_train = preprocessor.fit_transform(X_train)

In [None]:
# For X_test Identify the columns that need to be preprocessed
# categorical_cols = X_test.select_dtypes(include=['object']).columns
# numerical_cols = X_test.select_dtypes(include=['int64', 'float64']).columns


In [None]:
# Pre-process the X_test data.
# Create a ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_cols),  # Scale numerical columns
#         ('cat', OneHotEncoder(), categorical_cols)   # Encode categorical columns
#     ])

# Fit and transform the data
# X_test = preprocessor.fit_transform(X_test)

In [144]:
# Review the X variable DataFrame
X_train

array([[-0.05483568],
       [-0.14297412],
       [-0.43053123],
       ...,
       [ 0.64272424],
       [ 0.34025907],
       [-0.41337761]])

In [145]:
# Review the X variable DataFrame
X_test

array([[-0.31407372],
       [ 1.33816335],
       [-0.40720231],
       ...,
       [-0.41549842],
       [-0.293427  ],
       [-0.4104459 ]])

## Create a Logistic Regression Model with the Original Data

Step 1: Fit a logistic regression model by using the training data (X_train and y_train).

In [146]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [147]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)

predictions_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test})

predictions_df.head()

Unnamed: 0,Prediction,Actual
94102,0,0
198791,0,0
1238587,0,0
619078,0,0
573850,0,0


Step 3: Evaluate the model’s performance by doing the following:
* Generate a confusion matrix.

* Print the classification report.

In [148]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

print("Confusion Matrix")
cm_df

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,322191,197
Actual 1,1781,0


In [149]:
# Print the classification report for the model
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    322388
           1       0.00      0.00      0.00      1781

    accuracy                           0.99    324169
   macro avg       0.50      0.50      0.50    324169
weighted avg       0.99      0.99      0.99    324169

