In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset
train_data_path = "D:/MachineLearning/CreditCardFraudDetection/data/fraudTrain.csv"  
df = pd.read_csv(train_data_path)

print(df.head())

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [2]:
print("dataset shape:",df.shape)
print("Columns:",df.columns)

dataset shape: (1296675, 23)
Columns: Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [3]:
# Check for missing values
print("Missing Values:\n", df.isnull().sum())


Missing Values:
 Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64


In [4]:
# Convert 'trans_date_trans_time' (timestamp) to numeric Unix time
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Convert to Unix time (in seconds)
df['trans_date_trans_time'] = df['trans_date_trans_time'].values.astype('int64') // 10**9

# Convert categorical columns (e.g., 'merchant', 'category', 'gender') to numeric using Label Encoding
from sklearn.preprocessing import LabelEncoder

# List of categorical columns
categorical_columns = ['merchant', 'category', 'gender', 'street', 'city', 'state', 'job', 'zip']

# Apply Label Encoding to each categorical column
label_encoder = LabelEncoder()
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Now, check the first few rows of the dataset after encoding
print(df.head())


   Unnamed: 0  trans_date_trans_time            cc_num  merchant  category  \
0           0             1546300818  2703186189652095       514         8   
1           1             1546300844      630423337322       241         4   
2           2             1546300851    38859492057661       390         0   
3           3             1546300876  3534093764340240       360         2   
4           4             1546300986   375534208663984       297         9   

      amt      first     last  gender  street  ...      lat      long  \
0    4.97   Jennifer    Banks       0     568  ...  36.0788  -81.1781   
1  107.23  Stephanie     Gill       0     435  ...  48.8878 -118.2105   
2  220.11     Edward  Sanchez       1     602  ...  42.1808 -112.2620   
3   45.00     Jeremy    White       1     930  ...  46.2306 -112.1138   
4   41.96      Tyler   Garcia       1     418  ...  38.4207  -79.4629   

   city_pop  job         dob                         trans_num   unix_time  \
0      3495  3

In [5]:
# Check columns with non-numeric values
non_numeric_columns = df.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['first', 'last', 'dob', 'trans_num'], dtype='object')


In [6]:
# Import the required library for Label Encoding
from sklearn.preprocessing import LabelEncoder

# Apply Label Encoding to each categorical column
label_encoder = LabelEncoder()

# Loop through each non-numeric column and apply Label Encoding
for column in non_numeric_columns:
    df[column] = label_encoder.fit_transform(df[column])

# Check the first few rows after encoding
print(df.head())

   Unnamed: 0  trans_date_trans_time            cc_num  merchant  category  \
0           0             1546300818  2703186189652095       514         8   
1           1             1546300844      630423337322       241         4   
2           2             1546300851    38859492057661       390         0   
3           3             1546300876  3534093764340240       360         2   
4           4             1546300986   375534208663984       297         9   

      amt  first  last  gender  street  ...      lat      long  city_pop  job  \
0    4.97    162    18       0     568  ...  36.0788  -81.1781      3495  370   
1  107.23    309   157       0     435  ...  48.8878 -118.2105       149  428   
2  220.11    115   381       1     602  ...  42.1808 -112.2620      4154  307   
3   45.00    163   463       1     930  ...  46.2306 -112.1138      1939  328   
4   41.96    336   149       1     418  ...  38.4207  -79.4629        99  116   

   dob  trans_num   unix_time  merch_lat  me

In [7]:
# Import the required library for SMOTE
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df.drop('is_fraud', axis=1)  # Features (excluding 'is_fraud')
y = df['is_fraud']  # Target (the 'is_fraud' column)

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check the new class distribution
print("Class distribution after SMOTE:\n", y_res.value_counts())


Class distribution after SMOTE:
 is_fraud
0    1289169
1    1289169
Name: count, dtype: int64


In [8]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Check the shape of the split data
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (2062670, 22)
Testing set shape: (515668, 22)


In [9]:
# Import Logistic Regression and evaluation metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67    257186
           1       0.00      0.00      0.00    258482

    accuracy                           0.50    515668
   macro avg       0.25      0.50      0.33    515668
weighted avg       0.25      0.50      0.33    515668

Confusion Matrix:
 [[257186      0]
 [258482      0]]


In [10]:
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("Classification Report for Random Forest:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix for Random Forest:\n", confusion_matrix(y_test, y_pred_rf))

Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257186
           1       1.00      1.00      1.00    258482

    accuracy                           1.00    515668
   macro avg       1.00      1.00      1.00    515668
weighted avg       1.00      1.00      1.00    515668

Confusion Matrix for Random Forest:
 [[256553    633]
 [    59 258423]]


In [12]:
# Import necessary libraries
import joblib

# Save the trained Random Forest model
joblib.dump(rf_model, 'fraud_detection.pkl')

['fraud_detection.pkl']