### Data Information

In [5]:
import pandas as pd
import numpy as np

# Load the transaction table from the CSV in the working directory.
df = pd.read_csv("fraudTest.csv")

# DataFrame.info() prints its summary and returns None, so it is called as a
# statement. Packing it into a tuple with head() displayed a stray `None` and
# forced the preview into a wrapped plain-text repr.
df.info()

# A bare last expression gets the notebook's rich DataFrame rendering.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

(None,
    Unnamed: 0 trans_date_trans_time            cc_num  \
 0           0   2020-06-21 12:14:25  2291163933867244   
 1           1   2020-06-21 12:14:33  3573030041201292   
 2           2   2020-06-21 12:14:53  3598215285024754   
 3           3   2020-06-21 12:15:15  3591919803438423   
 4           4   2020-06-21 12:15:17  3526826139003047   
 
                                merchant        category    amt   first  \
 0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
 1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
 2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
 3                     fraud_Haley Group        misc_pos  60.05   Brian   
 4                 fraud_Johnston-Casper          travel   3.19  Nathan   
 
        last gender                       street  ...      lat      long  \
 0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
 1  Williams      F             3638 Ma

### Preprocessing Data

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Columns excluded from the feature set before modelling.
columns_to_drop = ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
                   'street', 'city', 'state', 'zip', 'dob', 'trans_num']
df_cleaned = df.drop(columns=columns_to_drop)

# Integer-encode every remaining string (object-dtype) column.
# A separate fitted encoder is kept per column: refitting one shared instance
# would discard every mapping except the last, making it impossible to invert
# the codes or apply the same encoding to new data later.
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le  # e.g. label_encoders['category'].inverse_transform(codes)

# Features are everything except the target column.
X = df_cleaned.drop(columns='is_fraud')
y = df_cleaned['is_fraud']

# stratify=y preserves the class proportions of `is_fraud` in both splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.value_counts(), y_test.value_counts()

((444575, 11),
 (111144, 11),
 is_fraud
 0    442859
 1      1716
 Name: count, dtype: int64,
 is_fraud
 0    110715
 1       429
 Name: count, dtype: int64)

### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit a logistic regression on the training split. class_weight='balanced'
# reweights the classes inversely to their frequency in y_train.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split.
y_pred_lr = lr.predict(X_test)

# Keep the metrics as objects (dict + ndarray) and display both as the cell result.
report_lr = classification_report(y_test, y_pred_lr, output_dict=True)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
(report_lr, conf_matrix_lr)

({'0': {'precision': 0.9989500368905957,
   'recall': 0.953863523461139,
   'f1-score': 0.9758863019964608,
   'support': 110715.0},
  '1': {'precision': 0.0586067084408404,
   'recall': 0.7412587412587412,
   'f1-score': 0.10862510674637062,
   'support': 429.0},
  'accuracy': 0.9530428993018066,
  'macro avg': {'precision': 0.528778372665718,
   'recall': 0.8475611323599401,
   'f1-score': 0.5422557043714157,
   'support': 111144.0},
  'weighted avg': {'precision': 0.9953204456584558,
   'recall': 0.9530428993018066,
   'f1-score': 0.9725387973829657,
   'support': 111144.0}},
 array([[105607,   5108],
        [   111,    318]]))

### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a class-weighted decision tree on the training split; the fixed seed
# keeps the fitted tree reproducible.
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced').fit(X_train, y_train)

# Score the held-out split, then print the text report followed by the
# confusion matrix.
y_pred_dt = dt.predict(X_test)
report_dt_text = classification_report(y_test, y_pred_dt)
conf_matrix_dt = confusion_matrix(y_test, y_pred_dt)
print("Decision Tree:\n", report_dt_text)
print("Confusion Matrix:\n", conf_matrix_dt)

Decision Tree:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.67      0.62      0.64       429

    accuracy                           1.00    111144
   macro avg       0.84      0.81      0.82    111144
weighted avg       1.00      1.00      1.00    111144

Confusion Matrix:
 [[110586    129]
 [   165    264]]
