In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./Dataset/preprocessed_dataset_v2.csv", index_col=0)
df.head()


Unnamed: 0,Year,Month,Day,Hour,Minute,Payment Method,Amount,Category,Is Fraud?
0,2002,9,1,6,21,on-site,134.09,Personal Spending,No
1,2002,9,1,6,42,on-site,38.48,Food/Groceries,No
2,2002,9,2,6,22,on-site,120.34,Food/Groceries,No
3,2002,9,2,17,45,on-site,128.95,Clothing,No
4,2002,9,3,6,23,on-site,104.71,Healthcare,No


In [3]:
df.dtypes

Year                int64
Month               int64
Day                 int64
Hour                int64
Minute              int64
Payment Method     object
Amount            float64
Category           object
Is Fraud?          object
dtype: object

In [4]:
# convert all int64 to int32 and float64 to float 32
import pandas as pd

# Assuming df is your DataFrame
df = df.astype({col: 'int32' for col in df.select_dtypes(include='int64').columns})
df = df.astype({col: 'float32' for col in df.select_dtypes(include='float64').columns})


In [5]:
df.dtypes

Year                int32
Month               int32
Day                 int32
Hour                int32
Minute              int32
Payment Method     object
Amount            float32
Category           object
Is Fraud?          object
dtype: object

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Prepare the df
X = df.drop(columns=['Is Fraud?'])
y = df['Is Fraud?'].apply(lambda x: 1 if x == 'Yes' else 0)

# Identify categorical columns for one-hot encoding
categorical_features = ['Payment Method', 'Category']  # Add any other necessary columns here

# Create a column transformer with one-hot encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

# Create a pipeline with the preprocessor and standard scaler
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

# Split the df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess and standardize the df
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

# Convert to sparse matrices to save memory
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Train and evaluate models
best_model = None
best_accuracy = 0

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model_name

print(f"Best Model: {best_model} with accuracy of {best_accuracy}")


Logistic Regression Accuracy: 0.9987876300344397
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   4793874
           1       0.29      0.00      0.00      5816

    accuracy                           1.00   4799690
   macro avg       0.64      0.50      0.50   4799690
weighted avg       1.00      1.00      1.00   4799690

