In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# Folder the later cells read the fraud CSVs from and write the fitted
# scaler, models and metrics file to.
data_path = '/content/drive/MyDrive/credit card fraud'


In [None]:
import pandas as pd
import os

# Same folder as assigned in the first cell (re-assigned here).
data_path = '/content/drive/MyDrive/credit card fraud'

# Read the train and test splits from their CSV files under data_path.
train_df, test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('fraudTrain.csv', 'fraudTest.csv')
)

# Sanity report: shapes of both frames, then column names and first rows of the train frame.
print(f"Loaded datasets\nTrain shape: {train_df.shape}\nTest shape: {test_df.shape}\n")
print("Train columns:", train_df.columns.tolist())

print("\nTrain sample:")
print(train_df.head())


Loaded datasets
Train shape: (1296675, 23)
Test shape: (555719, 23)

Train columns: ['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']

Train sample:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_K

In [None]:
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import OrdinalEncoder

# Columns excluded from the feature set (row index, identifiers, name/address fields).
drop_cols = ['Unnamed: 0', 'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'job', 'dob', 'trans_num']
train_df = train_df.drop(columns=drop_cols)
test_df = test_df.drop(columns=drop_cols)

# Expand the transaction timestamp into calendar features, then discard the raw column.
# NOTE: this cell consumes its inputs (columns are dropped), so re-run the loading
# cell before executing it a second time.
for df in [train_df, test_df]:
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day'] = df['trans_date_trans_time'].dt.day
    df['weekday'] = df['trans_date_trans_time'].dt.weekday
    df['month'] = df['trans_date_trans_time'].dt.month
    df.drop(columns=['trans_date_trans_time'], inplace=True)

# Integer-encode the string columns. The encoder is fitted on the training frame only.
# A value that appears only in the test frame is mapped to -1 instead of raising
# ValueError, which is what a per-column LabelEncoder.transform would do.
# Categories seen during fit get the same sorted-order integer codes as LabelEncoder.
# The fitted encoder stays available as `category_encoder` for later reuse.
categorical_cols = ['merchant', 'category', 'gender']
category_encoder = OrdinalEncoder(
    dtype=np.int64,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
train_df[categorical_cols] = category_encoder.fit_transform(train_df[categorical_cols].astype(str))
test_df[categorical_cols] = category_encoder.transform(test_df[categorical_cols].astype(str))

print("Preprocessing done.\nTrain sample:")
print(train_df.head())


Preprocessing done.
Train sample:
   merchant  category     amt  gender      lat      long  city_pop  \
0       514         8    4.97       0  36.0788  -81.1781      3495   
1       241         4  107.23       0  48.8878 -118.2105       149   
2       390         0  220.11       1  42.1808 -112.2620      4154   
3       360         2   45.00       1  46.2306 -112.1138      1939   
4       297         9   41.96       1  38.4207  -79.4629        99   

    unix_time  merch_lat  merch_long  is_fraud  hour  day  weekday  month  
0  1325376018  36.011293  -82.048315         0     0    1        1      1  
1  1325376044  49.159047 -118.186462         0     0    1        1      1  
2  1325376051  43.150704 -112.154481         0     0    1        1      1  
3  1325376076  47.034331 -112.561071         0     0    1        1      1  
4  1325376186  38.674999  -78.632459         0     0    1        1      1  


In [None]:
# Separate the 'is_fraud' label from the remaining feature columns for each split.
X_train, y_train = train_df.drop(columns='is_fraud'), train_df['is_fraud']
X_test, y_test = test_df.drop(columns='is_fraud'), test_df['is_fraud']

print(f"Features and target split.\nX_train shape: {X_train.shape}, y_train shape: {y_train.shape}")


Features and target split.
X_train shape: (1296675, 14), y_train shape: (1296675,)


In [None]:
from sklearn.preprocessing import StandardScaler
import joblib

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so test rows do not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Save the fitted scaler into the data folder.
scaler_path = os.path.join(data_path, 'scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"Scaling done. Scaler saved to {scaler_path}")


Scaling done. Scaler saved to /content/drive/MyDrive/credit card fraud/scaler.pkl


In [None]:
from sklearn.linear_model import LogisticRegression

# Fraud rows are a very small minority of the labels, so an unweighted fit can
# reach a low loss while flagging essentially no real fraud (the notebook's own
# evaluation of this model shows 0 true positives). class_weight='balanced'
# reweights each class inversely to its frequency in y_train so that missed
# frauds are penalised more heavily during fitting.
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
log_reg.fit(X_train_scaled, y_train)  # trained on the standardized features

# Save the fitted model into the data folder.
log_reg_path = os.path.join(data_path, 'logistic_regression_model.pkl')
joblib.dump(log_reg, log_reg_path)
print(f"Logistic Regression trained and saved to {log_reg_path}")


Logistic Regression trained and saved to /content/drive/MyDrive/credit card fraud/logistic_regression_model.pkl


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the unscaled training features.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Save the fitted tree into the data folder.
dt_model_path = os.path.join(data_path, 'decision_tree_model.pkl')
joblib.dump(dt_model, dt_model_path)
print(f" Decision Tree trained and saved to {dt_model_path}")

 Decision Tree trained and saved to /content/drive/MyDrive/credit card fraud/decision_tree_model.pkl


In [None]:
from sklearn.ensemble import RandomForestClassifier

# n_jobs=-1 builds the 100 trees on all available cores instead of one.
# With random_state fixed, the fitted trees are the same as in a single-core
# run, so this only shortens training (and later prediction) time.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)  # trained on the unscaled features

# Save the fitted forest into the data folder.
rf_model_path = os.path.join(data_path, 'random_forest_model.pkl')
joblib.dump(rf_model, rf_model_path)
print(f"Random Forest trained and saved to {rf_model_path}")


Random Forest trained and saved to /content/drive/MyDrive/credit card fraud/random_forest_model.pkl


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Fitted classifiers keyed by the name used in the printed report.
models = {
    'Logistic Regression': log_reg,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
}

for name, model in models.items():
    # Logistic regression was fitted on the scaled matrix; the other two on the raw features.
    y_pred = model.predict(X_test_scaled if name == 'Logistic Regression' else X_test)

    # Test-set metrics; zero_division=0 makes precision 0 when nothing is predicted positive.
    cm = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    acc = accuracy_score(y_test, y_pred)

    print(f"\n{name} Results:")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("Confusion Matrix:")
    print(cm)



Logistic Regression Results:
Accuracy:  0.9955
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
Confusion Matrix:
[[553238    336]
 [  2145      0]]

Decision Tree Results:
Accuracy:  0.9951
Precision: 0.4202
Recall:    0.7166
F1 Score:  0.5297
Confusion Matrix:
[[551453   2121]
 [   608   1537]]

Random Forest Results:
Accuracy:  0.9982
Precision: 0.9281
Recall:    0.5837
F1 Score:  0.7167
Confusion Matrix:
[[553477     97]
 [   893   1252]]


In [None]:
import pandas as pd

# One record per model, collected into a table and written to CSV.
metrics_list = []

for name, model in models.items():
    # Logistic regression expects the scaled matrix; the other models the raw features.
    y_pred = model.predict(X_test_scaled if name == 'Logistic Regression' else X_test)

    metrics_list.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        # Nested list form of the confusion matrix array.
        'Confusion Matrix': confusion_matrix(y_test, y_pred).tolist(),
    })

metrics_df = pd.DataFrame(metrics_list)

# Write the table (without the index column) into the data folder.
metrics_path = os.path.join(data_path, 'model_metrics.csv')
metrics_df.to_csv(metrics_path, index=False)

print(f"Metrics saved to {metrics_path}")


Metrics saved to /content/drive/MyDrive/credit card fraud/model_metrics.csv
