In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Folder holding fraudTrain.csv and fraudTest.csv.
# Set the FRAUD_DATA_DIR environment variable to point at your own copy of the
# data; when it is unset, the original author's local folder is used as before.
DATA_DIR = Path(os.environ.get("FRAUD_DATA_DIR", r"C:\Users\85job\OneDrive\Desktop\datafolder\creditcard"))

# Load the two CSV files as the training and test frames.
train_df = pd.read_csv(DATA_DIR / "fraudTrain.csv")
test_df = pd.read_csv(DATA_DIR / "fraudTest.csv")

In [3]:
# Plain-text preview of the first five rows of the raw training frame
print(train_df.head(n=5))

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [4]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [6]:
# Separate the 'is_fraud' label column from the remaining columns for each split.
# drop() returns a new frame, so train_df / test_df themselves are left untouched.
X_train, y_train = train_df.drop(columns='is_fraud'), train_df['is_fraud']
X_test, y_test = test_df.drop(columns='is_fraud'), test_df['is_fraud']

In [7]:
def _expand_transaction_time(frame):
    """Return a new frame where 'trans_date_trans_time' is replaced by its parts.

    The timestamp column is parsed with pd.to_datetime, removed from the frame,
    and four columns are appended in this order: trans_year, trans_month,
    trans_day, trans_hour.
    """
    stamps = pd.to_datetime(frame['trans_date_trans_time'])
    return frame.drop(columns='trans_date_trans_time').assign(
        trans_year=stamps.dt.year,
        trans_month=stamps.dt.month,
        trans_day=stamps.dt.day,
        trans_hour=stamps.dt.hour,
    )


# Apply the same timestamp expansion to both splits
X_train = _expand_transaction_time(X_train)
X_test = _expand_transaction_time(X_test)

In [8]:
# Stack the train and test features so the encoder sees every value that
# occurs in either split (transform() would otherwise fail on unseen labels).
combined_df = pd.concat([X_train, X_test])

categorical_cols = ['merchant', 'category', 'gender', 'job', 'city', 'state']

# One encoder object is reused: it is re-fitted for each column, so after the
# loop it only holds the mapping of the last column processed.
label_enc = LabelEncoder()
for col in categorical_cols:
    label_enc.fit(combined_df[col].astype(str))
    # Replace the string column with its integer codes in both splits
    for frame in (X_train, X_test):
        frame[col] = label_enc.transform(frame[col].astype(str))

In [9]:
# Standardise the continuous columns. The scaler's statistics are learned from
# the training split only and then applied unchanged to both splits.
numerical_features = ['amt', 'city_pop', 'lat', 'long', 'merch_lat', 'merch_long']
scaler = StandardScaler().fit(X_train[numerical_features])

for frame in (X_train, X_test):
    frame[numerical_features] = scaler.transform(frame[numerical_features])

In [10]:
def _replace_dob_with_age(frame):
    """Return a new frame where 'dob' is replaced by an 'age' column.

    'dob' is parsed with pd.to_datetime and age is computed as 2024 minus the
    birth year; 'age' is appended as the last column.
    """
    # NOTE(review): the reference year is a fixed 2024, not the year of the
    # transaction itself -- confirm this is intended.
    birth_year = pd.to_datetime(frame['dob']).dt.year
    return frame.drop(columns='dob').assign(age=2024 - birth_year)


# Derive the age feature for both splits
X_train = _replace_dob_with_age(X_train)
X_test = _replace_dob_with_age(X_test)

In [11]:
# Remove the saved index column, the card number and the transaction id
# from the feature frames of both splits.
identifier_cols = ['Unnamed: 0', 'cc_num', 'trans_num']
X_train = X_train.drop(columns=identifier_cols)
X_test = X_test.drop(columns=identifier_cols)

In [12]:
# Remove the first name, last name and street address columns from both
# splits in a single drop per frame.
personal_cols = ['first', 'last', 'street']
X_train = X_train.drop(columns=personal_cols)
X_test = X_test.drop(columns=personal_cols)

In [13]:
# List the dtype of every remaining feature column to check that all are numeric
feature_dtypes = X_test.dtypes
print(feature_dtypes)

merchant         int32
category         int32
amt            float64
gender           int32
city             int32
state            int32
zip              int64
lat            float64
long           float64
city_pop       float64
job              int32
unix_time        int64
merch_lat      float64
merch_long     float64
trans_year       int64
trans_month      int64
trans_day        int64
trans_hour       int64
age              int64
dtype: object


In [14]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier for the fraud label.
# - class_weight='balanced' reweights classes inversely to their frequency in y_train.
# - random_state=42 fixes the seed so the fitted forest is reproducible.
# - n_jobs=-1 builds the trees (and later predicts) on all available cores;
#   it only affects speed, not the fitted model.
clf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

In [15]:
# Hard class predictions for the test split from the fitted forest
y_pred = clf.predict(X=X_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix


# Per-class precision, recall and F1 of the predictions against the test labels
report = classification_report(y_test, y_pred)
print(report)


# Confusion matrix: rows are true classes, columns are predicted classes
matrix = confusion_matrix(y_test, y_pred)
print(matrix)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       1.00      0.51      0.67      2145

    accuracy                           1.00    555719
   macro avg       1.00      0.75      0.84    555719
weighted avg       1.00      1.00      1.00    555719

[[553570      4]
 [  1058   1087]]


In [17]:
# Take the predicted probability of class 1 (second column of predict_proba)
class_probabilities = clf.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Flag a transaction as class 1 once its probability reaches 0.3, a lower
# cut-off than the one behind clf.predict's labels.
# NOTE(review): the cut-off is assessed on the same test split it is reported
# on -- confirm whether it should be tuned on a separate validation split.
threshold = 0.3
y_pred_new = (y_proba >= threshold).astype(int)

print(classification_report(y_test, y_pred_new))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.95      0.71      0.81      2145

    accuracy                           1.00    555719
   macro avg       0.98      0.85      0.91    555719
weighted avg       1.00      1.00      1.00    555719

