In [1]:
import numpy
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_recall_curve

In [2]:
train_identify = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_transac = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

In [3]:
train_identify.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [4]:
train_transac.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train_identify.isnull().sum()

TransactionID         0
id_01                 0
id_02              3361
id_03             77909
id_04             77909
id_05              7368
id_06              7368
id_07            139078
id_08            139078
id_09             69307
id_10             69307
id_11              3255
id_12                 0
id_13             16913
id_14             64189
id_15              3248
id_16             14893
id_17              4864
id_18             99120
id_19              4915
id_20              4972
id_21            139074
id_22            139064
id_23            139064
id_24            139486
id_25            139101
id_26            139070
id_27            139064
id_28              3255
id_29              3255
id_30             66668
id_31              3951
id_32             66647
id_33             70944
id_34             66428
id_35              3248
id_36              3248
id_37              3248
id_38              3248
DeviceType         3423
DeviceInfo        25567
dtype: int64

In [6]:
train_identify.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionID  144233 non-null  int64  
 1   id_01          144233 non-null  float64
 2   id_02          140872 non-null  float64
 3   id_03          66324 non-null   float64
 4   id_04          66324 non-null   float64
 5   id_05          136865 non-null  float64
 6   id_06          136865 non-null  float64
 7   id_07          5155 non-null    float64
 8   id_08          5155 non-null    float64
 9   id_09          74926 non-null   float64
 10  id_10          74926 non-null   float64
 11  id_11          140978 non-null  float64
 12  id_12          144233 non-null  object 
 13  id_13          127320 non-null  float64
 14  id_14          80044 non-null   float64
 15  id_15          140985 non-null  object 
 16  id_16          129340 non-null  object 
 17  id_17          139369 non-nul

In [7]:
train_transac.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
V335              508189
V336              508189
V337              508189
V338              508189
V339              508189
Length: 394, dtype: int64

In [8]:
train_df = pd.merge(train_transac, train_identify, how ='left')

In [9]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 1.9+ GB


In [11]:
train_df.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

In [12]:
print(train_df.shape)
threshold = len(train_df) * 0.8
train_df = train_df.dropna(axis=1, thresh=threshold)

(590540, 434)


In [13]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 182 entries, TransactionID to V321
dtypes: float64(174), int64(4), object(4)
memory usage: 820.0+ MB


In [14]:
train_df.shape

(590540, 182)

In [15]:
numerical_cols = train_df.select_dtypes(include=['number']).columns
train_df[numerical_cols] = train_df[numerical_cols].fillna(train_df[numerical_cols].mean())

categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    mode_value = train_df[col].mode()[0] 
    train_df[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(mode_value, inplace=True)


In [16]:
train_df.isnull().sum().sum()

0

In [17]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,362.555488,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)

In [19]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,P_emaildomain_web.de,P_emaildomain_windstream.net,P_emaildomain_yahoo.co.jp,P_emaildomain_yahoo.co.uk,P_emaildomain_yahoo.com,P_emaildomain_yahoo.com.mx,P_emaildomain_yahoo.de,P_emaildomain_yahoo.es,P_emaildomain_yahoo.fr,P_emaildomain_ymail.com
0,2987000,0,86400,68.5,13926,362.555488,150.0,142.0,315.0,87.0,...,False,False,False,False,False,False,False,False,False,False
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,False,False,False,False,False,False,False,False,False,False
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,False,False,False,False,False,False,False,False,False,False
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,False,False,False,False,True,False,False,False,False,False
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,False,False,False,False,False,False,False,False,False,False


In [20]:
train_df.shape

(590540, 246)

In [21]:
X = train_df.drop(['TransactionID', 'isFraud'], axis = 1)
y = train_df['isFraud']

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [23]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(k_neighbors=5)
X_smote, y_smote = oversample.fit_resample(X_train, y_train)
X_train, y_train = X_smote, y_smote

In [24]:
y_train.value_counts()

isFraud
0    455903
1    455903
Name: count, dtype: int64

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss


rf = RandomForestClassifier(random_state=34)
rf.fit(X_train,y_train)


preds_proba = rf.predict_proba(X_test)[:, 1]

In [26]:
import numpy as np
# Ensure predictions are bounded between 1e-15 and 1 - 1e-15
preds_proba = np.clip(preds_proba, 1e-15, 1 - 1e-15)

# Calculate Log Loss
log_loss_value = log_loss(y_test, preds_proba)
print("Log Loss: ", log_loss_value)

Log Loss:  0.12152166979916634


In [27]:
from sklearn.metrics import accuracy_score

preds = rf.predict(X_test)
print(accuracy_score(preds,y_test))

0.9793748094963931


In [None]:
# import cudf
# import cuml
# import cupy as cp
# from cuml.preprocessing import StandardScaler
# from cuml.ensemble import RandomForestClassifier
# from cuml.metrics import accuracy_score, log_loss
# from cuml.model_selection import train_test_split

# # Load data using cuDF
# train_identify = cudf.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
# train_transac = cudf.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

# # Merge dataframes
# train_df = train_transac.merge(train_identify, how='left', on='TransactionID')

# # Data preprocessing
# threshold = len(train_df) * 0.8
# train_df = train_df.dropna(axis=1, thresh=threshold)

# # Fill NaN values
# numerical_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
# train_df[numerical_cols] = train_df[numerical_cols].fillna(train_df[numerical_cols].mean())

# categorical_cols = train_df.select_dtypes(include=['object']).columns
# for col in categorical_cols:
#     mode_value = train_df[col].mode().iloc[0]
#     train_df[col] = train_df[col].fillna(mode_value)

# # One-hot encoding without drop_first
# train_df = cudf.get_dummies(train_df, columns=categorical_cols)

# # Manually drop the first dummy variable for each category
# for col in categorical_cols:
#     dummy_cols = [c for c in train_df.columns if c.startswith(f"{col}_")]
#     if len(dummy_cols) > 1:
#         train_df = train_df.drop(dummy_cols[0], axis=1)

# # Prepare features and target
# X = train_df.drop(['TransactionID', 'isFraud'], axis=1)
# y = train_df['isFraud']

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# # Train Random Forest
# rf = RandomForestClassifier(random_state=34)
# rf.fit(X_train, y_train)

# # Make predictions
# preds_proba = rf.predict_proba(X_test)[:, 1]

# # Ensure predictions are bounded
# preds_proba = cp.clip(preds_proba, 1e-15, 1 - 1e-15)

# # Calculate Log Loss
# log_loss_value = log_loss(y_test, preds_proba)
# print("Log Loss: ", log_loss_value)

# # Calculate Accuracy
# preds = rf.predict(X_test)
# print("Accuracy: ", accuracy_score(y_test, preds))