<a href="https://www.kaggle.com/code/gamzebayir/fraud-detection-choosing-base-model?scriptVersionId=94143153" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_validate
from sklearn.preprocessing import MinMaxScaler,LabelEncoder, StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression,LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report

In [2]:
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
train_transaction=pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
train_identity=pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
test_transaction=pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')
test_identity=pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
sample_submission=pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [4]:
train_df = train_transaction.merge(train_identity, how="left", on="TransactionID")

test_df = test_transaction.merge(test_identity, how="left", on="TransactionID")

In [5]:
test_df= test_df.rename(columns=lambda x:"_".join(x.split("-")))

### Changing data types to change the memory space of the data set.

In [6]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [7]:
train_df = reduce_mem_usage(train_df)
test_df  = reduce_mem_usage(test_df)

Mem. usage decreased to 650.48 Mb (66.8% reduction)
Mem. usage decreased to 565.37 Mb (66.3% reduction)


### Filling Missing Values


In [8]:
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

### Using Label Encoder to make the data suitable for the machine learning model.

In [9]:
for f in train_df.columns:
    if train_df[f].dtype=='object': 
        lbl = LabelEncoder()
        lbl.fit(list(train_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values)) 

In [10]:
for f in test_df.columns:
    if test_df[f].dtype=='object': 
        lbl = LabelEncoder()
        lbl.fit(list(test_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))

### Creating base models

In [11]:
y = train_df["isFraud"]
X = train_df.drop(["isFraud"], axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [13]:
classifiers = [("CART", DecisionTreeClassifier()),
               ("RF", RandomForestClassifier()),
               ('Adaboost', AdaBoostClassifier()),
               ('GBM', GradientBoostingClassifier()),
               ('XGBoost', XGBClassifier()),
               ('LightGBM', LGBMClassifier()),
               ]

    
for name, classifier in classifiers:
    print(f" {name} for train")
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    y_prob = model.predict_proba(X_train)[:, 1]
    print(classification_report(y_train, y_pred))
    print(f" AUC: {roc_auc_score(y_train, y_prob)}")
    print("-----------------------------------------------------")
    print(f" {name} for test")
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, y_pred))
    print(f" AUC: {roc_auc_score(y_test, y_prob)}")
    print("######################################################")

 CART for train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    398914
           1       1.00      1.00      1.00     14464

    accuracy                           1.00    413378
   macro avg       1.00      1.00      1.00    413378
weighted avg       1.00      1.00      1.00    413378

 AUC: 1.0
-----------------------------------------------------
 CART for test
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    170963
           1       0.54      0.59      0.56      6199

    accuracy                           0.97    177162
   macro avg       0.76      0.78      0.77    177162
weighted avg       0.97      0.97      0.97    177162

 AUC: 0.7837579656578049
######################################################
 RF for train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    398914
           1       1.00      1.00      1.00     

#### In this case, choosing the LightGBM and XGBoost models was considered appropriate.

### Modeling XGBoost

In [14]:
xgboost_model = XGBClassifier(random_state=17).fit(X_train, y_train)
y_pred = xgboost_model.predict(X_test)
y_prob = xgboost_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
roc_auc_score(y_test, y_prob)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    170963
           1       0.92      0.49      0.64      6199

    accuracy                           0.98    177162
   macro avg       0.95      0.75      0.82    177162
weighted avg       0.98      0.98      0.98    177162



0.9406964804423688

In [15]:
predictions_xgboost = xgboost_model.predict_proba(test_df)[:,1]
submission = pd.DataFrame({'TransactionID':test_transaction['TransactionID'],'isFraud':predictions_xgboost})
submission["TransactionID"]=submission["TransactionID"].astype(int)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.005178
1,3663550,0.002008
2,3663551,0.002815
3,3663552,0.001957
4,3663553,0.001499


### Saving submission file 

In [16]:
filename = 'CIS Fraud Detection base model xgboost.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: CIS Fraud Detection base model xgboost.csv


In [17]:
# Predict_proba
# Score: 0.891545
# Public score: 0.916079

### Modeling LightGBM

In [18]:
lgbm_model = LGBMClassifier(random_state=17).fit(X_train, y_train)
y_pred = lgbm_model.predict(X_test)
y_prob = lgbm_model.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred))
roc_auc_score(y_test, y_prob)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    170963
           1       0.89      0.44      0.59      6199

    accuracy                           0.98    177162
   macro avg       0.94      0.72      0.79    177162
weighted avg       0.98      0.98      0.97    177162



0.927581425940798

In [19]:
predictions_lgbm = lgbm_model.predict_proba(test_df)[:,1]
submission = pd.DataFrame({'TransactionID':test_transaction['TransactionID'],'isFraud':predictions_lgbm})
submission["TransactionID"]=submission["TransactionID"].astype(int)
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.002412
1,3663550,0.00556
2,3663551,0.015577
3,3663552,0.002708
4,3663553,0.002984


### Saving submission file 

In [20]:
filename = 'CIS Fraud Detection base model lgbm.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: CIS Fraud Detection base model lgbm.csv


In [21]:
# Predict_proba
# Private Score: 0.894476
# Public score: 0.920603