In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import dill
# Turn on dill's global "recurse" pickling setting before anything is serialized.
dill.settings["recurse"] = True
# Single seed shared by the train/test split and the XGBoost classifier so runs are repeatable.
RANDOM_STATE = 42

# **Dataset**

In [3]:
# Load the dataset, using the CSV's first column as the DataFrame index.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will not
# run on another machine without editing it - consider a configurable or relative data path.
df = pd.read_csv(r"C:\Users\HP\Downloads\data.csv", index_col = 0)

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,location,num_of_unique_IPs_used,login_count,num_of_frequent_operations,c2c_place_order_count,c2c_release_order_count,gift_card_created_amount,gift_card_redeemed_amount,amount,wallet_balance,wallet_free_balance,wallet_locked_balance,deposit_status,transaction_time,prev_transaction_time,account_age_days,category
0,Islamabad,3,5,13,9,5,0.0,0.0,812.21,4704.45,2509.84,940.56,Completed,2024-02-28 14:27:13,2024-02-02 08:44:25,607,normal
1,Karachi,3,5,17,7,5,0.0,0.0,1.0,500000.0,2.0,0.0,Completed,2024-04-07 19:37:57,2024-03-13 23:29:48,304,normal
2,Rawalpindi,3,11,20,7,1,0.0,37.51,160.24,1715.62,2372.89,832.61,Completed,2024-12-12 19:35:17,2024-12-05 17:28:14,951,normal
3,Karachi,2,14,1,4,1,0.0,0.0,399.07,3786.38,37.55,158.57,Pending,2024-09-26 15:42:53,2024-09-16 09:36:35,204,normal
4,Peshawar,15,22,61,29,21,173.47,0.0,11608.12,42543.26,22230.91,6296.09,Completed,2024-10-02 12:55:20,2024-09-29 11:25:16,229,anomalous


# **4. Creating The Pipeline**

In [5]:
# Columns that are one-hot encoded by the preprocessor.
categorical_columns = [
    'location',
    'deposit_status',
]

# Columns that are standardized by the preprocessor. The two raw timestamp columns are
# deliberately absent: they are turned into a single engineered feature elsewhere.
numerical_columns = [
    # activity counters
    'num_of_unique_IPs_used',
    'login_count',
    'num_of_frequent_operations',
    'c2c_place_order_count',
    'c2c_release_order_count',
    # gift-card amounts
    'gift_card_created_amount',
    'gift_card_redeemed_amount',
    # transaction amount and wallet balances
    'amount',
    'wallet_balance',
    'wallet_free_balance',
    'wallet_locked_balance',
    # account metadata
    'account_age_days',
]

In [6]:
# Sanity check: echo both column groups and confirm they are plain Python lists.
print("Categorical Columns:", categorical_columns, type(categorical_columns))
print("Numerical Columns:", numerical_columns, type(numerical_columns))


Categorical Columns: ['location', 'deposit_status'] <class 'list'>
Numerical Columns: ['num_of_unique_IPs_used', 'login_count', 'num_of_frequent_operations', 'c2c_place_order_count', 'c2c_release_order_count', 'gift_card_created_amount', 'gift_card_redeemed_amount', 'amount', 'wallet_balance', 'wallet_free_balance', 'wallet_locked_balance', 'account_age_days'] <class 'list'>


In [7]:
class TransactionTimeDifference(BaseEstimator, TransformerMixin):
  """Replace the two transaction timestamp columns with the seconds elapsed between them.

  The transformer is stateless: nothing is learned in `fit`. `transform` expects a
  DataFrame containing `transaction_time` and `prev_transaction_time` and returns a copy
  in which those two columns are removed and `time_between_last_2_trans_(sec)` is added.
  """

  def fit(self, X, y=None):
    """No-op fit; returns `self` so the transformer can be used as a Pipeline step."""
    return self

  def transform(self, X):
    """Return a copy of `X` with the timestamp pair replaced by their difference in seconds.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain `transaction_time` and `prev_transaction_time`.

    Returns
    -------
    pandas.DataFrame
        Copy of `X` without the two timestamp columns and with the new
        `time_between_last_2_trans_(sec)` column appended. Rows where either timestamp
        is missing or unparseable get a difference of 0.
    """
    # Work on a copy so the caller's frame is never mutated.
    X = X.copy()

    # errors='coerce' turns unparseable values into NaT instead of raising.
    current_time = pd.to_datetime(X['transaction_time'], errors = 'coerce')
    previous_time = pd.to_datetime(X['prev_transaction_time'], errors = 'coerce')

    elapsed_seconds = (current_time - previous_time).dt.total_seconds()
    # fillna returns a new Series; the result must be assigned. Previously it was
    # discarded, so NaN differences were passed downstream unchanged.
    X['time_between_last_2_trans_(sec)'] = elapsed_seconds.fillna(0)

    # Drop the raw timestamps; only the engineered difference is used downstream.
    X = X.drop(columns = ['transaction_time', 'prev_transaction_time'])
    return X

In [8]:
# Categorical branch: one-hot encoding, configured with handle_unknown="ignore" so that
# categories not seen during fit do not raise at prediction time.
categorical_transformer = Pipeline([
    ("one_hot_encode", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: standardization only.
numerical_transformer = Pipeline([
    ("scaler", StandardScaler()),
])

# Route each column group to its branch. The numeric group also includes the engineered
# 'time_between_last_2_trans_(sec)' column, which is not in the raw data, so the frame
# passed in must already contain it. Any column not listed is dropped.
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_transformer, categorical_columns),
        (
            "numeric",
            numerical_transformer,
            [*numerical_columns, 'time_between_last_2_trans_(sec)'],
        ),
    ],
    remainder="drop",
    n_jobs=-1,
)

In [9]:
# End-to-end model pipeline; the steps run in order on the raw feature frame:
#   1. transaction_time_dif - derives the time-difference feature and drops the raw timestamps
#   2. preprocessor         - one-hot encodes the categorical columns and scales the numeric ones
#   3. xgb_classifier       - XGBoost classifier using the "multi:softprob" objective, seeded
#                             with RANDOM_STATE
pipeline = Pipeline(
    steps = [
        ("transaction_time_dif", TransactionTimeDifference()),
        ("preprocessor", preprocessor),
        ("xgb_classifier", XGBClassifier(objective= "multi:softprob",random_state = RANDOM_STATE))
    ]
)

# **6. Training The Model**

In [10]:
# List the columns of the loaded frame (the features plus the 'category' target).
df.columns

Index(['location', 'num_of_unique_IPs_used', 'login_count',
       'num_of_frequent_operations', 'c2c_place_order_count',
       'c2c_release_order_count', 'gift_card_created_amount',
       'gift_card_redeemed_amount', 'amount', 'wallet_balance',
       'wallet_free_balance', 'wallet_locked_balance', 'deposit_status',
       'transaction_time', 'prev_transaction_time', 'account_age_days',
       'category'],
      dtype='object')

In [11]:
# Separate the target column from the feature columns.
y = df["category"]
X = df.drop(columns="category")

In [12]:
# Hold out 20% of the rows for testing, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [13]:
# Confirm the training features no longer include the 'category' target.
X_train.columns

Index(['location', 'num_of_unique_IPs_used', 'login_count',
       'num_of_frequent_operations', 'c2c_place_order_count',
       'c2c_release_order_count', 'gift_card_created_amount',
       'gift_card_redeemed_amount', 'amount', 'wallet_balance',
       'wallet_free_balance', 'wallet_locked_balance', 'deposit_status',
       'transaction_time', 'prev_transaction_time', 'account_age_days'],
      dtype='object')

In [14]:
# Fit the encoder on the training labels only, then encode both splits with it.
# Caution: y_train / y_test are overwritten with their integer codes, so re-running this
# cell without first re-running the split would refit the encoder on already-encoded values.
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test = (label_encoder.transform(labels) for labels in (y_train, y_test))

In [15]:
# Fit the whole pipeline (feature engineering, preprocessing, classifier) on the training split.
pipeline.fit(X_train, y_train)

In [16]:
# Predict encoded class ids for the held-out split; y_pred holds the same predictions
# mapped back to the original category labels (it is not used further in this cell).
y_pred_encoded = pipeline.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Both metrics compare encoded ids, so the report lists classes by encoded id, not by name.
# NOTE(review): the recorded output shows a perfect score on the test split - worth checking
# for target leakage or trivially separable data before trusting the model.
print("Accuracy score: ", accuracy_score(y_test, y_pred_encoded))
print("\nClassification Report:\n", classification_report(y_test, y_pred_encoded))

Accuracy score:  1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1500
           1       1.00      1.00      1.00      1000
           2       1.00      1.00      1.00      7500

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [17]:
# Compare the pipeline's score on the training split against the held-out split to spot overfitting.
print(f"Training set score: {pipeline.score(X_train, y_train)!s}")
print(f"Test set score: {pipeline.score(X_test, y_test)!s}")

Training set score: 1.0
Test set score: 1.0


In [18]:
# Serialize the fitted pipeline to disk with dill.
# NOTE(review): absolute, machine-specific Windows path - the target directory must already
# exist and the cell will fail on any other machine; consider a configurable output path.
# NOTE(review): presumably dill is used instead of pickle so that the notebook-defined
# TransactionTimeDifference class is serialized together with the pipeline - confirm.
save_path = r"C:\Users\HP\Desktop\Python\Data_Science_Projects\fradulent-transaction-detection\pipeline1.pkl"
with open(save_path, 'wb') as file:
  dill.dump(pipeline, file)

# **Interpreting Model Results Using SHAP**