In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#modeling
import os
import sys
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings


In [7]:
df = pd.read_csv(r"data\dataset.csv")
df.head()


Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1


In [8]:
df.dtypes

Transaction_ID                   object
User_ID                          object
Transaction_Amount              float64
Transaction_Type                 object
Timestamp                        object
Account_Balance                 float64
Device_Type                      object
Location                         object
Merchant_Category                object
IP_Address_Flag                   int64
Previous_Fraudulent_Activity      int64
Daily_Transaction_Count           int64
Avg_Transaction_Amount_7d       float64
Failed_Transaction_Count_7d       int64
Card_Type                        object
Card_Age                          int64
Transaction_Distance            float64
Authentication_Method            object
Risk_Score                      float64
Is_Weekend                        int64
Fraud_Label                       int64
dtype: object

In [9]:
df['Transaction_Type'].unique()

array(['POS', 'Bank Transfer', 'Online', 'ATM Withdrawal'], dtype=object)

In [10]:
df.drop(columns=['Risk_Score'],inplace=True)

In [11]:
df.nunique()

Transaction_ID                  50000
User_ID                          8963
Transaction_Amount              21763
Transaction_Type                    4
Timestamp                       47724
Account_Balance                 49867
Device_Type                         3
Location                            5
Merchant_Category                   5
IP_Address_Flag                     2
Previous_Fraudulent_Activity        2
Daily_Transaction_Count            14
Avg_Transaction_Amount_7d       31420
Failed_Transaction_Count_7d         5
Card_Type                           4
Card_Age                          239
Transaction_Distance            47546
Authentication_Method               4
Is_Weekend                          2
Fraud_Label                         2
dtype: int64

In [12]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])


In [13]:
df.head()

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,0,7,437.63,3,Amex,65,883.17,Biometric,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,0,13,478.76,4,Mastercard,186,2203.36,Password,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,0,14,50.01,4,Visa,226,1909.29,Biometric,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,0,8,182.48,4,Visa,76,1311.86,OTP,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,1,14,328.69,4,Mastercard,140,966.98,Password,1,1


In [14]:
df = df.sort_values(by=['User_ID','Timestamp']).reset_index(drop=True)

# tx_count_last_24h avg_amount_7d failed_tx_count_7d has_prior_fraud is_new_user

In [15]:
df_temp = df.set_index('Timestamp')

In [16]:
df.columns

Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type',
       'Timestamp', 'Account_Balance', 'Device_Type', 'Location',
       'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity',
       'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d',
       'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age',
       'Transaction_Distance', 'Authentication_Method', 'Is_Weekend',
       'Fraud_Label'],
      dtype='object')

In [17]:
# 1. Count of transactions per user (fraud frequency)
df['user_transaction_count'] = df.groupby('User_ID')['User_ID'].transform('count')

# 2. Fraud rate per user (historical fraud behavior)
df['user_fraud_rate'] = df.groupby('User_ID')['Fraud_Label'].transform('mean')

# 3. User's average transaction amount
df['user_avg_amount'] = df.groupby('User_ID')['Transaction_Amount'].transform('mean')

# 4. User's std deviation of transaction amount (consistency)
df['user_amount_std'] = df.groupby('User_ID')['Transaction_Amount'].transform('std')

# 5. Deviation from user's normal behavior
df['amount_deviation_from_user_avg'] = (
    df['Transaction_Amount'] - df['user_avg_amount']
) / (df['user_amount_std'] + 1e-5)  # Add small value to avoid division by zero

# 6. Days since user's first transaction (account age proxy)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['days_since_first_transaction'] = (
    df.groupby('User_ID')['Timestamp']
    .transform(lambda x: (x - x.min()).dt.total_seconds() / 86400)
)

# NOW remove User_ID
df = df.drop('User_ID', axis=1)

In [18]:
df['is_single_transaction_user'] = df['user_amount_std'].isna().astype(int)
df.head()
df['user_amount_std'] = df['user_amount_std'].fillna(0)
df['amount_deviation_from_user_avg'] = df['amount_deviation_from_user_avg'].fillna(0)

In [19]:
df.drop(columns=['Transaction_ID'],axis=1,inplace=True)


In [20]:
df.nunique()

Transaction_Amount                21763
Transaction_Type                      4
Timestamp                         47724
Account_Balance                   49867
Device_Type                           3
Location                              5
Merchant_Category                     5
IP_Address_Flag                       2
Previous_Fraudulent_Activity          2
Daily_Transaction_Count              14
Avg_Transaction_Amount_7d         31420
Failed_Transaction_Count_7d           5
Card_Type                             4
Card_Age                            239
Transaction_Distance              47546
Authentication_Method                 4
Is_Weekend                            2
Fraud_Label                           2
user_transaction_count               16
user_fraud_rate                      52
user_avg_amount                    8821
user_amount_std                    8767
amount_deviation_from_user_avg    49793
days_since_first_transaction      39414
is_single_transaction_user            2


In [21]:
df['hour'] = pd.to_datetime(df['Timestamp']).dt.hour
df['day_of_week'] = pd.to_datetime(df['Timestamp']).dt.dayofweek
df['day_of_month'] = pd.to_datetime(df['Timestamp']).dt.month
df.drop('Timestamp',axis=1,inplace=True)

In [22]:
df.nunique()

Transaction_Amount                21763
Transaction_Type                      4
Account_Balance                   49867
Device_Type                           3
Location                              5
Merchant_Category                     5
IP_Address_Flag                       2
Previous_Fraudulent_Activity          2
Daily_Transaction_Count              14
Avg_Transaction_Amount_7d         31420
Failed_Transaction_Count_7d           5
Card_Type                             4
Card_Age                            239
Transaction_Distance              47546
Authentication_Method                 4
Is_Weekend                            2
Fraud_Label                           2
user_transaction_count               16
user_fraud_rate                      52
user_avg_amount                    8821
user_amount_std                    8767
amount_deviation_from_user_avg    49793
days_since_first_transaction      39414
is_single_transaction_user            2
hour                                 24


In [23]:
x = df.drop('Fraud_Label',axis=1)
y = df['Fraud_Label']

In [24]:
one_hot_enc = list()
num = list()
for i in x.columns:
    if x[i].nunique() <= 15:
        one_hot_enc.append(i)
    else:
        num.append(i)

one_hot_enc

['Transaction_Type',
 'Device_Type',
 'Location',
 'Merchant_Category',
 'IP_Address_Flag',
 'Previous_Fraudulent_Activity',
 'Daily_Transaction_Count',
 'Failed_Transaction_Count_7d',
 'Card_Type',
 'Authentication_Method',
 'Is_Weekend',
 'is_single_transaction_user',
 'day_of_week',
 'day_of_month']

In [25]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
onehot = OneHotEncoder()

processor = ColumnTransformer(
    [
        ('One hot encoder',onehot,one_hot_enc),
        ('Standard Scaler',scaler,num)
    ]
)

In [26]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [27]:
x_train = processor.fit_transform(x_train)
x_test = processor.transform(x_test)

In [28]:
from sklearn.metrics import mean_squared_error,r2_score

In [29]:
def evaluate_model(true,pred):
    mae = mean_squared_error(true,pred)
    score = r2_score(true,pred)
    return mae,score

In [30]:
df.isnull().sum()

Transaction_Amount                0
Transaction_Type                  0
Account_Balance                   0
Device_Type                       0
Location                          0
Merchant_Category                 0
IP_Address_Flag                   0
Previous_Fraudulent_Activity      0
Daily_Transaction_Count           0
Avg_Transaction_Amount_7d         0
Failed_Transaction_Count_7d       0
Card_Type                         0
Card_Age                          0
Transaction_Distance              0
Authentication_Method             0
Is_Weekend                        0
Fraud_Label                       0
user_transaction_count            0
user_fraud_rate                   0
user_avg_amount                   0
user_amount_std                   0
amount_deviation_from_user_avg    0
days_since_first_transaction      0
is_single_transaction_user        0
hour                              0
day_of_week                       0
day_of_month                      0
dtype: int64

In [31]:
models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree" : DecisionTreeRegressor(),
                "Gradient Boosting" : GradientBoostingRegressor(),
                "KNeighborsRegressor" : KNeighborsRegressor(),
                "XGB Regressor" : XGBRegressor(),
                "Adaboost Regressor ": AdaBoostRegressor(),
                "linear Regressor " :LinearRegression()
            }
model_list = []
r2_list = []
for name , model in models.items():
    print(f"Training {name} model")
    model.fit(x_train,y_train)

    #make prediction
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    #Evaluate Train and Test dataset
    model_train_mae,model_train_score = evaluate_model(y_train,y_train_pred)
    model_test_mae,model_test_score = evaluate_model(y_test,y_test_pred)

    model_list.append(name)
    r2_list.append(model_test_score)

Training Random Forest model


KeyboardInterrupt: 

In [None]:
for model_name,score in zip(model_list,r2_list):
    print(f"{model_name} : {score:.4f}")

LinearRegression : 0.5734
Lasso : -0.0003
Ridge : 0.5734
K-Neighbours Regressor : 0.2966
DecisionTreeRegressor : 0.1247
RandomForestRegressor : 0.5670
XGBRegressor : 0.5540
AdaBoostRegressor : 0.5807


In [32]:
from sklearn.model_selection import RandomizedSearchCV

params = {
                "Random Forest" : {
                    "n_estimators": [100, 200, 300, 500],
                    "max_depth": [None, 5, 10, 20, 30],
                    "min_samples_split": [2, 5, 10],
                    "min_samples_leaf": [1, 2, 4],
                    "max_features": [None, "sqrt", "log2"]
                },
                
                "Gradient Boosting" : {
                    "n_estimators": [100, 200, 300],
                    "learning_rate": [0.01, 0.05, 0.1, 0.2],
                    "subsample": [0.6, 0.8, 1.0],
                    "max_depth": [3, 4, 5],
                    "min_samples_split": [2, 5, 10],
                    "min_samples_leaf": [1, 2, 4],
                    "max_features": ["sqrt", "log2"]
                },
                "K-Neighbours Regressor" : {
                    "n_neighbors": [3, 5, 7, 9, 11],
                    "weights": ["uniform", "distance"],
                    "metric": ["euclidean", "manhattan", "minkowski"],
                    "p": [1, 2]   
                },
                "XGB Regressor" : {
                    "n_estimators": [200, 400, 600],
                    "learning_rate": [0.01, 0.05, 0.1],
                    "max_depth": [3, 5, 7, 9],
                    "subsample": [0.6, 0.8, 1.0],
                    "colsample_bytree": [0.6, 0.8, 1.0],
                    "reg_alpha": [0, 0.01, 0.1],
                    "reg_lambda": [0.5, 1, 2]
                },
                "Adaboost Regressor " : {
                    "n_estimators": [50, 100, 200, 400],
                    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1.0],
                    "loss": ["linear", "square", "exponential"]
                }

                
            }
models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree" : DecisionTreeRegressor(),
                "Gradient Boosting" : GradientBoostingRegressor(),
                "KNeighborsRegressor" : KNeighborsRegressor(),
                "XGB Regressor" : XGBRegressor(),
                "Adaboost Regressor ": AdaBoostRegressor(),
                "linear Regressor " :LinearRegression()
            }

In [34]:
def evaluate_model(x_train,y_train,x_test,y_test,params,models):
    report={}
    for name,model in models.items():
        print(f"{name}")
        param = params.get(name,None)
        if param:
            rs = RandomizedSearchCV(estimator=model,param_distributions=param,cv=5,random_state=42,n_jobs=-1)
            rs.fit(x_train,y_train)
            model.set_params(**rs.best_params_)
        model.fit(x_train,y_train)
        y_train_pred = model.predict(x_train)

        y_test_pred = model.predict(x_test)

        train_model_score = r2_score(y_train,y_train_pred)

        test_model_score = r2_score(y_test,y_test_pred)

        report[name] = test_model_score
    return report
    
    

        

In [None]:
report = evaluate_model(x_train,y_train,x_test,y_test,params,models=models)

Random Forest


In [None]:
len(x_train)
len(y_train)
len(x_test)


10000

In [None]:
best_model_score = max(report.values())

AttributeError: 'NoneType' object has no attribute 'values'