In [93]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

test_df = pd.read_csv('live_test_data.csv') 

In [94]:
N_TOTAL = 100000 
N_USERS = 1000
N_NORMAL = int(N_TOTAL * 0.30) 
CITIES = ['Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata', 'Pune', 'Hyderabad']
CATEGORIES = ['Grocery', 'Utilities', 'Travel', 'Electronics', 'Luxury', 'Cash_Advance']

np.random.seed(42) 

user_ids = np.random.randint(1, N_USERS + 1, N_NORMAL)
start_date = datetime(2025, 1, 1)
timestamps = [start_date + timedelta(seconds=np.random.randint(0, 30*24*60*60)) for _ in range(N_NORMAL)]

amounts = np.random.lognormal(mean=5, sigma=1.5, size=N_NORMAL)
amounts = np.round(amounts, 0).astype(int)
amounts[amounts < 10] = 10.00 

df_normal = pd.DataFrame({
    'Timestamp': timestamps,
    'UserID': user_ids,
    'Amount': amounts,
    'City': np.random.choice(CITIES, N_NORMAL),
    'Category': np.random.choice(CATEGORIES, N_NORMAL, p=[0.4, 0.2, 0.15, 0.1, 0.1, 0.05]),
    'Fraud_Type': 0  
})

df_normal.sort_values(['UserID', 'Timestamp'], inplace=True)
df_normal.reset_index(drop=True, inplace=True)

df_normal

Unnamed: 0,Timestamp,UserID,Amount,City,Category,Fraud_Type
0,2025-01-01 00:09:30,1,183,Delhi,Grocery,0
1,2025-01-03 10:25:39,1,97,Pune,Grocery,0
2,2025-01-03 19:21:02,1,262,Chennai,Electronics,0
3,2025-01-04 00:28:34,1,16,Hyderabad,Electronics,0
4,2025-01-04 21:12:14,1,307,Mumbai,Cash_Advance,0
...,...,...,...,...,...,...
29995,2025-01-27 11:50:33,1000,89,Kolkata,Travel,0
29996,2025-01-28 15:04:42,1000,339,Delhi,Travel,0
29997,2025-01-28 15:15:08,1000,519,Bangalore,Utilities,0
29998,2025-01-29 21:01:46,1000,636,Bangalore,Travel,0


In [95]:
N_TOTAL_RECORDS = 100000 
N_MAGNITUDE_FRAUD = int(N_TOTAL_RECORDS * 0.15)


HIGH_AMOUNT_LOWER = 100000  
HIGH_AMOUNT_UPPER = 1000000  

np.random.seed(43) 
df = df_normal.copy()
fraud_indices = df.index.to_series().sample(n=N_MAGNITUDE_FRAUD, replace=False).index

new_fraud_amounts = np.random.uniform(
    low=HIGH_AMOUNT_LOWER, 
    high=HIGH_AMOUNT_UPPER, 
    size=N_MAGNITUDE_FRAUD
)
df.loc[fraud_indices, 'Amount'] = np.round(new_fraud_amounts, 0).astype(int)


df.loc[fraud_indices, 'Fraud_Type'] = 1 

df

Unnamed: 0,Timestamp,UserID,Amount,City,Category,Fraud_Type
0,2025-01-01 00:09:30,1,754564,Delhi,Grocery,1
1,2025-01-03 10:25:39,1,426647,Pune,Grocery,1
2,2025-01-03 19:21:02,1,102941,Chennai,Electronics,1
3,2025-01-04 00:28:34,1,498411,Hyderabad,Electronics,1
4,2025-01-04 21:12:14,1,195041,Mumbai,Cash_Advance,1
...,...,...,...,...,...,...
29995,2025-01-27 11:50:33,1000,89,Kolkata,Travel,0
29996,2025-01-28 15:04:42,1000,339,Delhi,Travel,0
29997,2025-01-28 15:15:08,1000,519,Bangalore,Utilities,0
29998,2025-01-29 21:01:46,1000,612918,Bangalore,Travel,1


In [96]:

N_VELOCITY_FRAUD = 15000 
N_USERS_TO_ATTACK = 300 


np.random.seed(44) 
attack_users = np.random.choice(df['UserID'].unique(), N_USERS_TO_ATTACK, replace=False)

fraud_data = []
for user_id in attack_users:
    
    user_txns = df[df['UserID'] == user_id]
    
    if not user_txns.empty:
        start_time = user_txns['Timestamp'].max() + timedelta(minutes=5)
        city = user_txns['City'].iloc[-1]
        category = 'Cash_Advance' 
    else:
        start_time = datetime(2025, 1, 15)
        city = np.random.choice(CITIES)
        category = 'Cash_Advance'

    for i in range(10):
        txn_time = start_time + timedelta(seconds=np.random.randint(1, 60))
        amount = np.random.randint(low=500, high=2500)
        
        fraud_data.append([txn_time, user_id, amount, city, category, 2]) 

df_velocity_fraud = pd.DataFrame(fraud_data, columns=df.columns)
df_final = pd.concat([df, df_velocity_fraud], ignore_index=True)

df_final.sort_values(['UserID', 'Timestamp'], inplace=True)
df_final.reset_index(drop=True, inplace=True)


df = df_final

df

Unnamed: 0,Timestamp,UserID,Amount,City,Category,Fraud_Type
0,2025-01-01 00:09:30,1,754564,Delhi,Grocery,1
1,2025-01-03 10:25:39,1,426647,Pune,Grocery,1
2,2025-01-03 19:21:02,1,102941,Chennai,Electronics,1
3,2025-01-04 00:28:34,1,498411,Hyderabad,Electronics,1
4,2025-01-04 21:12:14,1,195041,Mumbai,Cash_Advance,1
...,...,...,...,...,...,...
32995,2025-01-27 11:50:33,1000,89,Kolkata,Travel,0
32996,2025-01-28 15:04:42,1000,339,Delhi,Travel,0
32997,2025-01-28 15:15:08,1000,519,Bangalore,Utilities,0
32998,2025-01-29 21:01:46,1000,612918,Bangalore,Travel,1


In [97]:
CITY_COORDS = {
    'Mumbai': (19.0760, 72.8777),
    'Delhi': (28.7041, 77.1025),
    'Bangalore': (12.9716, 77.5946),
    'Chennai': (13.0827, 80.2707),
    'Kolkata': (22.5726, 88.3639),
    'Pune': (18.5204, 73.8567),
    'Hyderabad': (17.3850, 78.4867),
    'Ahmedabad': (23.0225, 72.5714)
}
MAX_SPEED_KMH = 1000 
MAX_SPEED_KMS = MAX_SPEED_KMH / 3600 

def haversine_distance(coord1, coord2):
    R = 6371  
    lat1, lon1 = np.radians(coord1)
    lat2, lon2 = np.radians(coord2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

N_CONTEXTUAL_FRAUD = 15000 
N_USERS_TO_ATTACK = 500   
np.random.seed(45) 
random.seed(45) # Set random seed as well
attack_users = np.random.choice(df['UserID'].unique(), N_USERS_TO_ATTACK, replace=False)

fraud_data = []
for user_id in attack_users:
    city_A, city_B = random.sample(list(CITY_COORDS.keys()), 2)
    coord_A = CITY_COORDS[city_A]
    coord_B = CITY_COORDS[city_B]
    
    distance_km = haversine_distance(coord_A, coord_B)
    min_time_seconds = distance_km / MAX_SPEED_KMS 

    MIN_FRAUD_WAIT = 120

    max_fraud_time_sec = int(min(3600, min_time_seconds / 10)) 

    upper_bound_wait = max(MIN_FRAUD_WAIT, max_fraud_time_sec)

    user_txns = df[df['UserID'] == user_id]
    start_time = user_txns['Timestamp'].max() + timedelta(minutes=5) if not user_txns.empty else datetime(2025, 1, 15)

    txn_time_A = start_time + timedelta(seconds=random.randint(1, 60))
    category_A = 'Jewellery'
    amount_A = np.random.randint(low=5000, high=15000) 
    fraud_data.append([txn_time_A, user_id, amount_A, city_A, category_A, 3])

    txn_time_B = txn_time_A + timedelta(seconds=random.randint(100, upper_bound_wait)) 
    category_B = 'Cash_Advance'
    amount_B = np.random.randint(low=1000, high=5000)
    fraud_data.append([txn_time_B, user_id, amount_B, city_B, category_B, 3])

df_contextual_fraud = pd.DataFrame(fraud_data, columns=df.columns)

df_final = pd.concat([df, df_contextual_fraud], ignore_index=True)
df_final.sort_values(['UserID', 'Timestamp'], inplace=True)
df_final.reset_index(drop=True, inplace=True)

df = df_final
df[df['Fraud_Type'] == 3]

Unnamed: 0,Timestamp,UserID,Amount,City,Category,Fraud_Type
48,2025-01-30 14:16:28,1,11557,Mumbai,Jewellery,3
49,2025-01-30 14:18:52,1,3920,Ahmedabad,Cash_Advance,3
140,2025-01-30 13:53:45,4,10809,Delhi,Jewellery,3
141,2025-01-30 13:56:19,4,1111,Ahmedabad,Cash_Advance,3
163,2025-01-30 16:28:24,5,13048,Pune,Jewellery,3
...,...,...,...,...,...,...
33803,2025-01-29 23:00:55,994,3183,Mumbai,Cash_Advance,3
33921,2025-01-30 21:48:39,998,6574,Pune,Jewellery,3
33922,2025-01-30 21:52:36,998,3483,Kolkata,Cash_Advance,3
33998,2025-01-30 18:31:51,1000,5967,Delhi,Jewellery,3


In [98]:
N_BEHAVIORAL_FRAUD = 15000 
N_USERS_TO_ATTACK = 1000  
np.random.seed(46)
random.seed(46)
attack_users = np.random.choice(df['UserID'].unique(), N_USERS_TO_ATTACK, replace=False)
ALL_CATEGORIES = df['Category'].unique().tolist()

fraud_data_4 = []
for user_id in attack_users:
    user_txns = df[df['UserID'] == user_id]

    used_categories = user_txns['Category'].unique().tolist()
    rare_categories = [cat for cat in ALL_CATEGORIES if cat not in used_categories]

    if not rare_categories:
        category_counts = user_txns['Category'].value_counts()
        rare_category = category_counts.index[-1] 
    else:
        high_risk_rare_categories = list(set(rare_categories) & set(['Jewellery', 'Online_Subscription']))
        rare_category = random.choice(high_risk_rare_categories) if high_risk_rare_categories else random.choice(rare_categories)

    new_category = rare_category

    start_time = user_txns['Timestamp'].max() + timedelta(minutes=5) if not user_txns.empty else datetime(2025, 1, 15)
    txn_time = start_time + timedelta(seconds=random.randint(1, 120))
    city = user_txns['City'].iloc[-1] if not user_txns.empty else random.choice(CITIES)

    amount = np.random.randint(low=20000, high=50000)
    
    fraud_data_4.append([txn_time, user_id, amount, city, new_category, 4]) # 4 = Behavioral Fraud

df_behavioral_fraud = pd.DataFrame(fraud_data_4, columns=df.columns)
df_final = pd.concat([df, df_behavioral_fraud], ignore_index=True)


df_final.sort_values(['UserID', 'Timestamp'], inplace=True)
df_final.reset_index(drop=True, inplace=True)

df = df_final 


In [99]:
global_mean = df['Amount'].mean()
global_std = df['Amount'].std()

# ðŸ’¡ CHANGE: Using .transform() for simplicity
df['User_Mean_Amount'] = df.groupby('UserID')['Amount'].transform(
    lambda x: x.expanding().mean().shift(1)
)

# ðŸ’¡ CHANGE: Using .transform() for simplicity
df['User_Std_Amount'] = df.groupby('UserID')['Amount'].transform(
    lambda x: x.expanding().std().shift(1)
)

df['User_Mean_Amount'].fillna(global_mean, inplace=True)
df['User_Std_Amount'].fillna(global_std, inplace=True)

df['User_Std_Amount'].replace(0, 1, inplace=True) 

df['Amount_Z_Score'] = (df['Amount'] - df['User_Mean_Amount']) / df['User_Std_Amount']

df['Global_Z_Score'] = (df['Amount'] - global_mean) / global_std

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['User_Mean_Amount'].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['User_Std_Amount'].fillna(global_std, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [100]:
df['Time_Delta'] = df.groupby('UserID')['Timestamp'].diff()

df['Time_Since_Last_TXN_Sec'] = df['Time_Delta'].dt.total_seconds()

df['Time_Since_Last_TXN_Hrs'] = df['Time_Since_Last_TXN_Sec'] / 3600

MAX_TIME_DELTA = (df['Timestamp'].max() - df['Timestamp'].min()).total_seconds()
df['Time_Since_Last_TXN_Sec'].fillna(MAX_TIME_DELTA, inplace=True)
df['Time_Since_Last_TXN_Hrs'].fillna(MAX_TIME_DELTA / 3600, inplace=True)

df.drop('Time_Delta', axis=1, inplace=True)

print(df[['UserID', 'Timestamp', 'Amount', 'Fraud_Type', 'Time_Since_Last_TXN_Sec']].tail(10))


       UserID           Timestamp  Amount  Fraud_Type  Time_Since_Last_TXN_Sec
34990    1000 2025-01-25 23:30:07     155           0                  65166.0
34991    1000 2025-01-26 08:56:03     257           0                  33956.0
34992    1000 2025-01-27 11:50:33      89           0                  96870.0
34993    1000 2025-01-28 15:04:42     339           0                  98049.0
34994    1000 2025-01-28 15:15:08     519           0                    626.0
34995    1000 2025-01-29 21:01:46  612918           1                 107198.0
34996    1000 2025-01-30 18:25:56     206           0                  77050.0
34997    1000 2025-01-30 18:31:51    5967           3                    355.0
34998    1000 2025-01-30 18:37:24    4062           3                    333.0
34999    1000 2025-01-30 18:43:24   41298           4                    360.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Time_Since_Last_TXN_Sec'].fillna(MAX_TIME_DELTA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Time_Since_Last_TXN_Hrs'].fillna(MAX_TIME_DELTA / 3600, inplace=True)


In [101]:
CITY_COORDS = {
    'Mumbai': (19.0760, 72.8777),
    'Delhi': (28.7041, 77.1025),
    'Bangalore': (12.9716, 77.5946),
    'Chennai': (13.0827, 80.2707),
    'Kolkata': (22.5726, 88.3639),
    'Pune': (18.5204, 73.8567),
    'Hyderabad': (17.3850, 78.4867),
    'Ahmedabad': (23.0225, 72.5714)
}

MAX_SPEED_KMH = 1000
MAX_SPEED_KMS = MAX_SPEED_KMH / 3600

def haversine_distance(coord1, coord2):
    R = 6371
    lat1, lon1 = np.radians(coord1)
    lat2, lon2 = np.radians(coord2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance

df['Prev_City'] = df.groupby('UserID')['City'].shift(1)
df['Prev_City'].fillna(df['City'], inplace=True)

df['Distance_Km'] = df.apply(
    lambda row: haversine_distance(CITY_COORDS[row['Prev_City']], CITY_COORDS[row['City']]),
    axis=1
)

df['Min_Travel_Time_Sec'] = df['Distance_Km'] / MAX_SPEED_KMS

epsilon = 1e-6
df['Geo_Velocity_Check'] = df['Min_Travel_Time_Sec'] / (df['Time_Since_Last_TXN_Sec'] + epsilon)

df.drop(['Prev_City'], axis=1, inplace=True)
print(df[['UserID', 'Timestamp', 'City', 'Time_Since_Last_TXN_Sec', 'Distance_Km', 'Geo_Velocity_Check']].tail(10))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Prev_City'].fillna(df['City'], inplace=True)


       UserID           Timestamp       City  Time_Since_Last_TXN_Sec  \
34990    1000 2025-01-25 23:30:07       Pune                  65166.0   
34991    1000 2025-01-26 08:56:03  Hyderabad                  33956.0   
34992    1000 2025-01-27 11:50:33    Kolkata                  96870.0   
34993    1000 2025-01-28 15:04:42      Delhi                  98049.0   
34994    1000 2025-01-28 15:15:08  Bangalore                    626.0   
34995    1000 2025-01-29 21:01:46  Bangalore                 107198.0   
34996    1000 2025-01-30 18:25:56    Kolkata                  77050.0   
34997    1000 2025-01-30 18:31:51      Delhi                    355.0   
34998    1000 2025-01-30 18:37:24       Pune                    333.0   
34999    1000 2025-01-30 18:43:24       Pune                    360.0   

       Distance_Km  Geo_Velocity_Check  
34990   505.754121            0.027940  
34991   505.754121            0.053620  
34992  1181.882100            0.043923  
34993  1317.753603            0.

In [102]:
df.reset_index(drop=True, inplace=True)

WINDOW_SIZE = '30min'

def calculate_lookback_count_values(group):
    group_indexed = group.set_index('Timestamp')
    result = group_indexed['Amount'].rolling(
        WINDOW_SIZE,
        closed='left'
    ).count().fillna(0)
    return result.values

df['Txn_Count_30_Min'] = df.groupby('UserID', group_keys=False).apply(
    calculate_lookback_count_values
).explode().values

df['Txn_Count_30_Min'] = df['Txn_Count_30_Min'].astype(float).fillna(0).astype(int)

print(df[['UserID', 'Timestamp', 'Fraud_Type', 'Time_Since_Last_TXN_Sec', 'Txn_Count_30_Min']].tail(10))

       UserID           Timestamp  Fraud_Type  Time_Since_Last_TXN_Sec  \
34990    1000 2025-01-25 23:30:07           0                  65166.0   
34991    1000 2025-01-26 08:56:03           0                  33956.0   
34992    1000 2025-01-27 11:50:33           0                  96870.0   
34993    1000 2025-01-28 15:04:42           0                  98049.0   
34994    1000 2025-01-28 15:15:08           0                    626.0   
34995    1000 2025-01-29 21:01:46           1                 107198.0   
34996    1000 2025-01-30 18:25:56           0                  77050.0   
34997    1000 2025-01-30 18:31:51           3                    355.0   
34998    1000 2025-01-30 18:37:24           3                    333.0   
34999    1000 2025-01-30 18:43:24           4                    360.0   

       Txn_Count_30_Min  
34990                 0  
34991                 0  
34992                 0  
34993                 0  
34994                 1  
34995                 0  
349

  df['Txn_Count_30_Min'] = df.groupby('UserID', group_keys=False).apply(


In [103]:
def calculate_category_score(group):
    cumulative_txn_count = pd.Series(range(len(group)), index=group.index).shift(1).fillna(0)

    group['Count'] = range(len(group))
    group['Category_Count'] = group.groupby('Category')['Count'].cumcount()
    group['Past_Category_Count'] = group['Category_Count'].shift(1).fillna(0)

    epsilon = 1e-6
    ratio = group['Past_Category_Count'] / (cumulative_txn_count + epsilon)

    return ratio

df['Category_Usage_Score'] = df.groupby('UserID', group_keys=False).apply(
    calculate_category_score
)

df['Category_Usage_Score'] = df['Category_Usage_Score'].clip(upper=1.0)

print(df[['UserID', 'Timestamp', 'Category', 'Fraud_Type', 'Category_Usage_Score']].tail(10))

       UserID           Timestamp      Category  Fraud_Type  \
34990    1000 2025-01-25 23:30:07       Grocery           0   
34991    1000 2025-01-26 08:56:03     Utilities           0   
34992    1000 2025-01-27 11:50:33        Travel           0   
34993    1000 2025-01-28 15:04:42        Travel           0   
34994    1000 2025-01-28 15:15:08     Utilities           0   
34995    1000 2025-01-29 21:01:46        Travel           1   
34996    1000 2025-01-30 18:25:56        Luxury           0   
34997    1000 2025-01-30 18:31:51     Jewellery           3   
34998    1000 2025-01-30 18:37:24  Cash_Advance           3   
34999    1000 2025-01-30 18:43:24  Cash_Advance           4   

       Category_Usage_Score  
34990              0.476190  
34991              0.500000  
34992              0.304348  
34993              0.083333  
34994              0.120000  
34995              0.307692  
34996              0.148148  
34997              0.035714  
34998              0.000000  
34999 

  df['Category_Usage_Score'] = df.groupby('UserID', group_keys=False).apply(


In [104]:
categorical_cols = ['City', 'Category']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(f"New DataFrame shape: {df.shape}")
print(df.head()[['City_Delhi', 'Category_Jewellery', 'Amount', 'Fraud_Type']].head())

New DataFrame shape: (35000, 28)
   City_Delhi  Category_Jewellery  Amount  Fraud_Type
0        True               False  754564           1
1       False               False  426647           1
2       False               False  102941           1
3       False               False  498411           1
4       False               False  195041           1


In [105]:
from sklearn.preprocessing import StandardScaler

numerical_features = [
    'Amount',
    'User_Mean_Amount',
    'User_Std_Amount',
    'Time_Since_Last_TXN_Sec',
    'Time_Since_Last_TXN_Hrs',
    'Amount_Z_Score',
    'Geo_Velocity_Check',
    'Txn_Count_30_Min',
    'Category_Usage_Score'
]

numerical_cols_to_scale = [col for col in numerical_features if col in df.columns]

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.drop(columns=['Fraud_Type', 'Timestamp', 'UserID', 'TransactionID'], errors='ignore')
y = df['Fraud_Type']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
if len(numerical_cols_to_scale) > 0:
    X_train.loc[:, numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
    X_test.loc[:, numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (28000, 25)
X_test shape: (7000, 25)


 -0.73827039]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
 -0.33110092]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  X_train.loc[:, numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
 -0.74365739]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])
 -0.33110092]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  X_test.loc[:, numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])


In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GroupKFold, GridSearchCV
import joblib

groups = df.loc[X_train.index, 'UserID']

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
    bootstrap=True,
    oob_score=True
)

param_grid = {
    'max_depth': [6, 8, 10],
    'min_samples_leaf': [5, 10, 20],
    'min_samples_split': [10, 20, 40],
    'max_features': ['sqrt', 0.5]
}

cv = GroupKFold(n_splits=5)
grid = GridSearchCV(
    rf,
    param_grid,
    cv=cv,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=0
)
grid.fit(X_train, y_train, groups=groups)

rf_model = grid.best_estimator_
rf_model.fit(X_train, y_train)

joblib.dump(rf_model, 'rf_model.pkl')

y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

print(classification_report(y_test, y_pred, target_names=[
    'Normal (0)', 'Type 1 Fraud (1)', 'Type 2 Fraud (2)', 'Type 3 Fraud (3)', 'Type 4 Fraud (4)'
]))
auc_ovr = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
print(f"Weighted Multi-Class AUC-ROC: {auc_ovr:.4f}")

                  precision    recall  f1-score   support

      Normal (0)       1.00      1.00      1.00      3000
Type 1 Fraud (1)       1.00      1.00      1.00      3000
Type 2 Fraud (2)       1.00      1.00      1.00       600
Type 3 Fraud (3)       0.99      1.00      0.99       200
Type 4 Fraud (4)       1.00      1.00      1.00       200

        accuracy                           1.00      7000
       macro avg       1.00      1.00      1.00      7000
    weighted avg       1.00      1.00      1.00      7000

Weighted Multi-Class AUC-ROC: 1.0000


In [108]:
# Train Random Forest (from your existing data - run this first)
import joblib
rf_model = joblib.load('rf_model.pkl')
df[df['Fraud_Type'] == 0]

Unnamed: 0,Timestamp,UserID,Amount,Fraud_Type,User_Mean_Amount,User_Std_Amount,Amount_Z_Score,Global_Z_Score,Time_Since_Last_TXN_Sec,Time_Since_Last_TXN_Hrs,...,City_Hyderabad,City_Kolkata,City_Mumbai,City_Pune,Category_Electronics,Category_Grocery,Category_Jewellery,Category_Luxury,Category_Travel,Category_Utilities
7,2025-01-07 09:33:27,1,277,0,412284.571429,214298.095805,-1.922591,-0.743629,21095.0,5.859722,...,False,False,False,False,False,False,False,False,False,True
10,2025-01-08 13:47:52,1,44,0,374271.100000,218929.625862,-1.709349,-0.744355,16938.0,4.705000,...,False,False,False,False,False,True,False,False,False,False
11,2025-01-08 17:43:07,1,99,0,340250.454545,236365.417226,-1.439091,-0.744184,14115.0,3.920833,...,False,True,False,False,False,False,False,False,False,True
16,2025-01-13 10:43:59,1,27,0,381869.937500,251657.611662,-1.517311,-0.744408,25485.0,7.079167,...,True,False,False,False,False,False,False,False,True,False
17,2025-01-13 23:16:52,1,143,0,359408.588235,260672.282755,-1.378227,-0.744047,45173.0,12.548056,...,True,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34991,2025-01-26 08:56:03,1000,257,0,199125.478261,328818.339808,-0.604797,-0.743691,33956.0,9.432222,...,True,False,False,False,False,False,False,False,False,True
34992,2025-01-27 11:50:33,1000,89,0,190839.291667,324142.603038,-0.588476,-0.744215,96870.0,26.908333,...,False,True,False,False,False,False,False,False,True,False
34993,2025-01-28 15:04:42,1000,339,0,183209.280000,319602.883163,-0.572180,-0.743436,98049.0,27.235833,...,False,False,False,False,False,False,False,False,True,False
34994,2025-01-28 15:15:08,1000,519,0,176175.807692,315192.601212,-0.557300,-0.742875,626.0,0.173889,...,False,False,False,False,False,False,False,False,False,True


In [109]:
import pandas as pd
import joblib
import numpy as np

# Load trained model
rf_model = joblib.load('rf_model.pkl')

# Load YOUR CSV (replace with actual filename, e.g., 'test_transactions.csv')
test_df = pd.read_csv('live_test_data.csv')  # CHANGE THIS PATH
print(f"Test data loaded: {test_df.shape}")
print(test_df.head())
print("\nColumn names:", test_df.columns.tolist())


Test data loaded: (520, 5)
             Timestamp  UserID  Amount       City     Category
0  2025-03-04 13:10:08    1001     202  Ahmedabad       Luxury
1  2025-03-07 15:56:17    1001      28  Ahmedabad       Travel
2  2025-03-01 09:06:36    1002     101  Ahmedabad  Electronics
3  2025-03-01 11:54:48    1002      78    Chennai      Grocery
4  2025-03-05 06:24:37    1002      59     Mumbai    Utilities

Column names: ['Timestamp', 'UserID', 'Amount', 'City', 'Category']


In [110]:
from datetime import datetime
from sklearn.preprocessing import StandardScaler

# CITYCOORDS from your notebook
CITYCOORDS = {
    'Mumbai': (19.0760, 72.8777), 'Delhi': (28.7041, 77.1025), 
    'Bangalore': (12.9716, 77.5946), 'Chennai': (13.0827, 80.2707),
    'Kolkata': (22.5726, 88.3639), 'Pune': (18.5204, 73.8567),
    'Hyderabad': (17.3850, 78.4867), 'Ahmedabad': (23.0225, 72.5714)
}

def haversine_distance(coord1, coord2, R=6371):
    lat1, lon1 = np.radians(coord1)
    lat2, lon2 = np.radians(coord2)
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

# Fix column names (handle spaces/underscores)
test_df.columns = test_df.columns.str.strip().str.replace(' ', '_').str.replace('-', '_')

# Ensure required columns exist
required_cols = ['Timestamp', 'UserID', 'Amount', 'City', 'Category']
missing_cols = [col for col in required_cols if col not in test_df.columns]
if missing_cols:
    print(f"Warning: Missing columns: {missing_cols}")
    print("Available columns:", test_df.columns.tolist())

# Convert and sort
test_df['Timestamp'] = pd.to_datetime(test_df['Timestamp'])
test_df.sort_values(['UserID', 'Timestamp'], inplace=True)
test_df.reset_index(drop=True, inplace=True)

# Feature engineering (FIXED: no inplace warnings)
global_mean = test_df['Amount'].mean()
global_std = test_df['Amount'].std()

test_df['UserMeanAmount'] = test_df.groupby('UserID')['Amount'].expanding().mean().shift(1).reset_index(0, drop=True)
test_df['UserStdAmount'] = test_df.groupby('UserID')['Amount'].expanding().std().shift(1).reset_index(0, drop=True)
test_df['UserMeanAmount'] = test_df['UserMeanAmount'].fillna(global_mean)
test_df['UserStdAmount'] = test_df['UserStdAmount'].fillna(global_std).replace(0, 1)
test_df['AmountZScore'] = (test_df['Amount'] - test_df['UserMeanAmount']) / test_df['UserStdAmount']

# Time features
test_df['TimeDelta'] = test_df.groupby('UserID')['Timestamp'].diff()
test_df['TimeSinceLastTXNSec'] = test_df['TimeDelta'].dt.total_seconds()
test_df['TimeSinceLastTXNHrs'] = test_df['TimeSinceLastTXNSec'] / 3600
max_time_delta = (test_df['Timestamp'].max() - test_df['Timestamp'].min()).total_seconds()
test_df['TimeSinceLastTXNSec'] = test_df['TimeSinceLastTXNSec'].fillna(max_time_delta)
test_df['TimeSinceLastTXNHrs'] = test_df['TimeSinceLastTXNHrs'].fillna(max_time_delta/3600)
test_df.drop('TimeDelta', axis=1, inplace=True)

# Geo features
test_df['PrevCity'] = test_df.groupby('UserID')['City'].shift(1).fillna(test_df['City'])
test_df['DistanceKm'] = test_df.apply(
    lambda row: haversine_distance(CITYCOORDS.get(row['PrevCity'], (0,0)), 
                                   CITYCOORDS.get(row['City'], (0,0))), axis=1)
MAX_SPEED_KMS = 1000 / 3.6
test_df['MinTravelTimeSec'] = test_df['DistanceKm'] / MAX_SPEED_KMS
test_df['GeoVelocityCheck'] = test_df['MinTravelTimeSec'] < (test_df['TimeSinceLastTXNSec'] + 1e-6)
test_df.drop('PrevCity', axis=1, inplace=True)

# Rolling count (FIXED deprecation warning)
def calculate_lookback_count(group):
    group_indexed = group.set_index('Timestamp')
    result = group_indexed['Amount'].rolling('30min', closed='left').count().fillna(0)
    return result.values

test_df['TxnCount30Min'] = test_df.groupby('UserID', group_keys=False).apply(
    calculate_lookback_count, include_groups=False).explode().values
test_df['TxnCount30Min'] = pd.to_numeric(test_df['TxnCount30Min'], errors='coerce').fillna(0).astype(int)

# Category score (FIXED)
def calculate_category_score(group):
    cumulative_txn_count = pd.Series(range(len(group)), index=group.index).shift(1).fillna(0)
    group_category_count = group.groupby('Category').cumcount()
    group_past_category_count = group_category_count.shift(1).fillna(0)
    return group_past_category_count / (cumulative_txn_count + 1e-6)

test_df['CategoryUsageScore'] = test_df.groupby('UserID', group_keys=False).apply(
    calculate_category_score, include_groups=False)
test_df['CategoryUsageScore'] = test_df['CategoryUsageScore'].clip(upper=1.0)

test_df['City_raw'] = test_df['City']
test_df['Category_raw'] = test_df['Category']

# One-hot encoding
test_df = pd.get_dummies(test_df, columns=['City', 'Category'], drop_first=True)

print("Features ready. Shape:", test_df.shape)
test_df


Features ready. Shape: (520, 27)


Unnamed: 0,Timestamp,UserID,Amount,UserMeanAmount,UserStdAmount,AmountZScore,TimeSinceLastTXNSec,TimeSinceLastTXNHrs,DistanceKm,MinTravelTimeSec,...,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,City_Pune,Category_Electronics,Category_Grocery,Category_Luxury,Category_Travel,Category_Utilities
0,2025-03-04 13:10:08,1001,202,14050.888462,101425.047547,-0.136543,629398.0,174.832778,0.000000,0.000000,...,False,False,False,False,False,False,False,True,False,False
1,2025-03-07 15:56:17,1001,28,202.000000,101425.047547,-0.001716,269169.0,74.769167,0.000000,0.000000,...,False,False,False,False,False,False,False,False,True,False
2,2025-03-01 09:06:36,1002,101,115.000000,123.036580,-0.113787,629398.0,174.832778,0.000000,0.000000,...,False,False,False,False,False,True,False,False,False,False
3,2025-03-01 11:54:48,1002,78,101.000000,101425.047547,-0.000227,10092.0,2.803333,1371.808916,4.938512,...,False,False,False,False,False,False,True,False,False,False
4,2025-03-05 06:24:37,1002,59,89.500000,16.263456,-1.875370,325789.0,90.496944,1033.098512,3.719155,...,False,False,False,True,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,2025-03-03 06:39:19,1198,60,130.000000,49.497475,-1.414214,55886.0,15.523889,0.000000,0.000000,...,False,False,False,False,True,False,True,False,False,False
516,2025-03-03 19:49:51,1198,350,106.666667,53.463383,4.551402,47432.0,13.175556,0.000000,0.000000,...,False,False,False,False,True,False,False,False,False,False
517,2025-03-04 14:02:54,1198,96,167.500000,129.260718,-0.553146,65583.0,18.217500,518.126083,1.865254,...,False,False,False,False,False,False,False,False,True,False
518,2025-03-04 13:11:54,1199,51,153.200000,116.420359,-0.877853,629398.0,174.832778,0.000000,0.000000,...,False,False,False,False,False,True,False,False,False,False


In [None]:
def rolling_count_60(group):
    gi = group.set_index('Timestamp')
    r = gi['Amount'].rolling('60s', closed='both').count().fillna(0)
    return r.values

test_df['TxnCount60Sec'] = test_df.groupby('UserID', group_keys=False).apply(rolling_count_60, include_groups=False).explode().values
prev_city = test_df.groupby('UserID')['City_raw'].shift(1).fillna(test_df['City_raw'])
def chain_burst(group, threshold_sec):
    d = group['TimeSinceLastTXNSec'].values
    chain = np.zeros_like(d, dtype=int)
    c = 1
    for i in range(len(d)):
        if np.isnan(d[i]) or d[i] > threshold_sec:
            c = 1
        else:
            c += 1
        chain[i] = c
    return pd.Series(chain, index=group.index)
test_df['BurstChain60'] = test_df.groupby('UserID').apply(lambda g: chain_burst(g, 60)).reset_index(level=0, drop=True)
test_df['BurstChain30'] = test_df.groupby('UserID').apply(lambda g: chain_burst(g, 30)).reset_index(level=0, drop=True)
hv = test_df['Amount'] > 100000
diff20 = test_df['TimeSinceLastTXNSec'] <= 20
vel = (test_df['TxnCount60Sec'] >= 5) | diff20 | (test_df['BurstChain30'] >= 2) | (test_df['BurstChain60'] >= 5)
fast = (prev_city != test_df['City_raw']) & (test_df['TimeSinceLastTXNSec'] <= 300) & (test_df['DistanceKm'] >= 200)
test_df['Rule_HighValue'] = hv
test_df['Rule_Velocity'] = vel
test_df['Rule_FastLocationSwitch_5m'] = fast
test_df['Rule_Fraud'] = hv | vel | fast
def reason_row(r):
    lst = []
    if r['Rule_HighValue']:
        lst.append('HighValue')
    if r['Rule_Velocity']:
        lst.append('Velocity')
    if r['Rule_FastLocationSwitch_5m']:
        lst.append('FastLocation5m')
    return '+'.join(lst)
test_df['Rule_Reason'] = test_df.apply(reason_row, axis=1)
flagged = test_df[test_df['Rule_Fraud']]
print(f"Flagged transactions: {flagged.shape[0]}")
print(f"HighValue count: {flagged['Rule_HighValue'].sum()}")
print(f"Velocity count: {flagged['Rule_Velocity'].sum()}")
print(f"FastLocation5m count: {flagged['Rule_FastLocationSwitch_5m'].sum()}")
cols = ['UserID','Amount','Timestamp','City_raw','Rule_Reason','TxnCount60Sec','TimeSinceLastTXNSec','DistanceKm']
cols = [c for c in cols if c in flagged.columns]
print(flagged[cols].head(20).to_string(index=False))
flagged[cols].to_csv('RULE_BASED_FRAUDS_520TEST.csv', index=False)
print("Saved: RULE_BASED_FRAUDS_520TEST.csv")
   

Flagged transactions: 17
HighValue count: 10
Velocity count: 6
FastLocation5m count: 4
 UserID  Amount           Timestamp  City_raw             Rule_Reason TxnCount60Sec  TimeSinceLastTXNSec  DistanceKm
   1004      10 2025-03-05 00:18:39 Hyderabad          FastLocation5m           1.0                286.0  621.462850
   1011  534472 2025-03-08 01:52:00      Pune               HighValue           1.0             629398.0    0.000000
   1017  912546 2025-03-08 02:22:00    Mumbai               HighValue           1.0              15654.0  439.979823
   1029     238 2025-03-08 07:16:05   Kolkata                Velocity           2.0                  5.0    0.000000
   1029     258 2025-03-08 07:16:10   Kolkata                Velocity           3.0                  5.0    0.000000
   1029      97 2025-03-08 07:16:15   Kolkata                Velocity           4.0                  5.0    0.000000
   1065    3437 2025-03-08 04:53:07   Chennai Velocity+FastLocation5m           2.0           

  test_df['BurstChain60'] = test_df.groupby('UserID').apply(lambda g: chain_burst(g, 60)).reset_index(level=0, drop=True)
  test_df['BurstChain30'] = test_df.groupby('UserID').apply(lambda g: chain_burst(g, 30)).reset_index(level=0, drop=True)
