In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw transactions CSV into the working frame `df`.
df = pd.read_csv(filepath_or_buffer="AIML Dataset.csv")

In [3]:
# Down-sample to at most 1M rows. The fixed random_state makes the draw
# repeatable for a given input file.
# n is capped at len(df): DataFrame.sample raises ValueError when asked for
# more rows than exist (sampling without replacement), so the cap keeps this
# cell usable on smaller extracts. For inputs of 1M+ rows nothing changes.
df = df.sample(n=min(1_000_000, len(df)), random_state=42).reset_index(drop=True)

In [14]:
# Translate the raw `type` codes into readable labels stored in
# `transaction_type`. Codes missing from the mapping come out as NaN.
type_mapping = dict(
    PAYMENT="Payment",
    CASH_OUT="Withdrawal",
    TRANSFER="Transfer",
    CASH_IN="Deposit",
    DEBIT="Debit",
)
df["transaction_type"] = df["type"].map(type_mapping)

In [5]:
# Synthetic `channel` column: each row is assigned a channel at random
# according to the shares below (they sum to 1.0).
channel_probs = {
    "web": 0.30,
    "mobile_app": 0.40,
    "atm": 0.10,
    "branch": 0.05,
    "phone_banking": 0.05,
    "pos_terminal": 0.10
}

# Draw from a seeded generator so the column is identical on every run;
# the unseeded global np.random state produced a different column each time.
channel_rng = np.random.default_rng(42)
df["channel"] = channel_rng.choice(
    list(channel_probs.keys()),
    size=len(df),
    p=list(channel_probs.values())
)


In [6]:
# Build `transaction_time` by treating `step` as a number of hours elapsed
# since a fixed origin timestamp.
start_date = pd.Timestamp("2025-01-01")
df["transaction_time"] = pd.to_timedelta(df["step"], unit="h") + start_date

In [7]:
# --- time_since_last_transaction (in hours) ---
# `nameOrig` is used as the customer identifier. Rows are ordered per customer
# by time, then each row gets the gap to that customer's previous row.
df = df.sort_values(by=["nameOrig", "transaction_time"])

# A customer's first row has no predecessor (diff is NaT -> NaN); store 0 there.
df["time_since_last_transaction"] = (
    df.groupby("nameOrig")["transaction_time"]
    .diff()
    .dt.total_seconds()
    .div(3600.0)
    .fillna(0)
)


In [8]:
# Display the frame to inspect the rows and the columns added so far.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transaction_type,payment_method,transaction_time,time_since_last_transaction
785084,217,PAYMENT,3170.28,C1000001337,58089.00,54918.72,M216466820,0.00,0.00,0,0,Purchase,bank_transfer,2025-01-10 01:00:00,0.0
54986,19,PAYMENT,3849.38,C1000009135,103120.55,99271.17,M2120765976,0.00,0.00,0,0,Purchase,credit_card,2025-01-01 19:00:00,0.0
697666,302,PAYMENT,17686.93,C100001401,104117.89,86430.95,M70139133,0.00,0.00,0,0,Purchase,debit_card,2025-01-13 14:00:00,0.0
83576,163,CASH_OUT,77027.49,C1000015836,19787.00,0.00,C941241676,0.00,77027.49,0,0,Withdrawal,credit_card,2025-01-07 19:00:00,0.0
113113,96,CASH_OUT,290045.17,C1000018217,0.00,0.00,C187378283,914750.06,1204795.23,0,0,Withdrawal,mobile_wallet,2025-01-05 00:00:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136682,309,CASH_OUT,154530.72,C999989234,0.00,0.00,C1268024452,619404.86,773935.58,0,0,Withdrawal,crypto,2025-01-13 21:00:00,0.0
339068,227,CASH_OUT,444881.06,C999990251,0.00,0.00,C322679809,654370.05,1099251.11,0,0,Withdrawal,credit_card,2025-01-10 11:00:00,0.0
984348,398,CASH_IN,77250.43,C999991491,1357253.07,1434503.49,C931198233,1800518.14,1723267.71,0,0,Deposit,debit_card,2025-01-17 14:00:00,0.0
963710,307,PAYMENT,61919.08,C99999367,86524.98,24605.90,M1618919399,0.00,0.00,0,0,Purchase,credit_card,2025-01-13 19:00:00,0.0


In [9]:
# Print the index type, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 785084 to 393253
Data columns (total 15 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   step                         1000000 non-null  int64         
 1   type                         1000000 non-null  object        
 2   amount                       1000000 non-null  float64       
 3   nameOrig                     1000000 non-null  object        
 4   oldbalanceOrg                1000000 non-null  float64       
 5   newbalanceOrig               1000000 non-null  float64       
 6   nameDest                     1000000 non-null  object        
 7   oldbalanceDest               1000000 non-null  float64       
 8   newbalanceDest               1000000 non-null  float64       
 9   isFraud                      1000000 non-null  int64         
 10  isFlaggedFraud               1000000 non-null  int64         
 11  transac

In [8]:
# Write the current frame to fraud_dataset.csv, leaving out the index column.
df.to_csv("fraud_dataset.csv", index=False)


In [11]:
# Display the frame again to check its index and columns.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transaction_type,payment_method,transaction_time,time_since_last_transaction,is_new_device
0,217,PAYMENT,3170.28,C1000001337,58089.00,54918.72,M216466820,0.00,0.00,0,0,Purchase,bank_transfer,2025-01-10 01:00:00,0.0,1
1,19,PAYMENT,3849.38,C1000009135,103120.55,99271.17,M2120765976,0.00,0.00,0,0,Purchase,credit_card,2025-01-01 19:00:00,0.0,1
2,302,PAYMENT,17686.93,C100001401,104117.89,86430.95,M70139133,0.00,0.00,0,0,Purchase,debit_card,2025-01-13 14:00:00,0.0,1
3,163,CASH_OUT,77027.49,C1000015836,19787.00,0.00,C941241676,0.00,77027.49,0,0,Withdrawal,credit_card,2025-01-07 19:00:00,0.0,1
4,96,CASH_OUT,290045.17,C1000018217,0.00,0.00,C187378283,914750.06,1204795.23,0,0,Withdrawal,mobile_wallet,2025-01-05 00:00:00,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,309,CASH_OUT,154530.72,C999989234,0.00,0.00,C1268024452,619404.86,773935.58,0,0,Withdrawal,crypto,2025-01-13 21:00:00,0.0,1
999996,227,CASH_OUT,444881.06,C999990251,0.00,0.00,C322679809,654370.05,1099251.11,0,0,Withdrawal,credit_card,2025-01-10 11:00:00,0.0,1
999997,398,CASH_IN,77250.43,C999991491,1357253.07,1434503.49,C931198233,1800518.14,1723267.71,0,0,Deposit,debit_card,2025-01-17 14:00:00,0.0,1
999998,307,PAYMENT,61919.08,C99999367,86524.98,24605.90,M1618919399,0.00,0.00,0,0,Purchase,credit_card,2025-01-13 19:00:00,0.0,1


In [28]:
# Count how many rows carry each `is_new_device` value.
# NOTE(review): the cell that assigns `is_new_device` appears further down in
# this notebook, so a top-to-bottom run reaches this line before the column
# exists -- confirm the intended cell order.
df["is_new_device"].value_counts()

0    950000
1     50000
Name: is_new_device, dtype: int64

In [16]:
# Print the index type, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 16 columns):
 #   Column                       Non-Null Count    Dtype         
---  ------                       --------------    -----         
 0   step                         1000000 non-null  int64         
 1   type                         1000000 non-null  object        
 2   amount                       1000000 non-null  float64       
 3   nameOrig                     1000000 non-null  object        
 4   oldbalanceOrg                1000000 non-null  float64       
 5   newbalanceOrig               1000000 non-null  float64       
 6   nameDest                     1000000 non-null  object        
 7   oldbalanceDest               1000000 non-null  float64       
 8   newbalanceDest               1000000 non-null  float64       
 9   isFraud                      1000000 non-null  int64         
 10  isFlaggedFraud               1000000 non-null  int64         
 11  transaction_

In [15]:
# Save the frame to fraud_dataset.csv, leaving out the index column.
df.to_csv("fraud_dataset.csv", index=False)

In [14]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transaction_type,payment_method,transaction_time,time_since_last_transaction,is_new_device
0,217,PAYMENT,3170.28,C1000001337,58089.0,54918.72,M216466820,0.0,0.0,0,0,Purchase,bank_transfer,2025-01-10 01:00:00,0.0,1
1,19,PAYMENT,3849.38,C1000009135,103120.55,99271.17,M2120765976,0.0,0.0,0,0,Purchase,credit_card,2025-01-01 19:00:00,0.0,1
2,302,PAYMENT,17686.93,C100001401,104117.89,86430.95,M70139133,0.0,0.0,0,0,Purchase,debit_card,2025-01-13 14:00:00,0.0,1
3,163,CASH_OUT,77027.49,C1000015836,19787.0,0.0,C941241676,0.0,77027.49,0,0,Withdrawal,credit_card,2025-01-07 19:00:00,0.0,1
4,96,CASH_OUT,290045.17,C1000018217,0.0,0.0,C187378283,914750.06,1204795.23,0,0,Withdrawal,mobile_wallet,2025-01-05 00:00:00,0.0,1


In [17]:
# Count rows whose `nameOrig` value already occurred in an earlier row.
df["nameOrig"].duplicated().sum()

231

In [8]:
# Reset the flag: every row starts out as 0.
df["is_new_device"] = 0

# Repeat customers are left at 0; no extra handling is applied for them here.

# Work out how many rows must be switched to 1 to reach the target share.
target_ratio = 0.05
current_ratio = df["is_new_device"].mean()
needed = int((target_ratio - current_ratio) * len(df))

print(f"Currently {current_ratio:.4%}, need to flip {needed} rows to reach 5%.")

if needed > 0:
    # Fraudulent rows are flagged first, up to the number required.
    fraud_candidates = df.loc[df["isFraud"].eq(1) & df["is_new_device"].eq(0)]
    n_fraud = min(needed, len(fraud_candidates))
    extra_fraud = fraud_candidates.sample(n=n_fraud, random_state=42).index
    df.loc[extra_fraud, "is_new_device"] = 1

    # Any remaining quota is drawn at random from the non-fraud rows.
    still_needed = needed - n_fraud
    if still_needed > 0:
        nonfraud_candidates = df.loc[df["isFraud"].eq(0) & df["is_new_device"].eq(0)]
        extra_nonfraud = nonfraud_candidates.sample(n=still_needed, random_state=42).index
        df.loc[extra_nonfraud, "is_new_device"] = 1

# Report the share of flagged rows after the allocation.
print("Final ratio:", df["is_new_device"].mean())


Currently 0.0000%, need to flip 50000 rows to reach 5%.
Final ratio: 0.05


In [32]:
# Display the frame to inspect the newly added feature columns.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transaction_type,payment_method,transaction_time,time_since_last_transaction,is_new_device,amount_fraction_of_balance,is_new_destination
0,217.0,PAYMENT,3170.28,C1000001337,58089.00,54918.72,M216466820,0.00,0.00,0.0,0.0,Purchase,bank_transfer,2025-01-10 01:00:00,0.0,1,0.054576,1
1,19.0,PAYMENT,3849.38,C1000009135,103120.55,99271.17,M2120765976,0.00,0.00,0.0,0.0,Purchase,credit_card,2025-01-01 19:00:00,0.0,0,0.037329,1
2,302.0,PAYMENT,17686.93,C100001401,104117.89,86430.95,M70139133,0.00,0.00,0.0,0.0,Purchase,debit_card,2025-01-13 14:00:00,0.0,0,0.169874,1
3,163.0,CASH_OUT,77027.49,C1000015836,19787.00,0.00,C941241676,0.00,77027.49,0.0,0.0,Withdrawal,credit_card,2025-01-07 19:00:00,0.0,0,1.000000,1
4,96.0,CASH_OUT,290045.17,C1000018217,0.00,0.00,C187378283,914750.06,1204795.23,0.0,0.0,Withdrawal,mobile_wallet,2025-01-05 00:00:00,0.0,0,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,309.0,CASH_OUT,154530.72,C999989234,0.00,0.00,C1268024452,619404.86,773935.58,0.0,0.0,Withdrawal,crypto,2025-01-13 21:00:00,0.0,0,0.000000,1
999996,227.0,CASH_OUT,444881.06,C999990251,0.00,0.00,C322679809,654370.05,1099251.11,0.0,0.0,Withdrawal,credit_card,2025-01-10 11:00:00,0.0,0,0.000000,1
999997,398.0,CASH_IN,77250.43,C999991491,1357253.07,1434503.49,C931198233,1800518.14,1723267.71,0.0,0.0,Deposit,debit_card,2025-01-17 14:00:00,0.0,1,0.056917,1
999998,307.0,PAYMENT,61919.08,C99999367,86524.98,24605.90,M1618919399,0.00,0.00,0.0,0.0,Purchase,credit_card,2025-01-13 19:00:00,0.0,0,0.715621,1


In [9]:
# --- Feature 1: amount as a fraction of the origin balance ---
# Rows whose starting balance is not positive get 0 instead of the division
# result; the ratio is then capped to the [0, 1] range.
df["amount_fraction_of_balance"] = np.where(
    df["oldbalanceOrg"] > 0,
    df["amount"] / df["oldbalanceOrg"],
    0
)
df["amount_fraction_of_balance"] = df["amount_fraction_of_balance"].clip(0, 1)  # cap at 1

# --- Feature 2: is new destination ---
# Order rows per customer by time so "first occurrence" means earliest.
df = df.sort_values(by=["nameOrig", "transaction_time"]).reset_index(drop=True)

# 1 on the first row (in the sorted order) where an (origin, destination)
# pair appears, 0 on every later repeat of that pair. `duplicated` marks all
# but the first occurrence, so its negation is exactly the "first seen" flag.
# This replaces a row-by-row iterrows loop with one vectorized pass.
df["is_new_destination"] = (
    ~df.duplicated(subset=["nameOrig", "nameDest"], keep="first")
).astype("int64")


In [11]:
# Count how many rows carry each `is_new_destination` value.
df["is_new_destination"].value_counts()

1    1000000
Name: is_new_destination, dtype: int64

In [10]:
# Synthetic `LoginAttempts` column, drawn from a different distribution for
# fraudulent and non-fraudulent rows.
# A seeded generator makes the column identical on every run; the unseeded
# global np.random state produced different values each time.
login_rng = np.random.default_rng(43)

# Default everyone to 1 attempt (this value remains for any row whose
# isFraud is neither 0 nor 1).
df["LoginAttempts"] = 1

# Fraudulent transactions → more attempts
mask_fraud = df["isFraud"] == 1
df.loc[mask_fraud, "LoginAttempts"] = login_rng.choice(
    [1, 2, 3, 4, 5],
    size=mask_fraud.sum(),
    p=[0.5, 0.2, 0.15, 0.1, 0.05]
)

# Non-fraudulent transactions → mostly 1, but a few with 2–3
mask_nonfraud = df["isFraud"] == 0
df.loc[mask_nonfraud, "LoginAttempts"] = login_rng.choice(
    [1, 2, 3],
    size=mask_nonfraud.sum(),
    p=[0.9, 0.08, 0.02]   # 90% single attempt, 10% multiple
)


In [13]:
# List the column labels of the frame.
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'transaction_type', 'channel', 'transaction_time',
       'time_since_last_transaction', 'is_new_device',
       'amount_fraction_of_balance', 'is_new_destination', 'LoginAttempts'],
      dtype='object')