## Step 1 – Install Required Libraries (run only once)

In [1]:
!pip install lightgbm imbalanced-learn shap
# Expected: installs packages, no output except install logs




## Step 2 – Load Dataset Sample

In [2]:
import pandas as pd

# Input file; keep it in the same folder as this notebook.
DATA_PATH = "Fraud.csv"
# Maximum number of rows to read; lower this if RAM is limited.
SAMPLE_N = 1_000_000

# Read only the first SAMPLE_N rows of the CSV rather than the whole file.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, nrows=SAMPLE_N)
df.shape
# Expected output for this sample: (1000000, 11)


(1000000, 11)

## Step 3 – Feature Engineering

In [3]:
# Build the engineered columns in one chained expression, then drop the raw
# account-name columns they were derived from.
df = (
    df.assign(
        # First character of each account name.
        orig_type=lambda t: t['nameOrig'].str[0],
        dest_type=lambda t: t['nameDest'].str[0],
        # Balance movement on each side of the transaction.
        orig_balance_diff=lambda t: t['oldbalanceOrg'] - t['newbalanceOrig'],
        dest_balance_diff=lambda t: t['newbalanceDest'] - t['oldbalanceDest'],
        # 1 when a positive amount left the corresponding balance unchanged, else 0.
        orig_no_change=lambda t: (
            (t['amount'] > 0) & (t['orig_balance_diff'] == 0)
        ).astype(int),
        dest_no_change=lambda t: (
            (t['amount'] > 0) & (t['dest_balance_diff'] == 0)
        ).astype(int),
    )
    .drop(columns=['nameOrig', 'nameDest'])
)

df.head()
# Expected: the first 5 rows, including the six engineered columns


Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,orig_type,dest_type,orig_balance_diff,dest_balance_diff,orig_no_change,dest_no_change
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,C,M,9839.64,0.0,0,1
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,C,M,1864.28,0.0,0,1
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,C,C,181.0,0.0,0,1
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,C,C,181.0,-21182.0,0,0
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,C,M,11668.14,0.0,0,1


## Step 4 – Prepare Features and Target

In [4]:
# Target vector: the fraud label.
y = df['isFraud']

# Feature matrix: remove the label, the flag column and the step column,
# one-hot encode the categorical columns (dropping the first level of each),
# and replace any missing values with 0.
X = (
    df.drop(columns=['isFraud', 'isFlaggedFraud', 'step'])
    .pipe(pd.get_dummies, columns=['type', 'orig_type', 'dest_type'], drop_first=True)
    .fillna(0)
)

(X.shape, y.value_counts())
# Expected: the shape of X and the number of non-fraud (0) / fraud (1) rows


((1000000, 14),
 isFraud
 0    999465
 1       535
 Name: count, dtype: int64)

## Step 5 – Train/Test Split + Handle Imbalance

In [5]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows for testing; stratifying on y keeps the fraud
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Resample the training portion only; the test portion is left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

(X_res.shape, y_res.sum(), y_train.sum())
# Expected: resampled training shape, fraud count after SMOTE, fraud count before SMOTE


((1599144, 14), np.int64(799572), np.int64(428))

## Step 6 – Train LightGBM Model

In [6]:
import lightgbm as lgb

# Booster settings: binary objective, AUC as the evaluation metric, fixed seed.
params = {
    'objective':'binary',
    'metric':'auc',
    'learning_rate':0.05,
    'num_leaves':64,
    'verbose':-1,
    'seed':42
}

# Train on the SMOTE-resampled rows; evaluate on the untouched test split.
dtrain = lgb.Dataset(X_res, label=y_res)
dvalid = lgb.Dataset(X_test, label=y_test, reference=dtrain)

# NOTE(review): dvalid is the hold-out test split, so early stopping chooses
# the best iteration on the same rows that Step 7 scores. Those metrics are
# therefore optimistic; a separate validation split taken from the training
# data before resampling would remove the overlap.
#
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train() in LightGBM 4.0 and raise TypeError there.
clf = lgb.train(
    params, dtrain,
    num_boost_round=1000,
    valid_sets=[dtrain, dvalid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        lgb.log_evaluation(period=50),           # print metrics every 50 rounds
    ],
)
# Expected: LightGBM training log with AUC values every 50 iterations




Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.999221	valid_1's auc: 0.987783
Early stopping, best iteration is:
[42]	training's auc: 0.999136	valid_1's auc: 0.992766


## Step 7 – Evaluate Model

In [7]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, classification_report, confusion_matrix, auc

# Score the test rows and turn the scores into hard labels at a 0.5 cut-off.
y_prob = clf.predict(X_test)
y_pred = (y_prob >= 0.5).astype(int)

# Threshold-free summaries: area under the ROC curve, and area under the
# precision-recall curve.
roc = roc_auc_score(y_test, y_prob)
precision, recall, _ = precision_recall_curve(y_test, y_prob)
pr_auc = auc(recall, precision)

print("ROC-AUC:", roc)
print("PR-AUC:", pr_auc)
# Each heading is followed by its table on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Expected: ROC-AUC, PR-AUC, the confusion matrix at the 0.5 cut-off, and
# per-class precision/recall/F1


ROC-AUC: 0.9927657090936175
PR-AUC: 0.45945463425685984
Confusion Matrix:
[[197451   2442]
 [     4    103]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    199893
           1       0.04      0.96      0.08       107

    accuracy                           0.99    200000
   macro avg       0.52      0.98      0.54    200000
weighted avg       1.00      0.99      0.99    200000



## Step 8 – Top 10 Feature Importances

In [8]:
# Pair each feature name with the importance value reported by the booster
# (default importance settings), ordered from most to least important.
imp = (
    pd.DataFrame({'feature': clf.feature_name(), 'importance': clf.feature_importance()})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
imp.head(10)
# Expected: the 10 highest-ranked features with their importance values


Unnamed: 0,feature,importance
0,amount,493
1,newbalanceDest,440
2,oldbalanceDest,372
3,oldbalanceOrg,328
4,dest_balance_diff,306
5,orig_balance_diff,249
6,type_CASH_OUT,199
7,newbalanceOrig,112
8,type_TRANSFER,105
9,type_PAYMENT,27


## Step 9 – Save Outputs (optional)

In [9]:
# Persist the trained booster as a text model file.
clf.save_model(filename='lgbm_model.txt')
# Write the full importance table, without the index column.
imp.to_csv(path_or_buf='feature_importance.csv', index=False)
# Expected: both files are written to the notebook's working directory
