In [2]:
# Colab-only step: bring a local file into the runtime so later cells can read it.
# This import is only available inside Google Colab.
from google.colab import files
# The recorded output of this cell shows the uploaded file being saved as
# "creditcard.csv", which is the path the data-loading cell reads.
uploaded = files.upload()

Saving creditcard.csv to creditcard.csv


In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors
# Load creditcard.csv from the working directory. If (and only if) the file is
# missing, fall back to a small synthetic dataset so the rest of the notebook
# still runs end to end.
N_SYNTHETIC_ROWS = 2000
try:
    df = pd.read_csv("creditcard.csv")
    print("Dataset loaded.")
except FileNotFoundError:
    # Catch only the missing-file case: a corrupt or unreadable CSV should fail
    # loudly rather than be silently replaced by random data.
    print("creditcard.csv not found — creating simple synthetic dataset.")
    np.random.seed(42)  # fixed seed so the fallback data is reproducible
    df = pd.DataFrame({
        "V1": np.random.normal(0, 1, N_SYNTHETIC_ROWS),
        "V2": np.random.normal(0, 1, N_SYNTHETIC_ROWS),
        "Amount": np.random.exponential(50, N_SYNTHETIC_ROWS),
        "Class": np.random.choice([0, 1], N_SYNTHETIC_ROWS, p=[0.97, 0.03])   # 3% fraud
    })

print(df.head())

# Use only three columns as model inputs; "Class" is the label (1 = fraud).
X = df[["V1", "V2", "Amount"]]
y = df["Class"]

# Split BEFORE scaling so that test-set statistics never influence the scaler
# (fitting the scaler on all rows leaks information from the test set).
# stratify=y keeps the class ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows only
X_test = scaler.transform(X_test_raw)

# Scaled view of every row, in the same order as df, using the train-fitted
# scaler. Kept under its original name because the nearest-neighbour index
# later in this cell is fitted on X_scaled and maps positions back into df.
X_scaled = scaler.transform(X)

# Classifier: logistic regression with class re-weighting, trained on the
# scaled training split.
model = LogisticRegression(class_weight="balanced").fit(X_train, y_train)

print("\n=== MODEL EVALUATION ===")
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Retrieval index over every scaled row; used later to look up the 3 most
# similar historical transactions for a new one.
knn = NearestNeighbors(n_neighbors=3).fit(X_scaled)
print("\n=== PREDICT A NEW TRANSACTION ===")
# Build the query as a one-row DataFrame with the same column labels as X.
# The scaler was fitted on a DataFrame, so passing a bare array makes
# scikit-learn warn about missing feature names and relies on implicit
# column order; labelled input avoids both.
new_tx = pd.DataFrame([[0.5, -1.2, 120]], columns=X.columns)
new_tx_scaled = scaler.transform(new_tx)

# predict() returns an array with one entry per row; take the single result.
prediction = int(model.predict(new_tx_scaled)[0])
# Column 1 of predict_proba is read as the fraud (class 1) probability.
prob = float(model.predict_proba(new_tx_scaled)[0][1])

print("Predicted Class:", prediction, " (1 = fraud, 0 = legit)")
print("Probability of Fraud:", round(prob, 4))
# Look up the nearest historical transactions to the scaled query.
# kneighbors returns one row of results per query row; there is one query.
distances, indices = knn.kneighbors(new_tx_scaled)
indices = indices[0]

print("\n=== SIMILAR PAST TRANSACTIONS (RAG) ===")
fraud_count = 0
for idx in indices:
    row = df.iloc[idx]
    # A single row taken across int and float columns comes back as one
    # float Series, so the label would print as "0.0"/"1.0" and turn the
    # running count into a float. Cast it back to an int label.
    label = int(row.Class)
    print(f"V1={row.V1:.2f}, V2={row.V2:.2f}, Amount={row.Amount:.2f}, Class={label}")
    fraud_count += label
print("\n=== SIMPLE EXPLANATION ===")
if prediction == 1:
    print(" The model predicts this transaction is FRAUD.")
else:
    print(" The model predicts this transaction is LEGITIMATE.")

# Derive the neighbour count from the retrieved indices instead of hard-coding
# 3, so the message and thresholds stay correct if n_neighbors changes.
n_similar = len(indices)
# Format the tally as a whole number even if it was accumulated as a float.
n_fraud = int(fraud_count)
print(f"Among the {n_similar} most similar past transactions, {n_fraud} were frauds.")

# "Mostly fraudulent" means a strict majority of neighbours; for 3 neighbours
# this is the same as the previous ">= 2" rule.
if 2 * n_fraud > n_similar:
    print("Explanation: Similar transactions in history were mostly fraudulent, so risk is high.")
elif n_fraud >= 1:
    print(" Explanation: A few similar past cases were fraud, so review is recommended.")
else:
    print(" Explanation: Historical similar cases were clean, so risk is low.")


Dataset loaded.
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26     

