In [1]:
# Cell 1: import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Optional: handle class imbalance
try:
    from imblearn.over_sampling import SMOTE
    imblearn_available = True
except Exception:
    imblearn_available = False
    print("imblearn not installed — you can install it with: pip install imblearn")


In [2]:
# Cell 2: load dataset
file_path = r'C:\Users\u\Desktop\india.csv'  
df = pd.read_csv(file_path, sep=',') 

#Displalying data
print("Rows,Cols:", df.shape)
df.head()


Rows,Cols: (7953, 10)


Unnamed: 0,transaction_id,customer_id,merchant_id,amount,transaction_time,card_type,location,purchase_category,customer_age,is_fraudulent
0,1.0,684415.0,2028.0,1262.77,11/24/2023 22:39,Rupay,Bangalore,,28.0,0.0
1,2.0,447448.0,2046.0,2222.928,03/30/2024 16:18,MasterCard,Surat,POS,62.0,0.0
2,3.0,975001.0,2067.0,7509.832,03/07/2024 18:27,MasterCard,Hyderabad,POS,24.0,0.0
3,4.0,976547.0,,2782.965,02/01/2024 0:58,Rupay,Hyderabad,Digital,62.0,0.0
4,5.0,935741.0,2044.0,,12/22/2023 18:42,,Bangalore,Digital,19.0,0.0


In [3]:
# Cell 3: Set target column
target_col = 'is_fraudulent'  # column to predict

# Convert transaction_time to datetime and extract numeric features
df['transaction_time'] = pd.to_datetime(df['transaction_time'], errors='coerce')
df['hour'] = df['transaction_time'].dt.hour.fillna(0).astype(int)
df['month'] = df['transaction_time'].dt.month.fillna(0).astype(int)
df['day'] = df['transaction_time'].dt.day.fillna(0).astype(int)

# Drop original transaction_time (we already extracted numeric features)
df.drop(columns=['transaction_time'], inplace=True)

# Fill missing numeric columns with 0
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# Handle categorical columns using one-hot encoding
categorical_cols = ['card_type','location','purchase_category']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate features and target
X = df.drop(columns=[target_col])
y = df[target_col]

print("Features shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Features shape: (7953, 20)
Target distribution:
 is_fraudulent
0.0    5688
1.0    2265
Name: count, dtype: int64


In [4]:
# Cell 4: Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (6362, 20) Test: (1591, 20)


In [5]:
# Cell 5: Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Cell 6: Optional SMOTE oversampling
if imblearn_available:
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)
    print("After SMOTE, class counts:", np.bincount(y_train_res))
else:
    X_train_res, y_train_res = X_train_scaled, y_train
    print("SMOTE not used, using original training set:", np.bincount(y_train_res))


After SMOTE, class counts: [4550 4550]


In [7]:
# Cell 7: Train Random Forest
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'  # helps with imbalanced classes
)
rf.fit(X_train_res, y_train_res)


In [8]:
# Cell 8: Predictions
y_prob = rf.predict_proba(X_test_scaled)[0,1]
threshold=0.3
y_pred = (y_prob>= threshold).astype(int)


print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

roc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", round(roc,4))

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


Classification Report:


InvalidParameterError: The 'y_pred' parameter of classification_report must be an array-like or a sparse matrix. Got 0 instead.

In [None]:
# Cell 9: CREDIT CARD FRAUD DETECTION RESULTS (Original Numbers)

# 1. Make predictions on scaled test set
y_pred = rf.predict(X_test_scaled)
y_prob = rf.predict_proba(X_test_scaled)[0, 1]

# 2. Add predictions and actual labels to a copy of the ORIGINAL test data
fraud_predictions = X_test.copy()  # use unscaled numbers
fraud_predictions['Predicted'] = y_pred
fraud_predictions['Actual'] = y_test.values
fraud_predictions['Fraud_Probability'] = y_prob

# 3. Reset index for clean display
fraud_predictions = fraud_predictions.reset_index(drop=True)

# 4. Show top 10 predictions for overview
print("All Predictions Overview (Original Numbers):")
display(fraud_predictions.head(10))

# 5. Show only predicted fraudulent transactions
fraud_only = fraud_predictions[fraud_predictions['Predicted'] == 1]
print("Predicted Fraudulent Transactions (Original Numbers):")
display(fraud_only)

# 6. Show top 10 transactions with highest fraud probability
print("Top 10 Transactions with Highest Fraud Probability:")
display(fraud_predictions.sort_values(by='Fraud_Probability', ascending=False).head(10))



In [None]:
# Cell 10: Feature importance
importances = rf.feature_importances_
feat_names = X.columns
imp_df = pd.DataFrame({'feature': feat_names, 'importance': importances}).sort_values('importance', ascending=False)
display(imp_df.head(20))


In [None]:
# Cell 11: Save model and scaler for later use
import joblib
joblib.dump(rf, 'rf_fraud_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
print("Model and scaler saved.")


In [None]:
!pip install streamlit joblib

In [None]:
%%writefile fraud_app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Load trained model and scaler
rf = joblib.load('rf_fraud_model.joblib')
scaler = joblib.load('scaler.joblib')

# Load test data to calculate real metrics
# Replace with your dataset path
file_path = r'C:\Users\u\Desktop\india.csv'
df = pd.read_csv(file_path, sep=',')

# --- Preprocessing same as training ---
target_col = 'is_fraudulent'
df['transaction_time'] = pd.to_datetime(df['transaction_time'], errors='coerce')
df['hour'] = df['transaction_time'].dt.hour.fillna(0).astype(int)
df['month'] = df['transaction_time'].dt.month.fillna(0).astype(int)
df['day'] = df['transaction_time'].dt.day.fillna(0).astype(int)
df.drop(columns=['transaction_time'], inplace=True)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(0)
categorical_cols = ['card_type','location','purchase_category']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

X = df.drop(columns=[target_col])
y = df[target_col]

# Align columns with training model
model_columns = [
    'transaction_id', 'customer_id', 'merchant_id', 'amount', 'customer_age', 'hour', 'month', 'day',
    'card_type_Rupay', 'card_type_Visa', 'location_Bangalore', 'location_Chennai', 'location_Delhi',
    'location_Hyderabad', 'location_Jaipur', 'location_Kolkata', 'location_Mumbai', 'location_Pune',
    'location_Surat', 'purchase_category_POS'
]

for col in model_columns:
    if col not in X.columns:
        X[col] = 0
X = X[model_columns]

# Scale features
X_scaled = scaler.transform(X)

# Predict on full test set
y_pred = rf.predict(X_scaled)
y_prob = rf.predict_proba(X_scaled)[:,1]

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
roc = roc_auc_score(y, y_prob)

st.title("CREDIT CARD FRAUD DETECTION")
st.write(f"**Model Accuracy:** {round(accuracy*100,2)}%")
st.write(f"**ROC-AUC Score:** {round(roc,4)}")

# --- Dropdown options ---
card_types = ["Rupay", "Visa", "MasterCard","other"]
locations = ["Bangalore","Chennai","Delhi","Hyderabad","Jaipur","Kolkata","Mumbai","Pune","Surat","Ahmedabad","Andhra Pradesh","Vishakapatnam"]
purchase_categories = ["POS","Digital","other"]

# --- User Input ---
transaction_id = st.number_input("Transaction ID", step=1)
customer_id = st.number_input("Customer ID", step=1)
merchant_id = st.number_input("Merchant ID", step=1)
amount = st.number_input("Amount", step=0.01)
customer_age = st.number_input("Customer Age", step=1)
hour = st.number_input("Transaction Hour (0-23)", step=1, min_value=0, max_value=23)
month = st.number_input("Transaction Month (1-12)", step=1, min_value=1, max_value=12)
day = st.number_input("Transaction Day (1-31)", step=1, min_value=1, max_value=31)
card_type = st.selectbox("Card Type", card_types)
location = st.selectbox("Location", locations)
purchase_category = st.selectbox("Purchase Category", purchase_categories)

# --- Predict ---
if st.button("PREDICT FRAUD"):
    # Create DataFrame
    new_trans = pd.DataFrame({
        'transaction_id':[transaction_id],
        'customer_id':[customer_id],
        'merchant_id':[merchant_id],
        'amount':[amount],
        'customer_age':[customer_age],
        'hour':[hour],
        'month':[month],
        'day':[day],
        'card_type':[card_type],
        'location':[location],
        'purchase_category':[purchase_category]
    })
    
    # One-hot encode categorical features
    new_trans_encoded = pd.get_dummies(new_trans, columns=['card_type','location','purchase_category'])
    
    # Add missing columns as 0
    for col in model_columns:
        if col not in new_trans_encoded.columns:
            new_trans_encoded[col] = 0
            
    # Ensure column order matches training
    new_trans_encoded = new_trans_encoded[model_columns]
    
    # Scale features
    new_trans_scaled = scaler.transform(new_trans_encoded)
    
    # Predict
    pred = rf.predict(new_trans_scaled)[0]
    prob = rf.predict_proba(new_trans_scaled)[0,1]
    
    st.write("Predicted Fraud", "Yes" if pred==1 else "No")
    st.write("Fraud Probability:", round(prob, 4))
