<a href="https://colab.research.google.com/github/Narendra69/Predctive-analytic/blob/credit-card-fraud-detection/Credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- 1. Load and Explore the Data ---

# Load the dataset from the CSV file.
# Malformed lines are still skipped so a damaged file does not abort the run,
# but 'warn' reports every skipped line instead of hiding the data loss.
data = pd.read_csv('/content/creditcard.csv', on_bad_lines='warn')


print("--- Data Head ---")
print(data.head())
print("\n--- Data Description ---")
print(data.describe())

# Check for the class imbalance
print("\n--- Class Distribution ---")
class_distribution = data['Class'].value_counts()
# value_counts() leaves out rows whose 'Class' is missing, so the percentage is
# taken over the labelled rows only rather than over len(data).
labelled_total = class_distribution.sum()
print(class_distribution)
print(f"\nLegitimate Transactions (Class 0): {class_distribution[0]}")
print(f"Fraudulent Transactions (Class 1): {class_distribution[1]}")
print(f"Percentage of Fraud: {class_distribution[1] / labelled_total * 100:.4f}%")
print("-" * 30)


# --- 2. Pre-processing ---

# Drop rows with missing values in the 'Class' column first, so unlabeled rows
# influence neither the scaler statistics nor the stratified split.
data = data.dropna(subset=['Class'])

# 'Time' is not used as a feature and is dropped. 'Amount' stays raw here and is
# scaled after the split (see below) to keep test data out of the scaler fit.
data = data.drop(['Time'], axis=1)

# --- 3. Prepare Data for Modeling ---

# Define features (X) and target (y). X still holds the raw 'Amount' column.
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
# We use 'stratify=y' to ensure the class distribution is the same in train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 'Amount' is not on the same scale as the V-features, so standardize it.
# The scaler is fitted on the training amounts only and then applied to the
# test amounts; fitting on the full dataset would leak test-set statistics.
scaler = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train['scaled_Amount'] = scaler.fit_transform(
    X_train['Amount'].values.reshape(-1, 1)
).ravel()
X_test['scaled_Amount'] = scaler.transform(
    X_test['Amount'].values.reshape(-1, 1)
).ravel()

# The raw amount is replaced by its scaled version in both partitions.
X_train = X_train.drop('Amount', axis=1)
X_test = X_test.drop('Amount', axis=1)

# --- 4. Train a Baseline Model (Logistic Regression) ---
print("\n--- Training Logistic Regression (Baseline) ---")

# Fit the baseline classifier on the training split; fit() returns the estimator.
lr_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out split
y_pred_lr = lr_model.predict(X_test)

# Confusion matrix layout (rows = actual class, columns = predicted class):
# [[True Negatives, False Positives],
#  [False Negatives, True Positives]]
lr_confusion = confusion_matrix(y_test, y_pred_lr)
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=['Not Fraud (0)', 'Fraud (1)'],
)

# Emit the results in the same order and with the same text as before.
print(
    "\n--- Logistic Regression Results ---",
    "Confusion Matrix:",
    lr_confusion,
    "\nClassification Report:",
    lr_report,
    sep="\n",
)


# --- 5. Train an Advanced Model (Random Forest) ---
# The forest is configured with `class_weight='balanced'` so that the minority
# class (fraud) is weighted more heavily during training.
print("\n--- Training Random Forest Classifier ---")
rf_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'class_weight': 'balanced',
    'n_jobs': -1,  # Use all available CPU cores
}
rf_model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Predicted labels for the held-out split
y_pred_rf = rf_model.predict(X_test)

rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(
    y_test,
    y_pred_rf,
    target_names=['Not Fraud (0)', 'Fraud (1)'],
)

# Emit the results in the same order and with the same text as before.
print(
    "\n--- Random Forest Results ---",
    "Confusion Matrix:",
    rf_confusion,
    "\nClassification Report:",
    rf_report,
    sep="\n",
)


# --- 6. Summary and Interpretation ---
print("\n--- Model Comparison Summary ---")

# Each confusion matrix is computed once. Row 1 holds the actual fraud cases:
# [1, 0] = false negatives (missed fraud), [1, 1] = true positives (caught fraud).
lr_cm = confusion_matrix(y_test, y_pred_lr)
rf_cm = confusion_matrix(y_test, y_pred_rf)

# For fraud (class 1): recall = TP / (TP + FN)
lr_recall = lr_cm[1, 1] / (lr_cm[1, 1] + lr_cm[1, 0])
rf_recall = rf_cm[1, 1] / (rf_cm[1, 1] + rf_cm[1, 0])

print(f"Logistic Regression caught {lr_recall*100:.2f}% of the fraud cases in the test set.")
print(f"Random Forest caught {rf_recall*100:.2f}% of the fraud cases in the test set.")

print("\nInterpretation:")
# The narrative below describes the case where the Random Forest has the higher
# fraud recall, so it is only printed when the measured recalls support it.
if rf_recall > lr_recall:
    print("The Logistic Regression model has high precision but very poor recall for fraud cases. It correctly identified only a portion of the fraudulent transactions.")
    print("The Random Forest model, especially with `class_weight='balanced'`, performs much better. Its recall is significantly higher, meaning it successfully identified a much larger percentage of the actual fraud cases, even if it meant incorrectly flagging a few more legitimate transactions (lower precision).")
else:
    print("On this split the Random Forest did not achieve a higher fraud recall than the Logistic Regression baseline; compare the classification reports above before preferring either model.")
print("In fraud detection, high recall is often the primary goal.")

--- Data Head ---
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26   

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# --- This part is the same as before, to get a trained model ---
# --- In a real application, you would save and load the model, but for this example, we'll just retrain it. ---

# 1. Load data
try:
    df = pd.read_csv('/content/creditcard.csv')
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found. Please place it in the directory.")
    # Re-raise instead of calling exit(): in a notebook, exit() shuts the kernel
    # down, whereas raising only stops this cell and leaves the traceback visible.
    raise

# Drop rows with a missing 'Class' label (e.g. a truncated final line), matching
# the first cell, so that fitting below does not fail on unlabeled rows.
df = df.dropna(subset=['Class'])

# 2. Pre-process
# The scaler is fitted on the full dataset here because the final model below is
# also trained on the full dataset; the fitted scaler is reused later to scale
# the amount of a new transaction.
scaler = StandardScaler()
df['scaled_Amount'] = scaler.fit_transform(df['Amount'].values.reshape(-1, 1))
df = df.drop(['Time', 'Amount'], axis=1)

# 3. Prepare data and train the model
X = df.drop('Class', axis=1)
y = df['Class']

# For this example, we'll train on the full dataset to make the model as robust as possible
# In a real scenario, you'd train on your training set and save the model and the scaler
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
print("Training the final model on all available data...")
rf_model.fit(X, y)
print("Model training complete.\n")


# --- THIS IS THE NEW PART: CHECKING A SINGLE TRANSACTION ---

# 4. Create hypothetical new transactions to check
# We grab the first known non-fraudulent and the first known fraudulent row from
# the dataset as reference transactions.
# NOTE: both rows were part of the data the model was trained on, so their
# predictions are a sanity check, not evidence of how the model generalizes.
known_non_fraud_row = df[df['Class'] == 0].iloc[0].drop('Class')
known_fraud_row = df[df['Class'] == 1].iloc[0].drop('Class')

# Let's create a custom "suspicious" transaction.
# We'll use the non-fraud row as a base and change the amount to something very large.
# The model was trained on 'scaled_Amount', so the raw amount is converted with
# the *already fitted* scaler rather than a newly fitted one.
high_amount = 5000
scaled_high_amount = scaler.transform(np.array([[high_amount]]))[0][0]

suspicious_transaction_dict = known_non_fraud_row.to_dict()
suspicious_transaction_dict['scaled_Amount'] = scaled_high_amount
# Let's also change a few V-features to make it more anomalous
suspicious_transaction_dict['V4'] = 5.5
suspicious_transaction_dict['V11'] = 4.2


# 5. Format the new data for the model
# Each transaction becomes a one-row DataFrame whose columns are put in the same
# order as the training data (X.columns).
normal_transaction = pd.DataFrame([known_non_fraud_row.to_dict()])[X.columns]
suspicious_transaction = pd.DataFrame([suspicious_transaction_dict])[X.columns]
fraud_transaction = pd.DataFrame([known_fraud_row.to_dict()])[X.columns]


# 6. Make Predictions
def _check_transaction(header, transaction):
    """Print the model's verdict for a one-row transaction frame.

    Prints `header`, then the predicted class and the fraud probability.

    Parameters:
        header: Text printed before the results.
        transaction: One-row DataFrame with the columns of X.

    Returns:
        A (prediction, prediction_proba) tuple as returned by
        rf_model.predict and rf_model.predict_proba.
    """
    print(header)
    prediction = rf_model.predict(transaction)
    # predict_proba returns probabilities for [Class 0, Class 1]
    prediction_proba = rf_model.predict_proba(transaction)
    print(f"Prediction (0=Normal, 1=Fraud): {prediction[0]}")
    print(f"Fraud Probability: {prediction_proba[0][1] * 100:.2f}%")
    return prediction, prediction_proba


prediction_normal, prediction_proba_normal = _check_transaction(
    "--- Checking a known NORMAL transaction ---", normal_transaction
)

prediction_suspicious, prediction_proba_suspicious = _check_transaction(
    "\n--- Checking a SUSPICIOUS transaction ---", suspicious_transaction
)

# The known fraud row was previously built but never scored; checking it shows
# what the model outputs for a transaction labelled as fraud.
prediction_fraud, prediction_proba_fraud = _check_transaction(
    "\n--- Checking a known FRAUD transaction ---", fraud_transaction
)



Training the final model on all available data...
Model training complete.

--- Checking a known NORMAL transaction ---
Prediction (0=Normal, 1=Fraud): 0
Fraud Probability: 0.00%

--- Checking a SUSPICIOUS transaction ---
Prediction (0=Normal, 1=Fraud): 0
Fraud Probability: 0.00%
