<a href="https://colab.research.google.com/github/Mahaamimiii/Credit_Card_Fraud_Analysis/blob/main/Cred_Fraud_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the card-transaction CSV from the Colab working directory into the
# notebook-wide frame `df`, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/card_transdata.csv")
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for every numeric column.
df.describe()

In [None]:
# Class balance of the target.
# A cell only renders its last expression, so the absolute counts are shown
# explicitly with display(); previously they were computed and then discarded.
display(df['fraud'].value_counts())
# Percentage share of each class (the cell's rendered output).
df['fraud'].value_counts(normalize=True) * 100

In [None]:
# Column roles used by the rest of the notebook.
target = 'fraud'

# Binary 0/1 flags. 'online_order' is listed with the other binary columns in
# the imputation step further down, so it belongs here too; leaving it out
# made it show up in `numeric` as if it were a continuous measure.
category = ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']

# Every remaining column (neither a binary flag nor the target) is treated as
# a continuous numeric feature.
_non_numeric = set(category) | {target}
numeric = [col for col in df.columns if col not in _non_numeric]

In [None]:
# Bar chart of how many transactions fall into each target class.
ax = sns.countplot(data=df, x='fraud')
ax.set_title("fraud vs non-fraud")

5. Distribution of Key Variables

In [None]:
# Split distance_from_home into 10 equal-width intervals and store the interval
# label on each row; then show how many rows landed in each interval.
df['distance_bin'] = pd.cut(x=df['distance_from_home'], bins=10)
df['distance_bin'].value_counts()

In [None]:
# Mean of the 0/1 target per equal-width distance interval, i.e. the fraud rate.
df.groupby(by='distance_bin')['fraud'].mean()


In [None]:
# Hand-picked distance bands for distance_from_home.
# The top edge is open-ended (np.inf) instead of the observed maximum: using
# the data max made pd.cut raise "bins must increase monotonically" whenever
# the largest distance was <= 500, and np.inf matches how the other custom
# binnings in this notebook are defined. Row assignment is unchanged because
# the maximum still falls in the last, right-closed band.
bins = [0, 50, 100, 200, 500, np.inf]
labels = ['Very Near', 'Near', 'Medium', 'Far', 'Very Far']

# include_lowest=True keeps rows whose distance is exactly 0 in the first band.
df['distance_bin_custom'] = pd.cut(
    df['distance_from_home'],
    bins=bins,
    labels=labels,
    include_lowest=True
)


In [None]:
# Fraud rate per equal-width distance interval (same computation as the
# earlier groupby on 'distance_bin'; shown again next to the custom bands).
df.groupby('distance_bin')['fraud'].mean()


In [None]:
# Fraud rate (mean of the 0/1 target) within each hand-picked distance band.
df.groupby(by='distance_bin_custom')['fraud'].mean()


In [None]:
# Per distance band: number of transactions and fraud rate, as a flat table
# with the band as an ordinary column.
fraud_stats = (
    df.groupby('distance_bin_custom')['fraud']
      .agg(transactions='count', fraud_rate='mean')
      .reset_index()
)

fraud_stats


In [None]:
# Boolean flag: transaction took place more than 100 distance units from home.
df['is_far'] = df['distance_from_home'].gt(100)

# Fraud rate for near (False) versus far (True) transactions.
df.groupby(by='is_far')['fraud'].mean()


In [None]:
# Row count and fraud rate (mean of the 0/1 target) per custom distance band.
df.groupby('distance_bin_custom')['fraud'].agg(['count', 'mean'])


In [None]:
# Distribution of distance_from_home, split by the fraud flag.
# `bins` and `element` are histogram options, so this has to be sns.histplot.
# sns.countplot treats x as categorical and would try to draw one bar per
# distinct distance value, which does not finish on a continuous column.
plt.figure(figsize=(8,4))
sns.histplot(
    data=df,
    x='distance_from_home',
    hue='fraud',
    bins=30,
    element='step'
)
plt.title("Distance from Home Distribution by Fraud")
plt.xlabel("Distance from Home")
plt.show()


KeyboardInterrupt: 

In [None]:
# Number of rows per custom distance band, listed in band order rather than
# by descending count.
df['distance_bin_custom'].value_counts().sort_index()

7. Relationship With Target (fraud)

In [None]:
# Row count and fraud rate per custom distance band (same table as the
# earlier count/mean aggregation, repeated at the start of this section).
df.groupby('distance_bin_custom')['fraud'].agg(['count', 'mean'])


In [None]:
# Flat table of row count and fraud rate per custom distance band; this is the
# input for the bar chart of fraud rate by band.
distance_fraud_stats = (
    df.groupby('distance_bin_custom')
      .agg(
          count=('fraud', 'count'),
          fraud_rate=('fraud', 'mean'),
      )
      .reset_index()
)


In [None]:
# Bar chart: fraud rate for each custom distance band.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=distance_fraud_stats, x='distance_bin_custom', y='fraud_rate', ax=ax)
ax.set_title("Fraud Rate by Distance from Home (Binned)")
ax.set_xlabel("Distance from Home (Bins)")
ax.set_ylabel("Fraud Rate")
# Tilt the band labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
# Row count and fraud rate per custom distance band, displayed as a table.
_fraud_by_band = df.groupby('distance_bin_custom')['fraud']
fraud_bin_stats = _fraud_by_band.agg(count='count', fraud_rate='mean').reset_index()
fraud_bin_stats


8. Binning Columns - distance_from_last_transaction

In [None]:
# Summary statistics of the distance from the previous transaction; used to
# choose the bin edges below.
df['distance_from_last_transaction'].describe()

In [None]:
# Histogram (30 bins) with a KDE overlay of distance_from_last_transaction.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x='distance_from_last_transaction', bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Distance from Last Transaction")
ax.set_xlabel("Distance from Last Transaction")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Bands for distance_from_last_transaction; the last band is open-ended.
bins = [0, 1, 5, 10, 20, 50, 100, np.inf]
labels = ['0–1', '1–5', '5–10', '10–20', '20–50', '50–100', '100+']

# include_lowest=True keeps rows with a distance of exactly 0 in the first band.
df['distance_from_last_transaction_bin'] = pd.cut(
    x=df['distance_from_last_transaction'], bins=bins, labels=labels, include_lowest=True
)



10. Relationship With Target (fraud)

In [None]:
# Row count and fraud rate (mean of the 0/1 target) per
# distance-from-last-transaction band.
df.groupby('distance_from_last_transaction_bin')['fraud'].agg(['count', 'mean'])


Observations

Most transactions have a small distance gap from the previous transaction.
Fraud rates remain stable (~8%) for short gaps.
When the distance from the last transaction becomes very large, the fraud probability increases dramatically (approaching 50%).

Key Insight

Transactions that occur far away from the previous transaction location show a substantial increase in fraud risk, indicating abnormal transaction sequences.

11. Binning columns - ratio_to_median_purchase_price

In [None]:
# Summary statistics of the purchase-price ratio; used to choose the bin edges below.
df['ratio_to_median_purchase_price'].describe()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (50 bins) with a KDE overlay of the purchase-price ratio.
ax = sns.histplot(data=df, x='ratio_to_median_purchase_price', bins=50, kde=True)
ax.set_title("Distribution of Ratio to Median Purchase Price")
plt.show()


In [None]:
# Bin edges and matching labels for ratio_to_median_purchase_price
# (five bands; the last one is open-ended).
bins_ratio = [0, 0.5, 1, 2, 5, np.inf]
labels_ratio = ["<0.5", "0.5–1", "1–2", "2–5", "5+"]


In [None]:
# Assign every transaction to its purchase-price-ratio band; include_lowest=True
# keeps a ratio of exactly 0 in the first band.
df['ratio_to_median_purchase_price_bin'] = pd.cut(
    x=df['ratio_to_median_purchase_price'], bins=bins_ratio, labels=labels_ratio, include_lowest=True
)

In [None]:
# Number of rows per purchase-price-ratio band, listed in band order.
df['ratio_to_median_purchase_price_bin'].value_counts().sort_index()


Relationship With Target (fraud)

In [None]:
# Row count and fraud rate (mean of the 0/1 target) per purchase-price-ratio band.
df.groupby('ratio_to_median_purchase_price_bin')['fraud'].agg(['count', 'mean'])

Chip Usage (used_chip)

In [None]:
# How many transactions have each value of the used_chip flag.
df['used_chip'].value_counts()

In [None]:
# Row count and fraud rate (mean of the 0/1 target) for each used_chip value.
df.groupby('used_chip')['fraud'].agg(['count', 'mean'])


Transactions where chip authentication is used exhibit a significantly lower fraud rate compared to non-chip transactions, indicating chip-based payments as a strong fraud-reducing factor.

In [None]:
# A cell only renders its last expression, so the value counts are shown
# explicitly with display(); previously they were computed and then discarded.
display(df['used_pin_number'].value_counts())
# Row count and fraud rate for each used_pin_number value.
df.groupby('used_pin_number')['fraud'].agg(['count', 'mean'])

Using a PIN adds an extra authentication layer, so it should reduce fraud.

In [None]:
# A cell only renders its last expression, so the value counts are shown
# explicitly with display(); previously they were computed and then discarded.
display(df['online_order'].value_counts())
# Row count and fraud rate for each online_order value.
df.groupby('online_order')['fraud'].agg(['count', 'mean'])

Online transactions lack physical verification → higher fraud risk.

In [None]:
# A cell only renders its last expression, so the value counts are shown
# explicitly with display(); previously they were computed and then discarded.
display(df['repeat_retailer'].value_counts())
# Row count and fraud rate for each repeat_retailer value.
df.groupby('repeat_retailer')['fraud'].agg(['count', 'mean'])

Repeated retailer behaviour shows minimal difference in fraud rates.

In [None]:
# Pairwise correlation between the three continuous features and the target.
_corr_columns = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
    'fraud',
]
corr = df[_corr_columns].corr()

corr


Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the correlation table, values shown to two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix")
plt.show()


Transactions whose amount is much higher than the cardholder's usual (median) purchase price are much more likely to be fraudulent

Being far from home increases fraud risk a little

Sudden location jumps also increase risk a little

These three things are mostly independent of each other (they capture different behaviors)

In [None]:
# Fraud rate by online_order flag within each distance-from-home band.
_home_distance_band = pd.cut(
    df['distance_from_home'],
    bins=[0,10,50,100,500,df['distance_from_home'].max()]
)
df.groupby(['online_order', _home_distance_band])['fraud'].mean()


In [None]:
# Fraud rate by online_order flag within each distance-from-last-transaction band.
_last_distance_edges = [0, 1, 5, 10, 50, df['distance_from_last_transaction'].max()]
_last_distance_band = pd.cut(df['distance_from_last_transaction'], bins=_last_distance_edges)
df.groupby(['online_order', _last_distance_band])['fraud'].mean()


Fraud rate changes as distance_from_last_transaction increases

In [None]:
# Fraud rate by used_chip flag within each distance-from-home band.
_home_distance_edges = [0, 10, 50, 100, 500, df['distance_from_home'].max()]
_home_distance_band = pd.cut(df['distance_from_home'], bins=_home_distance_edges)
df.groupby(['used_chip', _home_distance_band])['fraud'].mean()


Fraud rate changes as distance_from_home increases

In [None]:
# Fraud rate by used_chip flag within each distance-from-last-transaction band.
_last_distance_edges = [0, 10, 50, 100, 500, df['distance_from_last_transaction'].max()]
_last_distance_band = pd.cut(df['distance_from_last_transaction'], bins=_last_distance_edges)
df.groupby(['used_chip', _last_distance_band])['fraud'].mean()

Bivariate analysis between distance from last transaction and chip usage shows that fraud risk increases sharply for large transaction gaps when the card chip is not used. However, when chip-based authentication is present, the increase in fraud probability is significantly mitigated, highlighting the importance of security mechanisms in reducing fraud during abnormal transaction patterns.

In [None]:
# Fraud rate by used_pin_number flag within each purchase-price-ratio band.
_price_ratio_edges = [0, 0.5, 1, 2, 5, df['ratio_to_median_purchase_price'].max()]
_price_ratio_band = pd.cut(df['ratio_to_median_purchase_price'], bins=_price_ratio_edges)
df.groupby(['used_pin_number', _price_ratio_band])['fraud'].mean()

The fraud rate is higher when a PIN is not used and lower when a PIN is used.

Define features and Target variable

In [None]:
# Model inputs: the three continuous measures followed by the four binary flags.
feature_cols = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
    'used_chip',
    'used_pin_number',
    'online_order',
    'repeat_retailer',
]

# .copy() detaches X from df. Without it X is a slice of df, and any later
# write to X (e.g. imputing missing values) triggers chained-assignment
# warnings and may not take effect.
X = df[feature_cols].copy()

# Target vector, also copied so that later cleaning of y never mutates df.
y = df['fraud'].copy()

In [None]:
# Number of missing values in each feature column.
X.isna().sum()


In [None]:
binary_cols = ['used_chip', 'used_pin_number', 'online_order', 'repeat_retailer']

# Impute each binary flag with its most frequent value.
# The fill is done in one non-inplace call that returns a new frame:
# `X[col].fillna(..., inplace=True)` operates on a temporary Series, which
# raises chained-assignment warnings and leaves X unchanged under pandas
# Copy-on-Write.
X = X.fillna({col: X[col].mode()[0] for col in binary_cols})

In [None]:
# Number of missing target labels before imputation.
y.isna().sum()


In [None]:
# Fill missing target labels with the most frequent class.
# Rebinding y (rather than inplace=True) avoids mutating a Series that may
# still be backed by df['fraud'] and keeps the cell safe to re-run.
# NOTE(review): imputing the target injects made-up labels; dropping rows with
# a missing label from both X and y may be the better choice. Confirm.
y = y.fillna(y.mode()[0])

In [None]:
# Re-check the target after imputation; expected to be 0.
y.isna().sum()


In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed; stratify=y keeps the fraud share the same in
# both partitions.
_split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = _split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression on the training partition; max_iter is raised
# to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions for the test partition.
y_pred = model.predict(X_test)
# Column 1 of predict_proba is taken as the score for the positive class.
_test_proba = model.predict_proba(X_test)
y_prob = _test_proba[:, 1]

For test dataset

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Test-set evaluation of the baseline model: compute each metric first, then
# print them in the same order as before.
_test_accuracy = accuracy_score(y_test, y_pred)
_test_auc = roc_auc_score(y_test, y_prob)
_test_confusion = confusion_matrix(y_test, y_pred)
_test_report = classification_report(y_test, y_pred)

print("Accuracy:", _test_accuracy)
print("ROC-AUC:", _test_auc)
print(_test_confusion)
print(_test_report)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Predictions on the training partition, for comparison with the test scores.
y_train_pred = model.predict(X_train)
_train_proba = model.predict_proba(X_train)
y_train_prob = _train_proba[:, 1]

For the train dataset

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Training-set evaluation of the baseline model: compute each metric first,
# then print them in the same order as before.
_train_accuracy = accuracy_score(y_train, y_train_pred)
_train_auc = roc_auc_score(y_train, y_train_prob)
_train_confusion = confusion_matrix(y_train, y_train_pred)
_train_report = classification_report(y_train, y_train_pred)

print("Train Accuracy:", _train_accuracy)
print("Train ROC-AUC:", _train_auc)
print("Train Confusion Matrix:\n", _train_confusion)
print("Train Classification Report:\n", _train_report)

Finding out the Important feature columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature so the logistic-regression coefficients fitted on
# X_scaled can be compared with each other.
# NOTE(review): the scaler is fitted on all of X, not only the training split.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
# Logistic regression on the standardised features; used only to read off
# coefficient magnitudes.
lr = LogisticRegression(max_iter=1000)
lr.fit(X=X_scaled, y=y)

In [None]:
# One row per feature with its signed coefficient and the coefficient's
# magnitude, displayed from largest to smallest magnitude.
_coefficients = lr.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': _coefficients,
    'Absolute_Coefficient': np.abs(_coefficients),
})

feature_importance.sort_values('Absolute_Coefficient', ascending=False)


GridSearch for Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the search: four values of C crossed with the L1
# and L2 penalties, all with the liblinear solver.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

Initialize Logistic Regression

In [None]:
# Unfitted base estimator handed to the grid search; max_iter is raised to
# 1000 iterations.
log_reg = LogisticRegression(max_iter=1000)


In [None]:
# 5-fold cross-validated search over param_grid, scored by ROC-AUC and run on
# all available cores (n_jobs=-1).
grid = GridSearchCV(log_reg, param_grid, scoring='roc_auc', cv=5, n_jobs=-1)


In [None]:
# Run the grid search on the training partition only, so the test partition
# stays unseen until the final evaluation.
grid.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated ROC-AUC.
_best_params, _best_cv_auc = grid.best_params_, grid.best_score_
print("Best Parameters:", _best_params)
print("Best ROC-AUC:", _best_cv_auc)


In [None]:
# Best estimator found by the grid search.
best_model = grid.best_estimator_

# Hard predictions and positive-class scores (column 1 of predict_proba) on
# the test partition.
y_pred_tuned = best_model.predict(X_test)
_tuned_proba = best_model.predict_proba(X_test)
y_prob_tuned = _tuned_proba[:, 1]

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Test-set evaluation of the tuned model: compute each metric first, then
# print them in the same order as before.
_tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
_tuned_auc = roc_auc_score(y_test, y_prob_tuned)
_tuned_confusion = confusion_matrix(y_test, y_pred_tuned)
_tuned_report = classification_report(y_test, y_pred_tuned)

print("Accuracy:", _tuned_accuracy)
print("ROC-AUC:", _tuned_auc)
print(_tuned_confusion)
print(_tuned_report)
