In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [2]:
df = pd.read_csv('Fraud.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [4]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [5]:
df['isFraud'].describe()

count    6.362620e+06
mean     1.290820e-03
std      3.590480e-02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: isFraud, dtype: float64

In [6]:
# Assuming your DataFrame is named 'df'
null_counts = df.isnull().sum()

# Display the null value counts
print(null_counts)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [7]:
# Check for missing values
missing_values = df.isnull().sum()

# Print the columns with missing values
print(missing_values[missing_values > 0])

# Handling missing values in numeric columns by filling with median
numeric_columns = df.select_dtypes(include='number').columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Handling missing values in non-numeric columns (example: filling with mode)
non_numeric_columns = df.select_dtypes(exclude='number').columns
for column in non_numeric_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# Verify if missing values are handled
print(df.isnull().sum())

Series([], dtype: int64)
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [8]:
import numpy as np

# Function to detect outliers using the IQR method
def detect_outliers(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[col] < lower_bound) | (data[col] > upper_bound)]

# Detecting and handling outliers for all numeric columns
for col in numeric_columns:
    outliers = detect_outliers(df, col)
    print(f"Outliers detected in {col}:")
    print(outliers)
    
    # Handling outliers by capping
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

Outliers detected in step:
         step      type      amount     nameOrig  oldbalanceOrg  \
6259932   604  TRANSFER   714218.48   C454938739      714218.48   
6259933   604  CASH_OUT   714218.48  C1835708623      714218.48   
6259934   604  TRANSFER   277582.53  C2144116571      277582.53   
6259935   604  CASH_OUT   277582.53  C1512896686      277582.53   
6259936   604  TRANSFER  4672401.04   C801580496     4672401.04   
...       ...       ...         ...          ...            ...   
6362615   743  CASH_OUT   339682.13   C786484425      339682.13   
6362616   743  TRANSFER  6311409.28  C1529008245     6311409.28   
6362617   743  CASH_OUT  6311409.28  C1162922333     6311409.28   
6362618   743  TRANSFER   850002.52  C1685995037      850002.52   
6362619   743  CASH_OUT   850002.52  C1280323807      850002.52   

         newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
6259932             0.0   C482986731            0.00            0.00        1   
625993

In [9]:
# Assuming your DataFrame is named 'df'
count_nameOrig_starts_with_M = df[df['nameDest'].str.startswith('M')].shape[0]

# Display the count of samples where nameDest starts with 'M'
print(count_nameOrig_starts_with_M)

2151495


In [10]:
# Assuming your DataFrame is named 'df'
count_nameOrig_starts_with_M = df[df['nameOrig'].str.startswith('M')].shape[0]

# Display the count of samples where nameOrig starts with 'M'
print(count_nameOrig_starts_with_M)

0


In [11]:
# Assuming your DataFrame is named 'df'
def label_encode_nameDest(nameDest):
    if nameDest.startswith('M'):
        return 0
    elif nameDest.startswith('C'):
        return 1
    else:
        return None  # Handle other cases if necessary

df['nameDest_encoded'] = df['nameDest'].apply(label_encode_nameDest)
# Display the updated DataFrame
print(df[['nameDest', 'nameDest_encoded']])

            nameDest  nameDest_encoded
0        M1979787155                 0
1        M2044282225                 0
2         C553264065                 1
3          C38997010                 1
4        M1230701703                 0
...              ...               ...
6362615   C776919290                 1
6362616  C1881841831                 1
6362617  C1365125890                 1
6362618  C2080388513                 1
6362619   C873221189                 1

[6362620 rows x 2 columns]


In [12]:
from sklearn.preprocessing import LabelEncoder

# Assuming your DataFrame is named 'df'
le = LabelEncoder()

df['type_encoded'] = le.fit_transform(df['type'])

# Display the updated DataFrame
print(df[['type', 'type_encoded']])

             type  type_encoded
0         PAYMENT             3
1         PAYMENT             3
2        TRANSFER             4
3        CASH_OUT             1
4         PAYMENT             3
...           ...           ...
6362615  CASH_OUT             1
6362616  TRANSFER             4
6362617  CASH_OUT             1
6362618  TRANSFER             4
6362619  CASH_OUT             1

[6362620 rows x 2 columns]


In [13]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,nameDest_encoded,type_encoded
0,1.0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0,0,3
1,1.0,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0,0,3
2,1.0,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,0.0,0.0,1,4
3,1.0,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,0.0,0.0,1,1
4,1.0,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0,0,3


In [14]:
df.columns.tolist()

['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud',
 'nameDest_encoded',
 'type_encoded']

In [15]:
features = ['step','type_encoded', 'amount', 'nameDest_encoded', 'oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFlaggedFraud']
target = 'isFraud'

X = df[features]
y = df[target]

In [16]:
X.head()

Unnamed: 0,step,type_encoded,amount,nameDest_encoded,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,1.0,3,9839.64,0,170136.0,160296.36,0.0,0.0,0.0
1,1.0,3,1864.28,0,21249.0,19384.72,0.0,0.0,0.0
2,1.0,4,181.0,1,181.0,0.0,0.0,0.0,0.0
3,1.0,1,181.0,1,181.0,0.0,21182.0,0.0,0.0
4,1.0,3,11668.14,0,41554.0,29885.86,0.0,0.0,0.0


In [17]:
# Feature Selection
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [19]:
# Preprocessing Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [20]:
# Create a pipeline that first preprocesses the data and then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [21]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Fit the model
pipeline.fit(X_train, y_train)

In [23]:
# Model Evaluation
y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[1272524]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   1272524

    accuracy                           1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524



In [24]:
# Check the output shape of predict_proba
y_pred_prob = pipeline.predict_proba(X_test)
print("Shape of predict_proba output:", y_pred_prob.shape)

if y_pred_prob.shape[1] == 2:
    # ROC Curve
    y_pred_prob = y_pred_prob[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
else:
    print("The classifier did not return probability estimates for both classes.")



Shape of predict_proba output: (1272524, 1)
The classifier did not return probability estimates for both classes.


In [68]:
# Key Factors Identification
model = pipeline.named_steps['classifier']
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

In [69]:
# Extract feature names after preprocessing
feature_names = numeric_features + list(pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features))

In [70]:
for f in range(X.shape[1]):
    print(f"{feature_names[indices[f]]}: {importances[indices[f]]}")

nameDest_M999999784: 0.0
nameOrig_C1931755826: 0.0
nameOrig_C1931809440: 0.0
nameOrig_C1931809693: 0.0
nameOrig_C1931809805: 0.0
nameOrig_C1931810605: 0.0
nameOrig_C1931810742: 0.0
nameOrig_C1931811013: 0.0
nameOrig_C1931811224: 0.0
nameOrig_C19318114: 0.0
nameOrig_C1931811449: 0.0
nameOrig_C1931811531: 0.0


In [71]:
# Infrastructure Update Suggestions
def prevention_measures():
    measures = [
        "Implement multi-factor authentication.",
        "Regularly update and patch systems.",
        "Monitor transactions in real-time.",
        "Use encryption for sensitive data.",
        "Conduct regular security audits."
    ]
    return measures

print(prevention_measures())

['Implement multi-factor authentication.', 'Regularly update and patch systems.', 'Monitor transactions in real-time.', 'Use encryption for sensitive data.', 'Conduct regular security audits.']


In [72]:
# Implementation Evaluation
def evaluate_implementation():
    # Suggest metrics to evaluate the implemented actions
    metrics = [
        "Reduction in the number of fraudulent transactions.",
        "Improvement in detection rate of fraudulent transactions.",
        "Decrease in false positive rate.",
        "User feedback on system changes.",
        "Regular security audit results."
    ]
    return metrics

print(evaluate_implementation())

['Reduction in the number of fraudulent transactions.', 'Improvement in detection rate of fraudulent transactions.', 'Decrease in false positive rate.', 'User feedback on system changes.', 'Regular security audit results.']
