# Transaction Fraud Processing


## Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Import Dataset

In [2]:
# Load the CSV into a DataFrame and remember its full original column order.
dataset = pd.read_csv('Synthetic_Financial_datasets_log.csv')
original_headers = list(dataset.columns)

## Save and remove string values from dataset before processing

In [3]:
# Set the two account-name columns aside (as an independent copy), then
# remove them from the working frame.
id_columns = ['nameOrig', 'nameDest']
identifiers = dataset.loc[:, id_columns].copy()
dataset = dataset.drop(columns=id_columns)

## Save Feature Headers (Excluding Label Columns)

In [4]:
# Feature names: every remaining column except the two fraud label columns.
# Taken from the column Index directly, so no copy of the frame is needed.
X_headers = dataset.columns.drop(['isFraud', 'isFlaggedFraud']).tolist()

## Identify Data for Processing

In [5]:
# Feature matrix: all columns except the fraud labels. Target: the isFraud column.
label_columns = ['isFraud', 'isFlaggedFraud']
X = dataset.drop(columns=label_columns).to_numpy()
y = dataset['isFraud'].to_numpy()

## Encode Payment Type

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode feature column 1 and pass every other column through
# unchanged. The encoded columns are placed first in the output, followed by
# the passthrough columns in their original order.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
    # Force a dense result. Without this, ColumnTransformer may return a SciPy
    # sparse matrix when the stacked output is sparse enough, and np.array()
    # on a sparse matrix yields a 0-d object array rather than a 2-D matrix.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## Get final column names after encoding

In [7]:
# Names of the one-hot columns produced by the fitted encoder, prefixed 'type'.
onehot_columns = ct.named_transformers_['encoder'].get_feature_names_out(['type'])
# Passthrough columns are all original features except position 1 (the encoded one).
rest_columns = X_headers[:1] + X_headers[2:]
# Encoded columns come first in the transformed matrix, then the passthrough ones.
final_headers = [*onehot_columns, *rest_columns]

In [8]:
# Sanity check: the matrix dimensions followed by the first encoded row.
print(X.shape, X[0], sep='\n')

(6362620, 11)
[0.0 0.0 0.0 1.0 0.0 1 9839.64 170136.0 160296.36 0.0 0.0]


## Split Training and Test set

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Implement Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 100-tree random forest with a fixed seed, fitted on the training split.
forest_params = {'n_estimators': 100, 'random_state': 42}
classifier = RandomForestClassifier(**forest_params)
classifier.fit(X_train, y_train)

## Predict outcomes

In [21]:
# Hard class predictions for the held-out test rows.
y_pred = classifier.predict(X=X_test)

## Output Probabilities

In [22]:
# predict_proba returns one column per class; column 1 is kept as the score
# that later cells report as the fraud probability.
class_probabilities = classifier.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

## Combine test outcome with headers and add predictions

In [None]:
# Rebuild the test features as a labelled DataFrame and attach the outcomes.
# X_test holds the one-hot-encoded matrix, so it must be labelled with the
# post-encoding names (final_headers). The pre-encoding X_headers list has
# fewer entries than X_test has columns and would raise a ValueError.
X_test_df = pd.DataFrame(X_test, columns=final_headers)
X_test_df['actual_isFraud'] = y_test
X_test_df['predicted_isFraud'] = y_pred
X_test_df['fraud_probability'] = y_proba

## Evaluate the model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

# Counts of actual vs. predicted classes.
matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", matrix)

# Ranking quality of the predicted probabilities (uses y_proba, not y_pred).
auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", auc)

## Compare with Actual

In [24]:
# Frame that is exported to CSV: test features plus actual and predicted outcomes.
# Label the feature columns with the post-encoding names so the exported file
# has meaningful headers instead of positional integers.
predicted_df = pd.DataFrame(X_test, columns=final_headers)
predicted_df['actual_isFraud'] = y_test
predicted_df['predicted_isFraud'] = y_pred
predicted_df['fraud_probability'] = y_proba

In [25]:
# Write the predictions to disk without the DataFrame's row index.
output_path = 'fraud_predictions.csv'
predicted_df.to_csv(output_path, index=False)

## Histogram

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the test rows whose predicted fraud probability is above zero.
has_probability = X_test_df['fraud_probability'] > 0
non_zero = X_test_df[has_probability]

# Histogram (50 bins) with a KDE overlay of the remaining probabilities.
plt.figure(figsize=(10, 5))
ax = sns.histplot(non_zero['fraud_probability'], bins=50, kde=True)
ax.set_title("Distribution of Fraud Probabilities (Excluding Zero)")
ax.set_xlabel("Fraud Probability")
ax.set_ylabel("Count")
plt.show()

## Linear Model of Fraud Probability

In [None]:
# Rank the non-zero rows by ascending probability; the fresh 0..n-1 index is
# copied into an 'index' column to serve as the x-axis rank.
non_zero_sorted = non_zero.sort_values(by='fraud_probability', ignore_index=True)
non_zero_sorted['index'] = non_zero_sorted.index

# Draw only the fitted regression line (no scatter points) of probability vs. rank.
plt.figure(figsize=(10, 5))
ax = sns.regplot(
    data=non_zero_sorted,
    x='index',
    y='fraud_probability',
    scatter=False,
    line_kws={"color": "red"},
)
ax.set_title("Trend of Fraud Probability (Sorted, Excluding Zero)")
ax.set_xlabel("Ranked Transaction")
ax.set_ylabel("Fraud Probability")
plt.show()

## Top Suspicious Transactions

In [None]:
# The account names were removed from the feature frame before modelling, so
# X_test_df (and therefore non_zero) has no 'nameOrig' / 'nameDest' columns.
# Recover them for the test rows by splitting `identifiers` with the SAME
# test_size and random_state as the feature split: train_test_split picks rows
# purely from the sample count and the seed, so this returns the identifiers
# of exactly the rows in X_test, in the same order.
# NOTE: keep these two arguments in sync with the feature split above.
_, test_identifiers = train_test_split(identifiers, test_size=0.2, random_state=42)
# Re-index positionally so the labels line up with X_test_df's default index.
test_identifiers = test_identifiers.reset_index(drop=True)

# The 20 highest-probability rows, with their account names joined on by row label.
top_frauds = (
    non_zero.sort_values(by='fraud_probability', ascending=False)
    .head(20)
    .join(test_identifiers)
)
print(top_frauds[['nameOrig', 'nameDest', 'fraud_probability', 'predicted_isFraud']])

## Precision Recall

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision and recall at every candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

fig, ax = plt.subplots(figsize=(10, 5))
# precision and recall each have one more entry than thresholds; drop the
# final point so both curves align with the threshold axis.
for scores, curve_label in ((precision, 'Precision'), (recall, 'Recall')):
    ax.plot(thresholds, scores[:-1], label=curve_label)
ax.set_xlabel('Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision-Recall vs Threshold')
ax.legend()
ax.grid(True)
plt.show()