# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import json
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Send INFO-and-above records to fraud_detection.log, one "time:LEVEL:message" line each
logging.basicConfig(
    format="%(asctime)s:%(levelname)s:%(message)s",
    level=logging.INFO,
    filename="fraud_detection.log",
)

# Load the financial dataset
try:
    # The CSV is expected in the current working directory
    df = pd.read_csv('financial_data.csv')
except Exception as e:
    # Top-level boundary: logging.exception records the message together with the
    # traceback in the log file; re-raise so the script stops instead of running
    # on without data.
    logging.exception("Error loading financial dataset: %s", e)
    raise
else:
    # Only reached when the read succeeded
    logging.info("Financial dataset loaded successfully.")

# Minimum acceptable data-quality levels (fractions between 0 and 1)
SLAs = dict(
    accuracy=0.99,      # 99% of records should be accurate
    completeness=0.98,  # 98% of records should be complete
    timeliness=0.95,    # 95% of data should be timely
)

# Data Quality Checks
def data_quality_checks(data):
    """Compute accuracy, completeness and timeliness ratios for a dataset.

    Args:
        data: pandas DataFrame to assess. It is read only; no column is
            modified or added.

    Returns:
        dict with float values under the keys 'accuracy', 'completeness'
        and 'timeliness':
          * 'completeness' - share of cells that are not missing.
          * 'timeliness'   - share of rows whose 'transaction_date' value
            parses as a date.
          * 'accuracy'     - a random placeholder drawn from [0.95, 1.0),
            not a real measurement.
        A metric that cannot be computed (empty dataset, or no
        'transaction_date' column for timeliness) is NaN.
    """
    total_records = len(data)
    total_cells = total_records * data.shape[1]

    # An empty frame has no cells, so the ratio would be 0/0; report NaN instead.
    if total_cells:
        missing_values = data.isnull().sum().sum()
        completeness = float(1 - missing_values / total_cells)
    else:
        completeness = np.nan

    # For accuracy, assuming we have a method to verify accuracy; here we simulate it
    # In practice, this would involve comparing with a trusted source
    accuracy = float(np.random.uniform(0.95, 1.0))  # Placeholder for demonstration

    # Timeliness check: needs a 'transaction_date' column and at least one row
    if 'transaction_date' in data.columns and total_records:
        # Parse into a temporary Series so the caller's column is left untouched;
        # values that cannot be parsed become NaT and count as not timely.
        parsed_dates = pd.to_datetime(data['transaction_date'], errors='coerce')
        timeliness = float(parsed_dates.notnull().sum() / total_records)
    else:
        timeliness = np.nan  # Cannot compute timeliness without dated records

    return {
        'accuracy': accuracy,
        'completeness': completeness,
        'timeliness': timeliness,
    }

# Run the quality checks on the loaded data and record the resulting metrics
quality_metrics = data_quality_checks(df)
logging.info("Data Quality Metrics: %s", quality_metrics)

# Compare with SLAs
def compare_with_sla(metrics, sla):
    """Log, for every SLA threshold, whether the measured metric meets it.

    Args:
        metrics: mapping of metric name to measured value. A value may be
            None or NaN when the metric could not be computed.
        sla: mapping of metric name to the minimum acceptable value.

    Returns:
        None. One record per SLA key is written through the logging module:
        INFO when the threshold is met, WARNING when it is not met or when
        the metric is absent, None or NaN.
    """
    for key, threshold in sla.items():
        value = metrics.get(key)

        # NaN compares False against every number, so without this check an
        # uncomputable metric would fall through and be reported as "SLA met".
        if value is None or pd.isna(value):
            logging.warning(f"{key.capitalize()} metric not available for SLA comparison.")
            continue

        if value < threshold:
            logging.warning(f"{key.capitalize()} SLA not met: {value:.2f} < {threshold}")
        else:
            logging.info(f"{key.capitalize()} SLA met: {value:.2f} >= {threshold}")

# Check the measured quality metrics against the SLA thresholds
compare_with_sla(metrics=quality_metrics, sla=SLAs)

# Data Preprocessing
# Keep only the rows that have a value in every column
df_clean = df.dropna(axis=0, how='any')
logging.info(
    "Data cleaned. Rows before: %d, after cleaning: %d",
    len(df),
    len(df_clean),
)

# Feature selection: 'amount' is always used; 'transaction_type' contributes
# one-hot indicator columns when the dataset has it
features = ['amount']  # Add more relevant features as needed
if 'transaction_type' in df_clean.columns:
    # One-hot encode categorical variable (drop_first omits the first category)
    df_encoded = pd.get_dummies(df_clean, columns=['transaction_type'], drop_first=True)
    # Take exactly the columns get_dummies created. Matching on the
    # 'transaction_type_' prefix alone would also select unrelated columns
    # that already existed in the data under a similar name.
    features.extend([col for col in df_encoded.columns if col not in df_clean.columns])
else:
    # Work on an independent copy: an 'anomaly' column is written into
    # df_encoded later, and assigning into the dropna() result directly may
    # trigger pandas' SettingWithCopyWarning.
    df_encoded = df_clean.copy()

# Feature matrix used for scaling and model training
X = df_encoded[features]

# Feature scaling: fit the scaler on X, then standardize X with it
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Persist the fitted scaler so it can be reloaded later
joblib.dump(scaler, 'scaler.pkl')

# Train Isolation Forest for anomaly detection
# (100 trees, 1% of samples treated as anomalous, fixed seed)
model = IsolationForest(
    random_state=42,
    contamination=0.01,
    n_estimators=100,
).fit(X_scaled)

# Persist the trained model so it can be reloaded later
joblib.dump(model, 'fraud_detection_model.pkl')

# Predict anomalies
# The model labels normal points 1 and anomalies -1; store them in the
# 'anomaly' column as 0 (normal) and 1 (anomaly)
df_encoded['anomaly'] = pd.Series(
    model.predict(X_scaled), index=df_encoded.index
).map({1: 0, -1: 1})

# Evaluate model performance
# Scoring needs ground-truth labels in an 'is_fraud' column
# NOTE(review): assumes 'is_fraud' uses 1 for fraud and 0 otherwise, matching 'anomaly' - confirm
if 'is_fraud' not in df_encoded.columns:
    logging.warning("Ground truth labels 'is_fraud' not available. Skipping model evaluation.")
else:
    y_true, y_pred = df_encoded['is_fraud'], df_encoded['anomaly']
    # Same four scores, computed in the same order
    precision, recall, f1, accuracy = (
        metric(y_true, y_pred)
        for metric in (precision_score, recall_score, f1_score, accuracy_score)
    )

    logging.info(
        "Model Evaluation Metrics - Precision: %.2f, Recall: %.2f, F1 Score: %.2f, Accuracy: %.2f",
        precision,
        recall,
        f1,
        accuracy,
    )

# Visualize anomalies
# Scatter of transaction amount against the 0/1 anomaly flag, colored by the flag
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_encoded,
    x='amount',
    y='anomaly',
    hue='anomaly',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Anomaly Detection in Financial Transactions')
ax.set_xlabel('Transaction Amount')
ax.set_ylabel('Anomaly')
ax.legend(title='Anomaly')
fig.tight_layout()
# Write the image to disk before showing the figure
fig.savefig('anomaly_detection_plot.png')
plt.show()