In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the transaction log CSV from the Kaggle input directory, then preview the frame.
DATA_PATH = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"
data = pd.read_csv(DATA_PATH)
data

In [None]:
# test = data['nameOrig'].duplicated()

In [None]:
# data.iloc[59]

In [None]:
# data.iloc[834]

In [None]:
# len(data[data['nameOrig'].duplicated(keep=False)].index)

In [None]:
# len(data[data['nameDest'].duplicated(keep=False)].index)

In [None]:
# data = data.drop(columns=['nameDest', 'nameOrig', 'isFlaggedFraud'])

In [None]:
from sklearn import preprocessing
# Encode each string column with its own LabelEncoder. Re-fitting one shared
# encoder per column only keeps the classes of the last column, so the integer
# codes of the earlier columns could no longer be mapped back to their labels.
label_encoders = {}
for column in ("type", "nameOrig", "nameDest"):
    column_encoder = preprocessing.LabelEncoder()
    data[column] = column_encoder.fit_transform(data[column])
    label_encoders[column] = column_encoder

# Kept for backward compatibility: the shared encoder used to end up fitted on
# "nameDest", so expose the equivalent encoder under the old name.
label_encoder = label_encoders["nameDest"]

In [None]:
# Pairwise correlations between every column of the encoded dataset.
correlation_matrix = data.corr()

# Render the annotated heatmap on an explicitly created 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Separate the label column from the feature columns.
target_column = 'isFraud'
y = data[target_column]
x = data.drop(columns=[target_column])

In [None]:
from imblearn.over_sampling import SMOTE
# Balance the classes by synthesising minority-class rows with SMOTE.
# A fixed random_state makes the synthetic rows (and therefore every metric
# computed later) repeatable across runs; 42 matches the seed used for the
# train/test split further down.
# NOTE(review): the resampling is applied to the full dataset before the
# train/test split, so the test split will contain synthetic rows — consider
# resampling only the training portion.
oversample = SMOTE(random_state=42)
x, y = oversample.fit_resample(x, y)

In [None]:
# Put the resampled features and the resampled label back side by side
# (axis=1 concatenates column-wise) so they can be inspected as one frame.
df_combined = pd.concat([x, y], axis=1)

In [None]:
# Display the combined (features + label) frame produced after resampling.
df_combined

In [None]:
# Pairwise correlations recomputed on the resampled features plus label.
correlation_matrix = df_combined.corr()

# Render the annotated heatmap on an explicitly created 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Remove the 'isFlaggedFraud' column before modelling.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone, and without it drop() would raise a KeyError.
df_combined = df_combined.drop(columns=['isFlaggedFraud'], errors='ignore')

In [None]:
# Re-derive the label and the feature matrix from the cleaned, combined frame.
target_column = 'isFraud'
y = df_combined[target_column]
x = df_combined.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the same
# in both splits and random_state=42 makes the split repeatable.
# NOTE(review): x and y were already oversampled with SMOTE earlier in the notebook,
# so the held-out split includes synthetic rows — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report ,f1_score , recall_score,precision_score

In [None]:
# mlflow is used here but was not imported anywhere in the notebook, which
# raises a NameError on a fresh kernel.
import mlflow

EXPERIMENT_NAME = "testing_flow"

# create_experiment() fails when the name already exists, so look the experiment
# up first; this keeps the cell re-runnable.
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(
        name=EXPERIMENT_NAME,
        artifact_location="testing_mlflow_artifact",
        tags={"env": "dev", "version": "1.0.0"},
    )
else:
    experiment_id = existing_experiment.experiment_id


In [None]:
# Candidate scikit-learn classifiers, keyed by the name shown in the results table.
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=2),
    # Fixed seed so repeated runs grow the same tree and report the same metrics.
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42)
}


def _score_predictions(y_true, y_pred):
    """Compute the metric set reported for every model.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.

    Returns:
        Dict with accuracy, the flattened confusion matrix (TN, FP, FN, TP),
        precision, recall and F1 score. The key order matters: the results
        table cell renames the columns positionally.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Confusion Matrix': confusion_matrix(y_true, y_pred).flatten(),  # TN, FP, FN, TP
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1 Score': f1_score(y_true, y_pred),
    }


# Step 4: Store results
results = {}

# Step 5: Train models and calculate metrics
for model_name, model in models.items():
    model.fit(X_train, y_train)
    results[model_name] = _score_predictions(y_test, model.predict(X_test))

# XGBoost's native API works on DMatrix containers rather than DataFrames.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameters and enable GPU usage
# NOTE(review): newer XGBoost releases deprecate 'gpu_hist' / 'predictor' in favour
# of tree_method='hist' with device='cuda' — confirm against the installed version.
params = {
    'max_depth': 20,               # Depth of each tree
    'objective': 'binary:logistic', # For binary classification
    'eval_metric': 'auc',          # Evaluation metric
    'tree_method': 'gpu_hist',     # Use GPU accelerated tree building
    'predictor': 'gpu_predictor'   # Use GPU for prediction
}

# Train model; the test DMatrix is passed in evals only to log AUC per boosting round.
model_XGB = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtest, 'test')])

# binary:logistic yields probabilities; threshold at 0.5 to obtain class labels.
y_pred_prob = model_XGB.predict(dtest)
y_pred_XGB = (y_pred_prob > 0.5).astype(int)

# Store results
results["XGB"] = _score_predictions(y_test, y_pred_XGB)

In [None]:
# Show the raw metrics dictionary (one entry per trained model).
results

In [None]:
# Step 6: Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).transpose()

# Step 7: Display results
results_df.index.name = 'Model'
results_df.columns = ['Accuracy','Confusion Matrix (TN, FP, FN, TP)', 'Precision', 'Recall', 'F1 Score']
results_df

In [None]:
!pip install shap

In [None]:
import shap
# Build a SHAP explainer for the trained XGBoost booster and explain every test row.
explainer = shap.Explainer(model_XGB)
shap_values = explainer(X_test)
# Shape of the resulting explanation — presumably (rows, features); verify on the output.
shap_values.shape

In [None]:
# Waterfall plot of the per-feature SHAP contributions for the test row at position 5.
shap.plots.waterfall(shap_values[5,:])

In [None]:
# Bar plot of the per-feature SHAP contributions for the test row at position 4.
shap.plots.bar(shap_values[4,:])

In [None]:
# Summary plot over all explained rows and all features; the objects are passed
# whole because the full-range slices selected everything anyway.
shap.summary_plot(shap_values=shap_values, features=X_test)

In [None]:
# Re-check the shape of the SHAP explanation before it is indexed in the cells below.
shap_values.shape

In [None]:
from transformers import pipeline
import numpy as np

# Function to generate a natural language explanation based on SHAP values
def generate_nlp_report(shap_values, features, top_n=6):
    """
    Generates a natural language report from the SHAP values of one prediction.

    Args:
        shap_values: SHAP values for a single prediction, one value per feature.
            Either an object exposing them through a ``.values`` attribute
            (e.g. one row of a SHAP explanation) or a plain 1D array-like.
        features: A pandas DataFrame whose columns are the feature names, in the
            same order as ``shap_values``.
        top_n: The number of top contributing features to include in the report.

    Returns:
        A sentence listing the ``top_n`` features with the largest absolute SHAP
        value, most influential first. When ``top_n`` is not positive, or there
        are no SHAP values, a sentence stating that no features were selected.

    Raises:
        ValueError: If the SHAP values are not one-dimensional.
    """
    # Accept both explanation objects (values under .values) and raw arrays.
    raw_values = getattr(shap_values, "values", shap_values)
    shap_abs_values = np.abs(np.asarray(raw_values, dtype=float))

    if shap_abs_values.ndim != 1:
        raise ValueError(
            "generate_nlp_report expects the SHAP values of a single prediction "
            f"(1D), got an array with shape {shap_abs_values.shape}."
        )

    # A slice of [-0:] would select *every* feature, so "nothing requested"
    # has to be handled explicitly.
    if top_n <= 0 or shap_abs_values.size == 0:
        return "This transaction was flagged as fraud, but no contributing features were selected."

    # argsort is ascending: keep the last top_n entries and flip them so the
    # most influential feature is reported first.
    top_features_indices = np.argsort(shap_abs_values)[-top_n:][::-1]

    contributions = [
        f"{features.columns[i]} with an influence of {shap_abs_values[i]:.4f}"
        for i in top_features_indices
    ]
    return "This transaction was flagged as fraud due to: " + ", ".join(contributions) + "."

# Example explanation for a single prediction
# shap_values[1, :] selects the SHAP values of every feature for the test row at
# position 1 (the second row of X_test, since indexing is zero-based).
# NOTE(review): the row is hard-coded and is not checked to be a fraud case, while
# the generated sentence always says "flagged as fraud" — confirm the intended row.
fraud_shap_values = shap_values[1, :] # SHAP values of all features for the row at position 1
# Alternative spelling of the same row selection:
# fraud_shap_values = shap_values[1]  # Get SHAP values for all features and classes for the first prediction

explanation = generate_nlp_report(fraud_shap_values, X_test)
print(explanation)

In [None]:
# Define feature descriptions: human-readable reason shown for each model feature
feature_descriptions = {
    "step": "Anomalous Timing of Transactions",
    "type" : "High-Risk Transaction Type",
    "amount": "Unusual Transaction Amount",
    "nameOrig": "Suspicious Originating Account",
    "oldbalanceOrg": "Dramatic Balance Decrease",
    "newbalanceOrig": "Dangerously Low Balance Post-Transaction",
    "nameDest": "Transactions to Unverified Accounts",
    "oldbalanceDest": "Low Initial Balance in Destination Account",
    "newbalanceDest": "Significant Increase in Destination Balance"
}

def extract_top_reasons(shap_values, feature_names, top_n=3):
    """
    Extracts the top N reasons contributing to a prediction based on SHAP values.

    Parameters:
    - shap_values: SHAP values for one instance, one value per feature. Either an
      object exposing them through a ``.values`` attribute or a 1D array-like.
    - feature_names: Sequence of feature names (list or pandas Index), in the same
      order as ``shap_values``.
    - top_n: Number of top features to extract.

    Returns:
    - List of reason descriptions, most influential feature first. Features
      without an entry in ``feature_descriptions`` are reported by their name.
      Empty when ``top_n`` is not positive.
    """
    # A slice of [-0:] would return every feature, so handle it explicitly.
    if top_n <= 0:
        return []

    # Accept both explanation objects (values under .values) and raw arrays.
    raw_values = getattr(shap_values, "values", shap_values)

    # Calculate absolute SHAP values
    shap_abs = np.abs(np.asarray(raw_values, dtype=float))

    # argsort is ascending: keep the last top_n indices and flip them so the
    # strongest reason comes first (same ordering as generate_nlp_report).
    top_indices = np.argsort(shap_abs)[-top_n:][::-1]

    # Map feature names to descriptions, falling back to the raw feature name
    reasons = []
    for i in top_indices:
        feature = feature_names[int(i)]
        reasons.append(feature_descriptions.get(feature, feature))

    return reasons


# 1. Go to https://aistudio.google.com/app/apikey
# 2. Create an API key.
# 3. Supply the key to the Gemini client (prefer an environment variable over pasting it into the notebook).

In [None]:
import google.generativeai as genai
import os
def generate_from_gimini(resones,name):
    """
    Ask Gemini to draft a bank email telling a customer their transaction is fraudulent.

    Parameters:
    - resones: Iterable of reason tags (strings) describing why the transaction
      was considered fraudulent.
    - name: Customer name inserted into the prompt.

    Returns:
    - The generated email text.

    Raises:
    - RuntimeError: If the GOOGLE_API_KEY environment variable is not set.
    """
    # Read the key from the environment instead of hardcoding it in the notebook,
    # so the credential is never saved or shared with the file.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY environment variable is not set; "
            "create a key at https://aistudio.google.com/app/apikey and export it."
        )
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    # The prompt is the space-separated tags followed by the instruction line.
    tags = " ".join(resones)
    prompt = (
        f"{tags}\n"
        f"    take those tags and make email that is from a bank that tell the customer {name} that his transaction is a fraud based on those tags"
    )

    # The text is only returned (not printed) so callers decide how to display it.
    response = model.generate_content(prompt)
    return response.text

In [None]:
# Find positional indices of fraudulent transactions in the test set.
# np.where works on the raw label array, so the indices are row positions in
# y_test / X_test, not DataFrame index labels.
fraud_indices = np.where(y_test.values == 1)[0]

if len(fraud_indices) == 0:
    print("No fraudulent transactions detected in the test set.")
else:
    # Select the first fraudulent transaction
    fraud_index = int(fraud_indices[0])

    # Get the SHAP values for this transaction. shap_values was computed on X_test,
    # so its first axis is the row position; the located fraud row is used here
    # instead of a fixed row.
    transaction_shap_values = shap_values[fraud_index, :]

    # Get the feature names
    feature_names = X_test.columns

    # Extract top reasons
    top_reasons = extract_top_reasons(transaction_shap_values, feature_names, top_n=3)

    # Generate the email
    customer_name = "John Doe"  # Replace with actual customer name as needed
    email_content = generate_from_gimini(top_reasons, customer_name)

    print("\n--- Generated Fraud Detection Email ---\n")
    print(email_content)
