In [1]:
import pandas as pd
import numpy as np

# Build a reproducible synthetic transaction table.
# The legacy global NumPy RNG is seeded once; every column below is then drawn
# from that single stream, so the order of the draws determines the values.
np.random.seed(42)
data_size = 1000

data = {}
# Continuous amount, uniform between 50 and 5000.
data['TransactionAmount'] = np.random.uniform(low=50, high=5000, size=data_size)
# Integer counts; randint's upper bound is exclusive.
data['FrequencyOfTransactions'] = np.random.randint(low=1, high=50, size=data_size)
# Binary flags drawn with the given class probabilities (p lists P(0), P(1)).
data['IsNewDevice'] = np.random.choice([0, 1], size=data_size, p=[0.8, 0.2])
data['IsNewLocation'] = np.random.choice([0, 1], size=data_size, p=[0.7, 0.3])
# Categorical payment channel, each option equally likely.
data['PaymentMethod'] = np.random.choice(['card', 'wallet', 'bank_transfer'], size=data_size)
# Hour of day, 0 through 23.
data['TransactionHour'] = np.random.randint(low=0, high=24, size=data_size)
data['UserAgeDays'] = np.random.randint(low=1, high=365, size=data_size)
data['PreviousFraudReports'] = np.random.choice([0, 1], size=data_size, p=[0.9, 0.1])

# Dict insertion order becomes the column order of the frame.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,TransactionAmount,FrequencyOfTransactions,IsNewDevice,IsNewLocation,PaymentMethod,TransactionHour,UserAgeDays,PreviousFraudReports
0,1903.973588,47,1,0,bank_transfer,22,66,0
1,4756.035817,12,0,0,card,12,234,0
2,3673.370012,16,0,0,bank_transfer,5,87,0
3,3013.359497,24,0,1,wallet,5,79,0
4,822.29227,19,0,0,bank_transfer,11,205,0


# Preprocess Data

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column bookkeeping for the preprocessing step.
features = df.columns.tolist()
categorical_features = ['PaymentMethod']

# Pick numeric inputs by kind ('number') rather than by exact width: the integer
# columns come from np.random.randint / np.random.choice, whose default integer
# dtype is platform dependent (32-bit on some platforms), so matching only
# 'int64' can silently drop them.
# 'RiskBucket' is a derived cluster label that the clustering cell writes back
# into df; it is excluded explicitly so re-running this cell never feeds earlier
# cluster assignments back in as a feature.
non_feature_columns = categorical_features + ['RiskBucket']
numeric_features = (
    df.drop(columns=non_feature_columns, errors='ignore')
      .select_dtypes(include='number')
      .columns.tolist()
)

# Standardise the numeric columns and one-hot encode the payment method.
# Columns not listed in either transformer are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# fit_transform returns a feature matrix (array / sparse matrix), not a DataFrame.
df_processed = preprocessor.fit_transform(df)

# Clustering to Identify Risk Buckets

In [None]:
from sklearn.cluster import KMeans

# Partition the preprocessed transactions into 4 clusters ("risk buckets").
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (and warns about it in 1.2/1.3), so leaving it implicit makes the
# bucket assignments depend on the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

# Store each row's cluster label on the original frame.
# NOTE: KMeans label numbers are arbitrary identifiers, not an ordering by risk.
df['RiskBucket'] = kmeans.fit_predict(df_processed)

# Check the distribution of users in buckets
print(df['RiskBucket'].value_counts())

# Prioritizing Alerts Based on Risk Buckets

In [None]:
# Example: prioritise alerts by the 'RiskBucket' each one falls into.
# This notebook has no separate alerts table, so the clustered transactions in
# `df` stand in for the alert queue; replace this with the real alerts DataFrame
# (it must carry a 'RiskBucket' column) when one is available.
alerts_df = df.copy()

# Buckets treated as high priority.
# NOTE(review): KMeans label numbers are arbitrary, so [1, 2] is a placeholder -
# confirm which buckets are actually risky by inspecting the cluster profiles.
high_risk_buckets = [1, 2]

# Vectorised membership test instead of a row-by-row apply.
alerts_df['Priority'] = np.where(alerts_df['RiskBucket'].isin(high_risk_buckets), 'High', 'Low')

# Now, manual reviewers can filter by 'Priority' to focus on high-priority alerts first

# Silhouette method for optimal clusters

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# df_processed is the preprocessed feature matrix returned by
# preprocessor.fit_transform(df) in the preprocessing cell.

# We will test a range of cluster numbers to find the optimal one
range_n_clusters = list(range(2, 11))
silhouette_scores = []

for n_clusters in range_n_clusters:
    # n_init is pinned because scikit-learn's default changed from 10 to 'auto'
    # in 1.4; leaving it implicit would make the scores version dependent.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = clusterer.fit_predict(df_processed)
    # Mean silhouette coefficient over all samples; higher means better-separated clusters.
    silhouette_avg = silhouette_score(df_processed, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

# Report the cluster count with the highest average silhouette score
# (on ties, max() keeps the smallest k because the range is ascending).
best_n_clusters, best_silhouette = max(
    zip(range_n_clusters, silhouette_scores), key=lambda pair: pair[1]
)
print("Best n_clusters by silhouette score:", best_n_clusters)

# Plotting the silhouette scores for different numbers of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range_n_clusters, silhouette_scores, marker='o')
# Mark the best-scoring k so the figure can be read on its own.
ax.axvline(best_n_clusters, linestyle='--', color='gray', label=f'best k = {best_n_clusters}')
ax.set_title('Silhouette Score Method For Optimal Clusters')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.legend()
plt.show()