In [4]:
# Data Handling
import pandas as pd
import numpy as np

# Data Visualisation
import plotly.express as px
import seaborn as sns
from pivottablejs import pivot_ui

# Clustering & Machine Learning
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import IsolationForest

# Time Series Analysis
import statsmodels.api as sm
import plotly.figure_factory as ff
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


# Warnings
import warnings
warnings.filterwarnings("ignore")

print("All libraries installed and working!")

All libraries installed and working!


In [5]:
df = pd.read_csv("../data/F_EXPORT_Cleaned_TPA_data_Final.csv")


df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   transaction_ID                 157 non-null    object 
 1   date                           157 non-null    object 
 2   Amount_GBP                     157 non-null    float64
 3   payment_method                 157 non-null    object 
 4   merchant_category              157 non-null    object 
 5   location                       157 non-null    object 
 6   customer_segment               157 non-null    object 
 7   transaction_status             157 non-null    object 
 8   sales_channel                  157 non-null    object 
 9   customer_device_type           157 non-null    object 
 10  promotions_applied             157 non-null    object 
 11  promotion_or_discount_applied  157 non-null    object 
 12  time                           89 non-null     obj

Unnamed: 0,transaction_ID,date,Amount_GBP,payment_method,merchant_category,location,customer_segment,transaction_status,sales_channel,customer_device_type,promotions_applied,promotion_or_discount_applied,time,transaction_type,Unnamed: 14,Unnamed: 15
0,T000164,02/11/2023,-100.0,Debit Card,Entertainment,Glasgow,46-60,Failed,In-Store,Smartwatch,No,No,,Failed Refund,,
1,T000118,02/11/2023,0.0,Debit Card,Entertainment,Glasgow,46-60,Failed,In-Store,Smartwatch,Yes,Loyalty Points Redeemed,,Failed Transaction,,
2,T000138,02/11/2023,0.0,Debit Card,Entertainment,Glasgow,46-60,Failed,In-Store,Smartwatch,Yes,Loyalty Points Redeemed,,Failed Transaction,,
3,T000158,02/11/2023,0.0,Debit Card,Entertainment,Glasgow,46-60,Failed,In-Store,Smartwatch,Yes,Loyalty Points Redeemed,,Failed Transaction,,
4,T000178,02/11/2023,0.0,Debit Card,Entertainment,Glasgow,46-60,Failed,In-Store,Smartwatch,Yes,Loyalty Points Redeemed,,Failed Transaction,,


In [6]:
from pivottablejs import pivot_ui

pivot_ui(df)

In [7]:
df_time_series = df.groupby('date')['Amount_GBP'].sum().reset_index()

fig = px.line(df_time_series, x='date', y='Amount_GBP', title="Transaction Trends Over Time",
              labels={'Amount_GBP': 'Total Transaction Amount (£)', 'date': 'Date'},
              markers=True)

fig.update_xaxes(title_text="Date", rangeslider_visible=True)
fig.update_yaxes(title_text="Total Transaction Amount (£)")
fig.show()

In [8]:
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

df_filtered = df[df['date'] != pd.Timestamp("2023-11-02")]

df_time_series = df_filtered.groupby('date')['Amount_GBP'].sum().reset_index()

fig = px.line(df_time_series, x='date', y='Amount_GBP', title="Transaction Trends Over Time (Excluding 02/11/2023)",
              labels={'Amount_GBP': 'Total Transaction Amount (£)', 'date': 'Date'},
              markers=True)

fig.update_xaxes(title_text="Date", rangeslider_visible=True)
fig.update_yaxes(title_text="Total Transaction Amount (£)")

fig.show()

The above looks more consistent and logical. However, I will proceed with a larger dataset for analysis on payment_method, merchant_category, location, customer_segment, transaction_status, sales_channel, customer_device_type, and promotion_or_discount_applied.

In [9]:
avg_amount_by_payment = df.groupby('payment_method')['Amount_GBP'].mean().reset_index()

fig = px.bar(
    avg_amount_by_payment,
    x='payment_method',
    y='Amount_GBP',
    title='Average Transaction Amount by Payment Method',
    labels={'Amount_GBP': 'Average Amount (GBP)'}
)

fig.show()

In [10]:
fig = px.sunburst(
    df,
    path=['merchant_category', 'location'],
    values='Amount_GBP',
    title='Transaction Distribution by Merchant Category and Location',
    color='merchant_category'
)

fig.show()

In [11]:
fig = px.parallel_categories(
    df,
    dimensions=['customer_segment', 'payment_method', 'transaction_status'],
    color='Amount_GBP',
    title='Customer Segment, Payment Method, and Transaction Status Relationship'
)

fig.show()

In [12]:
fig = px.box(
    df,
    x='customer_segment',
    y='Amount_GBP',
    title='Transaction Amount Distribution by Customer Segment',
    labels={'Amount_GBP': 'Transaction Amount (GBP)'}
)

fig.show()

In [13]:

fig = px.violin(
    df,
    x='customer_segment',
    y='Amount_GBP',
    box=True, 
    points='outliers', 
    color='customer_segment',  
    title='Transaction Amount Distribution by Customer Segment',
    labels={'Amount_GBP': 'Transaction Amount (GBP)'},
    hover_data=['payment_method', 'merchant_category'] 

fig.show()

SyntaxError: '(' was never closed (2315334732.py, line 1)

The anomaly in the data is highlighted further outside the data column, specifically focusing on the 26-35 age group. I am still confident that professional success could logically account for this variation. I will now move past TA focuse.

In [14]:
success_rates = df.groupby('customer_device_type')['transaction_status'].value_counts(normalize=True).unstack() * 100

fig = px.bar(
    success_rates,
    x=success_rates.index,
    y=['Completed', 'Failed', 'Pending', 'Chargeback', 'Refunded'],
    title='Transaction Success Rate by Device Type',
    labels={'value': 'Percentage (%)', 'customer_device_type': 'Device Type'},
    barmode='group'
)

fig.show()

success_rates = df.groupby('customer_device_type')['transaction_status'].value_counts(normalize=True).unstack() * 100

print("🔹 Transaction Success Rate by Device Type:\n")
for device, row in success_rates.iterrows():
    print(f"📱 {device}:")
    for status, value in row.items():
        print(f"   - {status}: {value:.2f}%")
    print("\n")

🔹 Transaction Success Rate by Device Type:

📱 Desktop:
   - Chargeback: 5.56%
   - Completed: 47.22%
   - Failed: 16.67%
   - Pending: 30.56%
   - Refunded: nan%


📱 Mobile:
   - Chargeback: 2.38%
   - Completed: 84.52%
   - Failed: 2.38%
   - Pending: 3.57%
   - Refunded: 7.14%


📱 Smartwatch:
   - Chargeback: 32.14%
   - Completed: 21.43%
   - Failed: 39.29%
   - Pending: 7.14%
   - Refunded: nan%


📱 Tablet:
   - Chargeback: 22.22%
   - Completed: 11.11%
   - Failed: nan%
   - Pending: 55.56%
   - Refunded: 11.11%




In [15]:
payment_status_counts = df.groupby('payment_method')['transaction_status'].value_counts().unstack()

fig = px.bar(
    payment_status_counts,
    x=payment_status_counts.index,
    y=['Completed', 'Failed', 'Pending', 'Chargeback', 'Refunded'],
    title='Payment Method vs Transaction Status',
    labels={'value': 'Count', 'payment_method': 'Payment Method'},
    barmode='stack'
)

fig.show()

payment_status_counts = df.groupby('payment_method')['transaction_status'].value_counts().unstack()

print("🔹 Payment Method vs. Transaction Status:\n")
for method, row in payment_status_counts.iterrows():
    print(f"💳 {method}:")
    for status, value in row.items():
        print(f"   - {status}: {value} transactions")
    print("\n")

🔹 Payment Method vs. Transaction Status:

💳 Bank Transfer:
   - Chargeback: 2.0 transactions
   - Completed: 2.0 transactions
   - Failed: 5.0 transactions
   - Pending: 8.0 transactions
   - Refunded: 1.0 transactions


💳 Credit Card:
   - Chargeback: nan transactions
   - Completed: 29.0 transactions
   - Failed: nan transactions
   - Pending: nan transactions
   - Refunded: 1.0 transactions


💳 Credit card:
   - Chargeback: nan transactions
   - Completed: 5.0 transactions
   - Failed: nan transactions
   - Pending: nan transactions
   - Refunded: nan transactions


💳 Debit Card:
   - Chargeback: 3.0 transactions
   - Completed: 4.0 transactions
   - Failed: 8.0 transactions
   - Pending: nan transactions
   - Refunded: nan transactions


💳 Digital Wallet:
   - Chargeback: 8.0 transactions
   - Completed: 39.0 transactions
   - Failed: 2.0 transactions
   - Pending: 4.0 transactions
   - Refunded: 5.0 transactions


💳 Mobile Payment:
   - Chargeback: 2.0 transactions
   - Completed:

In [16]:
customer_segment_status = df.groupby('customer_segment')['transaction_status'].value_counts(normalize=True).unstack() * 100

fig = px.bar(
    customer_segment_status,
    x=customer_segment_status.index,
    y=['Completed', 'Failed', 'Pending', 'Chargeback', 'Refunded'],
    title='Transaction Success by Customer Segment',
    labels={'value': 'Percentage (%)', 'customer_segment': 'Customer Segment'},
    barmode='group'
)

fig.show()

customer_segment_status = df.groupby('customer_segment')['transaction_status'].value_counts(normalize=True).unstack() * 100

print("🔹 Transaction Success by Customer Segment:\n")
for segment, row in customer_segment_status.iterrows():
    print(f"👥 {segment}:")
    for status, value in row.items():
        print(f"   - {status}: {value:.2f}%")
    print("\n")

🔹 Transaction Success by Customer Segment:

👥 18-25:
   - Chargeback: 4.88%
   - Completed: 56.10%
   - Failed: 9.76%
   - Pending: 17.07%
   - Refunded: 12.20%


👥 26-35:
   - Chargeback: nan%
   - Completed: 97.14%
   - Failed: nan%
   - Pending: nan%
   - Refunded: 2.86%


👥 36-45:
   - Chargeback: 2.94%
   - Completed: 85.29%
   - Failed: 5.88%
   - Pending: 5.88%
   - Refunded: nan%


👥 46-60:
   - Chargeback: 34.48%
   - Completed: 24.14%
   - Failed: 27.59%
   - Pending: 13.79%
   - Refunded: nan%


👥 60+:
   - Chargeback: 11.11%
   - Completed: 11.11%
   - Failed: 27.78%
   - Pending: 44.44%
   - Refunded: 5.56%




Interestingm failed payment categories appear more frequently in the older age bracket.

In [17]:
sales_channel_counts = df['sales_channel'].value_counts()

sales_channel_counts = sales_channel_counts.loc[['Online', 'In-Store', 'Subscription']]

fig = px.pie(
    names=sales_channel_counts.index,
    values=sales_channel_counts.values,
    title="Online vs In-Store vs Subscription by Transaction Count",
    labels={'names': 'Sales Channel', 'values': 'Transaction Count'}
)

fig.show()

print("🔹 Sales Channel Distribution:\n")
for channel, count in sales_channel_counts.items():
    print(f"🛒 {channel}: {count} transactions")

sales_channel_counts.to_json('sales_channel_distribution.json', indent=4)

print("\n✅ Data saved to 'sales_channel_distribution.json'")

🔹 Sales Channel Distribution:

🛒 Online: 58 transactions
🛒 In-Store: 35 transactions
🛒 Subscription: 18 transactions

✅ Data saved to 'sales_channel_distribution.json'


In [18]:
device_segment_counts = df.groupby('customer_segment')['customer_device_type'].value_counts().unstack()

fig = px.bar(
    device_segment_counts,
    x=device_segment_counts.index,
    y=device_segment_counts.columns,
    title='Customer Segment vs. Preferred Device Type',
    labels={'value': 'Count', 'customer_segment': 'Customer Segment'},
    barmode='stack'
)

fig.show()
device_segment_counts = df.groupby('customer_segment')['customer_device_type'].value_counts().unstack()

print("🔹 Customer Segment vs. Preferred Device Type:\n")
for segment, row in device_segment_counts.iterrows():
    print(f"👤 {segment}:")
    for device, value in row.items():
        print(f"   - {device}: {value} users")
    print("\n")

🔹 Customer Segment vs. Preferred Device Type:

👤 18-25:
   - Desktop: 21.0 users
   - Mobile: 20.0 users
   - Smartwatch: nan users
   - Tablet: nan users


👤 26-35:
   - Desktop: nan users
   - Mobile: 35.0 users
   - Smartwatch: nan users
   - Tablet: nan users


👤 36-45:
   - Desktop: 10.0 users
   - Mobile: 24.0 users
   - Smartwatch: nan users
   - Tablet: nan users


👤 46-60:
   - Desktop: nan users
   - Mobile: 5.0 users
   - Smartwatch: 24.0 users
   - Tablet: nan users


👤 60+:
   - Desktop: 5.0 users
   - Mobile: nan users
   - Smartwatch: 4.0 users
   - Tablet: 9.0 users




**Final Thoughts**
1. Mobile devices remain the most reliable payment platform, with high completion rates and low failures.
2. Older customers and smartwatch users face transaction reliability issues, requiring usability improvements.
3. Digital wallets offer convenience but come with a high chargeback risk.
4. Online payments lead the market, but physical transactions still play a major role.
5. Payment providers should improve the reliability of smartwatch/tablet payments and offer better fraud protection for digital wallet transactions.

# K-Means (clustering)

In [5]:
import sys
print(sys.executable)

/Users/jackrobertson/TPA_CleanAndAnalysis/tpa-analysis/bin/python


In [8]:
from sklearn_extra.cluster import KMedoids
print("KMedoids is working!")

KMedoids is working!


In [12]:
import pandas as pd
import plotly.express as px
import json
from sklearn.preprocessing import StandardScaler
from sklearn_extra.cluster import KMedoids  

# Load dataset
df_cluster = pd.read_csv("../data/F_EXPORT_Cleaned_TPA_data_Final.csv")

# Define relevant features for clustering
features = ['Amount_GBP', 'customer_segment', 'payment_method', 
            'sales_channel', 'transaction_status', 
            'customer_device_type', 'promotions_applied']

df_cluster = df_cluster[features].copy()

# One-hot encode categorical variables (drop first to avoid collinearity)
df_cluster = pd.get_dummies(df_cluster, drop_first=True)

# Scale numerical data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_cluster)

# Optimal number of clusters (can be tuned)
optimal_k = 5

# Apply K-Medoids clustering
kmedoids = KMedoids(n_clusters=optimal_k, metric="manhattan", random_state=42)
df_cluster['Cluster'] = kmedoids.fit_predict(df_scaled)

# Retrieve original customer segment for visualization
df_original = pd.read_csv("../data/F_EXPORT_Cleaned_TPA_data_Final.csv")
df_cluster['Customer Segment'] = df_original['customer_segment']

# Count transactions per cluster
cluster_counts = df_cluster['Cluster'].value_counts().reset_index()
cluster_counts.columns = ['Cluster', 'Transaction Count']

# Merge transaction counts back into dataset
df_cluster = df_cluster.merge(cluster_counts, on="Cluster", how="left")

# Save cluster data to CSV for review
df_cluster.to_csv("clustered_data.csv", index=False)

# Print first few rows of clustered data for analysis
print("📊 Clustered Data Sample:")
print(df_cluster.head())

# Print cluster counts to understand distribution
print("\n🔍 Cluster Distribution:")
print(cluster_counts)

# Create scatter plot
fig = px.scatter(
    df_cluster, 
    x="Amount_GBP", 
    y="Customer Segment",
    size="Transaction Count",  
    color=df_cluster["Cluster"].astype(str),
    title="K-Medoids Clustering: Customer Segments & Spending Behaviour",
    labels={"Amount_GBP": "Transaction Amount (GBP)", "Customer Segment": "Customer Segment", "Cluster": "Cluster Group"},
    hover_data={"Cluster": True} 
)

# Prepare data for Recharts
recharts_data = [
    {
        "x": row["Amount_GBP"],
        "y": row["Customer Segment"],
        "size": row["Transaction Count"],  
        "cluster": int(row["Cluster"])
    }
    for _, row in df_cluster.iterrows()
]

# Save to JSON for Recharts
with open("recharts_data.json", "w") as f:
    json.dump(recharts_data, f, indent=4)

print(f"✅ Exported {len(recharts_data)} data points to 'recharts_data.json' for Recharts")
print("✅ Clustered data saved to 'clustered_data.csv' for review.")

# Show the plot
fig.show()

📊 Clustered Data Sample:
   Amount_GBP  customer_segment_26-35  customer_segment_36-45  \
0      -100.0                   False                   False   
1         0.0                   False                   False   
2         0.0                   False                   False   
3         0.0                   False                   False   
4         0.0                   False                   False   

   customer_segment_46-60  customer_segment_60+  payment_method_Credit Card  \
0                    True                 False                       False   
1                    True                 False                       False   
2                    True                 False                       False   
3                    True                 False                       False   
4                    True                 False                       False   

   payment_method_Credit card  payment_method_Debit Card  \
0                       False                    

In [13]:
import pandas as pd
import json

# Load clustered data
df = pd.read_csv("clustered_data.csv")  # Ensure this is the correct file

# Ensure the necessary columns exist
columns_needed = ["Amount_GBP", "Customer Segment", "Transaction Count", "Cluster"]
missing_cols = [col for col in columns_needed if col not in df.columns]

if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

# Convert data into Recharts format
recharts_data = [
    {
        "x": row["Amount_GBP"],  # Transaction Amount
        "y": str(row["Customer Segment"]),  # Customer Age Group
        "size": int(row["Transaction Count"]),  # Transaction Count for bubble size
        "cluster": int(row["Cluster"])  # Cluster ID
    }
    for _, row in df.iterrows()
]

# Save to JSON
output_file = "recharts_data.json"
with open(output_file, "w") as f:
    json.dump(recharts_data, f, indent=4)

print(f"✅ Exported {len(recharts_data)} data points to '{output_file}' for Recharts")

✅ Exported 157 data points to 'recharts_data.json' for Recharts


## Findings

1. Older in-store debit card users have high chargebacks—investigate dispute causes.
2. Digital-first credit card users are the most stable—consider VIP rewards.
3. Seniors using bank transfers face delays—improve processing efficiency.
4. Mobile wallet adoption is strong in physical stores—support NFC growth.
5. Young users engage through apps—drive mobile loyalty programs.

## Findings

1. Older in-store debit card users have high chargebacks—investigate dispute causes.
2. Digital-first credit card users are the most stable—consider VIP rewards.
3. Seniors using bank transfers face delays—improve processing efficiency.
4. Mobile wallet adoption is strong in physical stores—support NFC growth.
5. Young users engage through apps—drive mobile loyalty programs.

In [20]:

import pandas as pd

def compute_key_metrics(input_file='../data/F_EXPORT_Cleaned_TPA_data_Final.csv', output_file='key_metrics_summary.csv'):
    # 1) Read data
    df = pd.read_csv(input_file)
    
    # 2) Ensure 'date' is a datetime so we can handle time-based calculations
    df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')  
    # 'dayfirst=True' is optional, adjust if your CSV dates are in a different format
    
    # --- METRIC 1: Transaction Success Rate (Completed vs Failed vs Chargebacks) ---
    total_transactions = len(df)
    
    # Completed
    completed_count = len(df[df['transaction_status'] == 'Completed'])
    completed_rate = (completed_count / total_transactions) * 100
    
    # Failed
    failed_count = len(df[df['transaction_status'] == 'Failed'])
    failed_rate = (failed_count / total_transactions) * 100
    
    # Chargeback
    chargeback_count = len(df[df['transaction_status'] == 'Chargeback'])
    chargeback_rate = (chargeback_count / total_transactions) * 100
    
    # --- METRIC 2: Most Reliable Payment Method (Highest Success Rate) ---
    #    We'll calculate success rate per payment method = #Completed / #All for that method
    payment_group = df.groupby('payment_method')
    
    reliability_data = []
    for method, group_df in payment_group:
        method_total = len(group_df)
        method_completed = len(group_df[group_df['transaction_status'] == 'Completed'])
        method_success_rate = 0
        if method_total > 0:
            method_success_rate = (method_completed / method_total) * 100
        
        reliability_data.append((method, method_success_rate))
    
    # Sort by descending success rate
    reliability_data.sort(key=lambda x: x[1], reverse=True)
    
    # (From the prompt, we expect "Mobile Payment" to be the top, but we’ll verify with real data)
    most_reliable_payment_method, top_success_rate = reliability_data[0]
    
    # --- METRIC 3: Highest-Risk Device (by chargeback rate) ---
    #    Chargeback rate per device = #Chargebacks / #All for that device
    device_group = df.groupby('customer_device_type')
    
    device_chargeback_data = []
    for device, group_df in device_group:
        device_total = len(group_df)
        device_chargebacks = len(group_df[group_df['transaction_status'] == 'Chargeback'])
        device_chargeback_rate = 0
        if device_total > 0:
            device_chargeback_rate = (device_chargebacks / device_total) * 100
        device_chargeback_data.append((device, device_chargeback_rate))
    
    # Sort devices by descending chargeback rate
    device_chargeback_data.sort(key=lambda x: x[1], reverse=True)
    
    highest_risk_device, highest_device_chargeback_rate = device_chargeback_data[0]
    
    # --- METRIC 4: Most Active Customer Segment (the one with the most transactions) ---
    segment_counts = df['customer_segment'].value_counts()
    most_active_segment = segment_counts.index[0]  # Top segment name
    most_active_segment_count = segment_counts.iloc[0]  # How many transactions
    
    # --- METRIC 5: Growth Opportunity: In-store Digital Wallets 
    #     We'll approximate "growth" as comparing the earliest month vs latest month usage
    #     for (payment_method=='Digital Wallet' AND sales_channel=='In-Store')
    
    # Filter in-store digital wallets
    in_store_wallets = df[
        (df['payment_method'] == 'Digital Wallet') &
        (df['sales_channel'] == 'In-Store')
    ].copy()
    
    # Ensure we have valid dates
    in_store_wallets = in_store_wallets.dropna(subset=['date'])
    if not in_store_wallets.empty:
        # Create a Year-Month column to group by monthly usage
        in_store_wallets['year_month'] = in_store_wallets['date'].dt.to_period('M').astype(str)
        
        usage_by_month = in_store_wallets.groupby('year_month')['transaction_ID'].count().reset_index()
        usage_by_month.rename(columns={'transaction_ID': 'count'}, inplace=True)
        
        # Sort by chronological order
        usage_by_month.sort_values(by='year_month', inplace=True)
        
        # We'll define "growth" as ((last_count - first_count) / first_count) * 100
        if len(usage_by_month) >= 2:
            first_month_count = usage_by_month.iloc[0]['count']
            last_month_count = usage_by_month.iloc[-1]['count']
            
            if first_month_count > 0:
                growth_in_instore_wallets = ((last_month_count - first_month_count) / first_month_count) * 100
            else:
                # If the first month had 0 usage, handle division by zero
                growth_in_instore_wallets = float('inf')  # or 0, depending on preference
        else:
            # If we have only one month or none, we can’t measure growth across months
            growth_in_instore_wallets = 0
    else:
        # No in-store wallet usage data at all
        growth_in_instore_wallets = 0
    
    # --- Prepare final metrics dictionary for exporting ---
    metrics_dict = {
        "Transaction_Success_Rate_Completed_%": round(completed_rate, 2),
        "Transaction_Success_Rate_Failed_%": round(failed_rate, 2),
        "Transaction_Success_Rate_Chargeback_%": round(chargeback_rate, 2),
        "Most_Reliable_Payment_Method": most_reliable_payment_method,
        "Method_Success_Rate_%": round(top_success_rate, 2),
        "Highest_Risk_Device": highest_risk_device,
        "Device_Chargeback_Rate_%": round(highest_device_chargeback_rate, 2),
        "Most_Active_Customer_Segment": most_active_segment,
        "Segment_Transaction_Count": most_active_segment_count,
        "InStore_DigitalWallet_Growth_%": round(growth_in_instore_wallets, 2)
    }
    
    # Convert to a single-row DataFrame for export
    metrics_df = pd.DataFrame([metrics_dict])
    metrics_df.to_csv(output_file, index=False)
    
    print("Key metrics have been computed and saved to:", output_file)
    print(metrics_df.T)  # Print them in a vertical format

# Run the function
compute_key_metrics("../data/F_EXPORT_Cleaned_TPA_data_Final.csv", "key_metrics_summary.csv")

Key metrics have been computed and saved to: key_metrics_summary.csv
                                                 0
Transaction_Success_Rate_Completed_%         60.51
Transaction_Success_Rate_Failed_%             12.1
Transaction_Success_Rate_Chargeback_%         9.55
Most_Reliable_Payment_Method           Credit card
Method_Success_Rate_%                        100.0
Highest_Risk_Device                     Smartwatch
Device_Chargeback_Rate_%                     32.14
Most_Active_Customer_Segment                 18-25
Segment_Transaction_Count                       41
InStore_DigitalWallet_Growth_%               900.0


In [21]:
import pandas as pd
import json

# Example loading your data (adjust path as needed)
df_original = pd.read_csv("../data/F_EXPORT_Cleaned_TPA_data_Final.csv")

# Then build the final Recharts data
recharts_data = []
for _, row in df_merged.iterrows():
    recharts_data.append({
        "x": row["Amount_GBP"],          # transaction amount 
        "y": row["customer_segment"],    # or rename this to "age_group"
        "size": row["Transaction Count"], 
        "cluster": int(row["Cluster"]),  # use the cluster number in place of any "customer_score"
    })

with open("recharts_data.json", "w") as f:
    json.dump(recharts_data, f, indent=4)

print(f"✅ Exported {len(recharts_data)} data points.")

NameError: name 'df_merged' is not defined

# Game data

In [22]:
import pandas as pd

# Replace 'transactions.csv' with your file name or path
df = pd.read_csv('../data/F_EXPORT_Cleaned_TPA_data_Final.csv')

# Convert the DataFrame to a JSON string
json_str = df.to_json(orient='records')

# Optionally save to a JSON file
with open('transactions.json', 'w', encoding='utf-8') as f:
    f.write(json_str)