In [1]:
columns=['Count', 'sender', 'receiver', 'amount', 'currency','Mt','year','direction','category','send region','receiver region','mx/mt']

In [2]:

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

np.random.seed(42)

num_records = 10000

banks = [
    "Citibank", "HSBC", "JPMorgan Chase", "Bank of America", "Standard Chartered",
    "Deutsche Bank", "BNP Paribas", "Barclays", "Credit Suisse", "UBS",
    "Emirates NBD", "Abu Dhabi Commercial Bank", "First Abu Dhabi Bank", "Dubai Islamic Bank",
    "Mashreq Bank", "Bank of China", "Industrial and Commercial Bank of China", "Wells Fargo",
    "Goldman Sachs", "Morgan Stanley", "Royal Bank of Canada", "Bank of Tokyo-Mitsubishi UFJ"
]

uae_banks = [
    "Emirates NBD", "Abu Dhabi Commercial Bank", "First Abu Dhabi Bank", 
    "Dubai Islamic Bank", "Mashreq Bank", "Commercial Bank of Dubai", 
    "Abu Dhabi Islamic Bank", "RAK Bank", "National Bank of Fujairah",
    "Sharjah Islamic Bank", "United Arab Bank", "National Bank of Umm Al Qaiwain"
]

currencies = ["USD", "EUR", "GBP", "AED", "JPY", "CHF", "CNY", "CAD", "AUD", "SGD"]
mt_types = ["MT103", "MT202", "MT700", "MT760", "MT799", "MT910", "MT940", "MT950"]
directions = ["Outgoing", "Incoming"]
categories = ["Trade Finance", "Corporate Payment", "Retail Payment", "Interbank Settlement", 
              "Treasury Operations", "Investment", "Loan Disbursement", "FX Settlement"]

regions = [
    "North America", "Europe", "Asia Pacific", "Middle East", "Africa", 
    "Latin America", "Caribbean", "Central Asia", "South Asia", "Southeast Asia"
]

def random_date(start_year=2015, end_year=2023):
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_number_of_days)


data = []
for i in range(num_records):
    # Determine if receiver is UAE-based (75% probability)
    is_uae_receiver = np.random.choice([True, False], p=[0.75, 0.25])
    
    
    sender = np.random.choice(banks)
    if is_uae_receiver:
        receiver = np.random.choice(uae_banks)
        receiver_region = "Middle East"  # UAE is in Middle East
    else:
        
        possible_receivers = [bank for bank in banks if bank != sender]
        receiver = np.random.choice(possible_receivers)
        receiver_region = np.random.choice([r for r in regions if r != "Middle East"])
    
    count = i + 1
    amount = round(np.random.lognormal(mean=10, sigma=1.5), 2)  
    currency = np.random.choice(currencies, p=[0.4, 0.2, 0.1, 0.15, 0.05, 0.02, 0.03, 0.02, 0.02, 0.01])  # USD most common
    mt = np.random.choice(mt_types)
    year = random_date().year
    direction = np.random.choice(directions)
    category = np.random.choice(categories)
    send_region = np.random.choice(regions)
    mx_mt = "MT"  # All are MT in this simulation
    
    # Add to data
    data.append([
        count, sender, receiver, amount, currency, mt, year, direction, 
        category, send_region, receiver_region, mx_mt
    ])

columns = ['Count', 'sender', 'receiver', 'amount', 'currency', 'Mt', 'year', 
           'direction', 'category', 'send region', 'receiver region', 'mx/mt']
df = pd.DataFrame(data, columns=columns)

for idx, row in df.iterrows():
    if row['currency'] == 'JPY':
        df.at[idx, 'amount'] = round(row['amount'] * 100)  
    elif row['currency'] == 'AED':
        df.at[idx, 'amount'] = round(row['amount'] * 3.67, 2)  
    elif row['currency'] == 'EUR':
        df.at[idx, 'amount'] = round(row['amount'] * 0.85, 2)
    elif row['currency'] == 'GBP':
        df.at[idx, 'amount'] = round(row['amount'] * 0.75, 2)





df


Unnamed: 0,Count,sender,receiver,amount,currency,Mt,year,direction,category,send region,receiver region,mx/mt
0,1,Mashreq Bank,United Arab Bank,38111.53,EUR,MT940,2018,Outgoing,Retail Payment,Central Asia,Middle East,MT
1,2,Barclays,RAK Bank,128668.30,GBP,MT799,2019,Incoming,FX Settlement,Latin America,Middle East,MT
2,3,Royal Bank of Canada,Emirates NBD,26914.61,EUR,MT103,2021,Outgoing,Retail Payment,Southeast Asia,Middle East,MT
3,4,Bank of China,First Abu Dhabi Bank,1248.98,USD,MT760,2020,Incoming,Trade Finance,Asia Pacific,Middle East,MT
4,5,BNP Paribas,Bank of Tokyo-Mitsubishi UFJ,6710.04,GBP,MT940,2019,Incoming,Loan Disbursement,Central Asia,Southeast Asia,MT
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,Bank of America,Citibank,99487.94,USD,MT700,2017,Outgoing,Loan Disbursement,North America,Europe,MT
9996,9997,Emirates NBD,United Arab Bank,701.25,AUD,MT760,2017,Outgoing,Corporate Payment,South Asia,Middle East,MT
9997,9998,Abu Dhabi Commercial Bank,First Abu Dhabi Bank,56986.04,AED,MT760,2015,Outgoing,FX Settlement,Caribbean,Middle East,MT
9998,9999,Abu Dhabi Commercial Bank,Sharjah Islamic Bank,259446.92,GBP,MT700,2020,Outgoing,Trade Finance,Latin America,Middle East,MT


In [14]:

import pandas as pd
import plotly.graph_objects as go
import numpy as np
import colorsys



def create_hierarchical_sankey(df):
    
    categories = ['currency','category','year','Mt','send region', 'sender', 'receiver', 'receiver region', 'direction']
    
    
    mappings = {}
    offset = 0
    
    for category in categories:
        unique_values = df[category].unique()
        mappings[category] = {value: i + offset for i, value in enumerate(unique_values)}
        offset += len(unique_values)
    
    all_nodes = []
    node_colors = []
    
    def generate_colors(n):
        colors = []
        for i in range(n):
            hue = i / n
            # Use a medium lightness and saturation for neutral but distinct colors
            lightness = 0.6
            saturation = 0.5
            r, g, b = [int(255 * x) for x in colorsys.hls_to_rgb(hue, lightness, saturation)]
            colors.append(f'rgba({r},{g},{b},0.8)')
        return colors
    
    # Generate node labels and collect counts for each node
    node_counts = {}
    
    for category in categories:
        unique_values = df[category].unique()
        category_colors = generate_colors(len(unique_values))
        
        for i, value in enumerate(unique_values):
            node_label = f"{category}: {value}"
            all_nodes.append(node_label)
            node_colors.append(category_colors[i])
            
            # Count occurrences for each node
            count = df[df[category] == value].shape[0]
            node_counts[node_label] = count
    
    node_labels = [f"{label} ({node_counts[label]})" for label in all_nodes]
    
    sources = []
    targets = []
    values = []
    link_colors = []
    
    for i in range(len(categories) - 1):
        source_category = categories[i]
        target_category = categories[i+1]
        
        grouped = df.groupby([source_category, target_category]).size().reset_index(name='count')
        
        for _, row in grouped.iterrows():
            source_value = row[source_category]
            target_value = row[target_category]
            count = row['count']
            
            source_index = list(all_nodes).index(f"{source_category}: {source_value}")
            target_index = list(all_nodes).index(f"{target_category}: {target_value}")
            
            sources.append(source_index)
            targets.append(target_index)
            values.append(count)
            
            
            link_colors.append(node_colors[source_index])
    
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=45,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
            color=node_colors
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=link_colors
        )
    )])
    
    fig.update_layout(
        title_text="Hierarchical Financial Transactions Flow",
        font_size=10,
        height=900,
        width=1400
    )
    
    return fig

sankey_fig = create_hierarchical_sankey(df)

# Show the diagram
sankey_fig.show()


In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
import numpy as np
import colorsys

def create_hybrid_sankey_choropleth(df):
    # Make a copy of the dataframe to avoid modifying the original
    df_copy = df.copy()
    
    # For the 'Mt' column, we'll use transaction counts instead of trying to sum the string values
    # We'll count how many transactions are sent/received by each region
    
    # Extract unique regions for the choropleth map
    send_regions = df_copy['send region'].unique()
    receiver_regions = df_copy['receiver region'].unique()
    all_regions = np.unique(np.concatenate([send_regions, receiver_regions]))
    
    # Create a figure with subplots: choropleth map on top, sankey diagram below
    fig = sp.make_subplots(
        rows=2, cols=1,
        row_heights=[0.4, 0.6],
        specs=[[{"type": "scattergeo"}], [{"type": "sankey"}]],
        vertical_spacing=0.03
    )
    
    # Create a dictionary to store aggregated transaction counts by region
    region_totals = {}
    for region in all_regions:
        # Count of outgoing transactions
        outgoing = df_copy[df_copy['send region'] == region].shape[0]
        # Count of incoming transactions
        incoming = df_copy[df_copy['receiver region'] == region].shape[0]
        # Net flow (positive means net receiver, negative means net sender)
        net_flow = incoming - outgoing
        region_totals[region] = {
            'outgoing': outgoing,
            'incoming': incoming,
            'net_flow': net_flow,
            'total_volume': outgoing + incoming
        }
    
    # Calculate region centroids (simplified - in a real implementation, you would use actual geographic coordinates)
    # This is a placeholder mapping regions to coordinates
    region_coords = {
        'North America': {'lat': 40, 'lon': -100},
        'South America': {'lat': -20, 'lon': -60},
        'Latin America': {'lat': -15, 'lon': -70},
        'Europe': {'lat': 50, 'lon': 10},
        'Africa': {'lat': 0, 'lon': 20},
        'Asia': {'lat': 30, 'lon': 100},
        'Asia Pacific': {'lat': 25, 'lon': 120},
        'Southeast Asia': {'lat': 10, 'lon': 110},
        'South Asia': {'lat': 20, 'lon': 80},
        'Central Asia': {'lat': 45, 'lon': 70},
        'Middle East': {'lat': 25, 'lon': 45},
        'Caribbean': {'lat': 20, 'lon': -75},
        'Oceania': {'lat': -25, 'lon': 135},
        'Global': {'lat': 0, 'lon': 0}  # Center of the map
    }
    
    # For the Mt values, we need to use categories rather than numeric values
    # Create a dictionary mapping Mt values to numeric indices for the Sankey diagram
    mt_values = df_copy['Mt'].unique()
    mt_mapping = {mt: i for i, mt in enumerate(mt_values)}
    
    # Sankey diagram categories
    categories = ['send region', 'sender', 'Mt', 'receiver', 'receiver region']
    
    mappings = {}
    offset = 0
    
    for category in categories:
        unique_values = df_copy[category].unique()
        mappings[category] = {value: i + offset for i, value in enumerate(unique_values)}
        offset += len(unique_values)
    
    all_nodes = []
    node_colors = []
    
    def generate_colors(n):
        colors = []
        for i in range(n):
            hue = i / n
            lightness = 0.6
            saturation = 0.5
            r, g, b = [int(255 * x) for x in colorsys.hls_to_rgb(hue, lightness, saturation)]
            colors.append(f'rgba({r},{g},{b},0.8)')
        return colors
    
    # Generate node labels and collect counts for each node
    node_counts = {}
    
    for category in categories:
        unique_values = df[category].unique()
        
        if category in ['send region', 'receiver region']:
            # Use the same colors as in the map for regions
            category_colors = []
            for value in unique_values:
                category_colors.append(region_colors.get(value, 'rgba(150,150,150,0.8)'))
        else:
            category_colors = generate_colors(len(unique_values))
        
        for i, value in enumerate(unique_values):
            node_label = f"{category}: {value}"
            all_nodes.append(node_label)
            node_colors.append(category_colors[i])
            
            # Count occurrences for each node
            if category == 'Mt':
                # For Mt (transaction amount), use the value itself
                count = value
            else:
                count = df[df[category] == value].shape[0]
            
            node_counts[node_label] = count
    
    node_labels = [f"{label.split(': ')[0]}: {label.split(': ')[1]}" for label in all_nodes]
    
    sources = []
    targets = []
    values = []
    link_colors = []
    
    # Create links between categories for the Sankey diagram
    for i in range(len(categories) - 1):
        source_category = categories[i]
        target_category = categories[i+1]
        
        # Group data and count occurrences
        grouped = df_copy.groupby([source_category, target_category]).size().reset_index(name='count')
            
        for _, row in grouped.iterrows():
            source_value = row[source_category]
            target_value = row[target_category]
            count = row['count']
            
            source_index = all_nodes.index(f"{source_category}: {source_value}")
            target_index = all_nodes.index(f"{target_category}: {target_value}")
            
            sources.append(source_index)
            targets.append(target_index)
            values.append(count)
            
            # Set link color based on source node
            link_colors.append(node_colors[source_index])
    
    # Add Sankey diagram
    fig.add_trace(
        go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0.5),
                label=node_labels,
                color=node_colors
            ),
            link=dict(
                source=sources,
                target=targets,
                value=values,
                color=link_colors
            )
        ),
        row=2, col=1
    )
    
    # Update layout
    fig.update_layout(
        title_text="Financial Transactions Flow - Geographic Analysis",
        geo=dict(
            showland=True,
            landcolor='rgb(217, 217, 217)',
            coastlinecolor='rgb(120, 120, 120)',
            countrycolor='rgb(120, 120, 120)',
            showcountries=True,
            projection_type='natural earth',
            showocean=True,
            oceancolor='rgb(230, 230, 250)'
        ),
        height=1000,
        width=1200
    )
    
    # Update geo subplot
    fig.update_geos(
        resolution=50,
        showcoastlines=True,
        showland=True,
        showocean=True
    )
    
    return fig

# Example usage:
sankey_choropleth_fig = create_hybrid_sankey_choropleth(df)
sankey_choropleth_fig.show()

NameError: name 'df' is not defined

In [17]:
import pandas as pd
import random
import string

NUM_ROWS = 1000
NUM_UNIQUE_BANKS = 500  # To get 200+ unique SenderBc and ReceiverBC

country_codes = ['AE', 'JP', 'MW', 'US', 'CA', 'GB', 'DE', 'RU', 'BR', 'AU', 'ZA', 'AR', 'IN', 'SG', 'KR', 'MY', 'FR', 'ES', 'IT', 'MX']

country_region_map = {
    'US': 'North', 'CA': 'North', 'GB': 'North', 'DE': 'North', 'RU': 'North',
    'JP': 'North', 'CN': 'North', 'BR': 'South', 'AU': 'South', 'ZA': 'South', 'AR': 'South', 'MW': 'South',
    'AE': 'East', 'IN': 'East', 'SG': 'East', 'KR': 'East', 'MY': 'East',
    'FR': 'West', 'ES': 'West', 'IT': 'West', 'MX': 'West',
}

mx_mt_options = ['mx', 'mt']
categories = ['102', '103', '202', '203']
channels = ['mrb', 'rib', 'procashweb', 'procashmob', 'swf', 'fts', 'phub']

# To guarantee uniqueness of banks, generate a list of unique 4-letter codes
def generate_unique_banks(n):
    seen = set()
    banks = []
    while len(banks) < n:
        candidate = ''.join(random.choices(string.ascii_uppercase, k=4))
        if candidate not in seen:
            seen.add(candidate)
            banks.append(candidate)
    return banks

unique_bank_codes = generate_unique_banks(NUM_UNIQUE_BANKS)

# Function to generate BIC using unique bank code
def generate_bic_from_bank_code(bank_code, adcb=True):
    # ADCB format: 'ADCB' + bank_code + 'ADCB'
    # Non-ADCB format: 8 random letters + 3 letters
    if adcb:
        return 'ADCB' + bank_code + 'ADCB'
    else:
        return ''.join(random.choices(string.ascii_uppercase, k=8)) + ''.join(random.choices(string.ascii_uppercase, k=3))

def determine_direction(sender_bic, receiver_bic):
    if sender_bic.startswith('ADCB') and sender_bic[7:9] == 'CB':
        return 'internal'
    elif not receiver_bic.startswith('ADCB'):
        return 'outward'
    else:
        return 'inward'

def determine_currency(region_code):
    currency_map = {
        'US': 'USD', 'CA': 'CAD', 'GB': 'GBP', 'DE': 'EUR', 'RU': 'RUB',
        'JP': 'JPY', 'CN': 'CNY', 'BR': 'BRL', 'AU': 'AUD', 'ZA': 'ZAR',
        'AR': 'ARS', 'MW': 'MWK', 'AE': 'AED', 'IN': 'INR', 'SG': 'SGD',
        'KR': 'KRW', 'MY': 'MYR', 'FR': 'EUR', 'ES': 'EUR', 'IT': 'EUR',
        'MX': 'MXN'
    }
    return currency_map.get(region_code, 'USD')

data = []

for _ in range(NUM_ROWS):
    sender_is_adcb = random.random() < 0.7
    receiver_is_adcb = random.random() < 0.7
    
    # Pick unique bank codes for sender and receiver from the unique_bank_codes list
    sender_bank_code = random.choice(unique_bank_codes)
    receiver_bank_code = random.choice(unique_bank_codes)
    
    sender_bic = generate_bic_from_bank_code(sender_bank_code, adcb=sender_is_adcb)
    receiver_bic = generate_bic_from_bank_code(receiver_bank_code, adcb=receiver_is_adcb)
    
    sender_region = random.choice(country_codes)
    receiver_region = random.choice(country_codes)
    
    direction = determine_direction(sender_bic, receiver_bic)
    currency = determine_currency(sender_region)
    
    row = {
        "Sender": sender_bic,
        "Receiver": receiver_bic,
        "SenderRegion": sender_region,
        "ReceiverRegion": receiver_region,
        "SenderRegionGroup": country_region_map.get(sender_region, 'Unknown'),
        "ReceiverRegionGroup": country_region_map.get(receiver_region, 'Unknown'),
        "MX/MT": random.choice(mx_mt_options),
        "Category": random.choice(categories),
        "Year": random.randint(2015, 2024),
        "Currency": currency,
        "Direction": direction,
        "Amount": round(random.uniform(1000, 100000), 2),
        "SenderBc": sender_bic[4:8],
        "ReceiverBC": receiver_bic[4:8],
        "InitiatingChannel": random.choice(channels)
    }
    data.append(row)

df = pd.DataFrame(data)

# Check unique banks
print(f"Unique SenderBC count: {df['SenderBc'].nunique()}")
print(f"Unique ReceiverBC count: {df['ReceiverBC'].nunique()}")

df

Unique SenderBC count: 694
Unique ReceiverBC count: 664


Unnamed: 0,Sender,Receiver,SenderRegion,ReceiverRegion,SenderRegionGroup,ReceiverRegionGroup,MX/MT,Category,Year,Currency,Direction,Amount,SenderBc,ReceiverBC,InitiatingChannel
0,ADCBGIEKADCB,ADCBPHIXADCB,JP,IN,North,East,mx,102,2015,JPY,inward,35971.89,GIEK,PHIX,phub
1,ADCBWOWXADCB,ADCBXCNMADCB,MY,KR,East,East,mx,202,2020,MYR,inward,10499.53,WOWX,XCNM,mrb
2,ADCBZXZMADCB,ADCBPIGWADCB,DE,MW,North,South,mx,103,2018,EUR,inward,63003.71,ZXZM,PIGW,procashweb
3,ARABZOCBDQN,ADCBYKEIADCB,US,RU,North,North,mx,103,2024,USD,inward,70857.78,ZOCB,YKEI,procashmob
4,ADCBZFAHADCB,ADCBDULLADCB,IT,IN,West,East,mt,103,2021,EUR,inward,47609.54,ZFAH,DULL,mrb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,ADCBDOWLADCB,ADCBVDNFADCB,GB,ZA,North,South,mx,103,2022,GBP,inward,94702.08,DOWL,VDNF,swf
996,ADCBWNCZADCB,JBETOVCXNQL,AR,MY,South,East,mx,202,2016,ARS,outward,60014.89,WNCZ,OVCX,swf
997,ADCBXYSFADCB,ADCBVOPTADCB,MX,CA,West,North,mt,202,2022,MXN,inward,91831.35,XYSF,VOPT,mrb
998,ADCBHXELADCB,ADCBOHYNADCB,FR,AR,West,South,mt,202,2020,EUR,inward,48500.31,HXEL,OHYN,fts


In [18]:
df=df.drop(['Sender','Receiver'],axis=1)
df

Unnamed: 0,SenderRegion,ReceiverRegion,SenderRegionGroup,ReceiverRegionGroup,MX/MT,Category,Year,Currency,Direction,Amount,SenderBc,ReceiverBC,InitiatingChannel
0,JP,IN,North,East,mx,102,2015,JPY,inward,35971.89,GIEK,PHIX,phub
1,MY,KR,East,East,mx,202,2020,MYR,inward,10499.53,WOWX,XCNM,mrb
2,DE,MW,North,South,mx,103,2018,EUR,inward,63003.71,ZXZM,PIGW,procashweb
3,US,RU,North,North,mx,103,2024,USD,inward,70857.78,ZOCB,YKEI,procashmob
4,IT,IN,West,East,mt,103,2021,EUR,inward,47609.54,ZFAH,DULL,mrb
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,GB,ZA,North,South,mx,103,2022,GBP,inward,94702.08,DOWL,VDNF,swf
996,AR,MY,South,East,mx,202,2016,ARS,outward,60014.89,WNCZ,OVCX,swf
997,MX,CA,West,North,mt,202,2022,MXN,inward,91831.35,XYSF,VOPT,mrb
998,FR,AR,West,South,mt,202,2020,EUR,inward,48500.31,HXEL,OHYN,fts


In [19]:
import plotly.graph_objects as go

bins = [0, 10000, 50000, 100000, 200000]
labels = ['0-10k', '10k-50k', '50k-100k', '100k+']
df['AmountBin'] = pd.cut(df['Amount'], bins=bins, labels=labels, include_lowest=True)

# List of columns in order for Sankey flow
columns = [
    'SenderRegionGroup', 'ReceiverRegionGroup', 'SenderBc', 'ReceiverBC',
    'Currency', 'MX/MT', 'InitiatingChannel', 'Category', 'Year',
    'Direction', 'AmountBin'
]

# Build nodes
all_labels = []
label_indices = {}

def add_labels(vals, col_name):
    indices = []
    for v in vals:
        label = f"{col_name}: {v}"  # add column name as prefix
        if label not in label_indices:
            label_indices[label] = len(all_labels)
            all_labels.append(label)
        indices.append(label_indices[label])
    return indices


# Build links for each pair of consecutive columns
source = []
target = []
value = []

for i in range(len(columns)-1):
    col_source = columns[i]
    col_target = columns[i+1]
    
    grouped = df.groupby([col_source, col_target]).size().reset_index(name='count')
    
    source_indices = add_labels(grouped[col_source], col_source)
    target_indices = add_labels(grouped[col_target], col_target)
    
    source.extend(source_indices)
    target.extend(target_indices)
    value.extend(grouped['count'].tolist())


# Color coding: assign a color per node based on the column group (using Plotly qualitative palette)
import plotly.express as px
palette = px.colors.qualitative.Plotly  # 10 distinct colors

node_colors = []
color_map = {}
nodes_per_column = []

start_idx = 0
for col in columns:
    unique_vals = df[col].unique()
    nodes_per_column.append(len(unique_vals))
    for i, val in enumerate(unique_vals):
        # Assign color cycling through palette by column index
        color = palette[columns.index(col) % len(palette)]
        color_map[val] = color
        node_colors.append(color)
    start_idx += len(unique_vals)

# For links: color them based on their source node color
link_colors = []
for s in source:
    link_colors.append(node_colors[s])

# Create Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = all_labels,
        color = node_colors
    ),
    link = dict(
        source = source,
        target = target,
        value = value,
        color = link_colors
    )
)])

fig.update_layout(title_text="Payment Flow Sankey Diagram", font_size=10)
fig.show()





In [22]:
bins = [0, 10000, 50000, 100000, 200000]
labels = ['0-10k', '10k-50k', '50k-100k', '100k+']
df['AmountBin'] = pd.cut(df['Amount'], bins=bins, labels=labels, include_lowest=True)

# Efficient flow sequence
columns = [
    'SenderRegionGroup', 'ReceiverRegionGroup', 'Direction', 'MX/MT',
    'Currency', 'InitiatingChannel', 'Category', 'Year', 'SenderBc', 'ReceiverBC', 'AmountBin'
]

# Create nodes and links, prefix labels with column to avoid collapsing
all_labels = []
label_indices = {}

def add_labels(vals, col_name):
    indices = []
    for v in vals:
        label = f"{col_name}: {v}"
        if label not in label_indices:
            label_indices[label] = len(all_labels)
            all_labels.append(label)
        indices.append(label_indices[label])
    return indices

source = []
target = []
value = []

for i in range(len(columns) - 1):
    col_source = columns[i]
    col_target = columns[i + 1]

    grouped = df.groupby([col_source, col_target]).size().reset_index(name='count')

    source_indices = add_labels(grouped[col_source], col_source)
    target_indices = add_labels(grouped[col_target], col_target)

    source.extend(source_indices)
    target.extend(target_indices)
    value.extend(grouped['count'].tolist())

# Color nodes by column group using Plotly qualitative palette
palette = px.colors.qualitative.Plotly
node_colors = []
for label in all_labels:
    # Extract column name from label prefix
    col_name = label.split(':')[0]
    col_idx = columns.index(col_name)
    color = palette[col_idx % len(palette)]
    node_colors.append(color)

# Color links by source node color
link_colors = [node_colors[s] for s in source]

# Create Sankey figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_labels,
        color=node_colors,
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors,
    )
)])

fig.update_layout(title_text="Efficient Payment Flow Sankey Diagram", font_size=10)
fig.show()



