In [None]:
import seaborn as sns
import pandas as pd

import matplotlib.pyplot as plt
from ukbb_recessive.regression.regressions import read_results_excel
import ukbb_recessive.regression.plotting as plotting
from matplotlib.cm import get_cmap
import numpy as np

# Global plot styling for the whole notebook; run once before any figure is created.
sns.set_style("whitegrid")

# Add fonts
# Registers the font directory through the project helper, then applies the
# project's matplotlib configuration (both helpers live in ukbb_recessive.regression.plotting).
plotting.add_fonts(['../../../data/fonts'])
plotting.configure_matplotlib()

# NOTE(review): 'Arimo' is presumably supplied by the font directory registered above — confirm.
plt.rc('font', size=plotting.SMALL_SIZE, family='Arimo') # controls default text sizes

# Prepare datasets

## CR data

In [None]:
# Display names for panel labels; labels that are not listed here are kept as they are
# by the `.get(x, x)` lookup that consumes this mapping.
renaming_dict_panel = dict([
    ('Derm', 'Dermatologic'),
    ('ID-total', 'ID'),
    ('Immune_system', 'Immune system'),
    ('Overlaps', 'Multi-system'),
    ('Skeletal+Craniofacial', 'Skeletal'),
])

In [None]:
# Per-panel table (tab-separated); panel labels are mapped to their display names.
cr = pd.read_csv("../../../data/tables/CR.csv", sep="\t")
cr['panel'] = cr['panel'].map(lambda label: renaming_dict_panel.get(label, label))

# Sort key: first-cousin value relative to the baseline value per panel.
# NOTE(review): the 2.1 added to the denominator is unexplained in this notebook — confirm its purpose.
cr['rel'] = cr['affected_offsprings_first_cousins'].div(cr['affected_offsprings'].add(2.1))

# Largest ratio first; the row order is what the plots further down iterate over.
cr = cr.sort_values(by='rel', ascending=False)

In [None]:
# Summed first-cousin at-risk couples divided by summed at-risk couples, over every panel except 'ID'.
non_id_rows = cr[cr['panel'] != 'ID']
print ("Non-ID CR:", non_id_rows['at_risk_couples_first_cousins'].sum()/non_id_rows['at_risk_couples'].sum())

In [None]:
# Same ratio as for the non-ID panels, restricted to the 'ID' panel rows.
id_rows = cr[cr['panel'] == 'ID']
print ("ID CR:", id_rows['at_risk_couples_first_cousins'].sum()/id_rows['at_risk_couples'].sum())

## All PLPs

In [None]:
def is_lof(consequence_terms):
    """Tell whether any of four listed consequence terms occurs in `consequence_terms`.

    Membership is checked with `in`, so a string argument is searched for the
    term as a substring while a list/set argument is searched for an equal element.

    Returns:
        bool: True if a splice acceptor, splice donor, stop-gained or frameshift
        term is found, otherwise False.
    """
    lof_terms = (
        "splice_acceptor_variant",
        "splice_donor_variant",
        "stop_gained",
        "frameshift_variant",
    )
    return any(term in consequence_terms for term in lof_terms)

# def add_hets_bin(plps, col='hets', out_col='hets_bin', n_bins=4):

#     plps[out_col] = pd.qcut(plps[col], 4)

#     mybinlabels = {i: f'{int(i.left)+1}-{int(i.right)}' for i in plps.hets_bin.cat.categories}
#     last = plps.hets_bin.cat.categories[-1]
#     mybinlabels[last] = f'> {int(last.left)}'
#     plps[out_col] = plps[out_col].cat.rename_categories(mybinlabels)

#     return plps

def add_AC_bin(AC):
    """Map a count to one of the labels '1', '2', '3-5' or '> 5'.

    Args:
        AC: numeric count (the notebook passes hets + 2*homs).

    Returns:
        str: the bin label for counts of at least 1.
        float: NaN when `AC` is below 1 or is itself NaN, so that such rows are
        treated as missing instead of being silently placed in '3-5' (values
        below 1) or '> 5' (NaN), which is what a plain if-chain would do.
    """
    # `not AC >= 1` is also True for NaN, because every comparison with NaN is False.
    if not AC >= 1:
        return float('nan')
    if AC == 1:
        return '1'
    if AC == 2:
        return '2'
    if AC <= 5:
        return '3-5'
    return '> 5'

# Gene-to-panel table: header-less file whose two columns are named here.
gene_panel = pd.read_csv(".../450k/regions/gene-panel-gencode-v34.txt", header=None)
gene_panel.columns = ['gene', 'panel']

# Collapse the labels to two groups: keep 'ID-total', relabel everything else as 'non-ID'.
gene_panel['panel'] = gene_panel['panel'].where(gene_panel['panel'] == 'ID-total', 'non-ID')

In [None]:
# Variant table (tab-separated); the shape is printed before any filtering.
plps = pd.read_csv(".../450k/plp_selection/basic/new_gene_names/new_freq/new_relatedness/all_chr_total_presumable_plps_HFE_final_sorted.txt", sep='\t')
print (plps.shape)

# Inner join: keeps only rows whose gene is in the panel table and attaches the 'panel' label.
plps = pd.merge(plps, gene_panel, on='gene', how='inner')

# An earlier variant of this cell restricted to homs == 0 and binned `hets` by quantiles;
# the current analysis bins the combined count instead.
# AC = hets + 2*homs (each 'homs' entry is counted twice).
plps['AC'] = plps['homs'] * 2 + plps['hets']
plps['AC_bin'] = plps['AC'].map(add_AC_bin)

# Plot

In [None]:
def print_vals(pct, allvals):
    """Convert a pie-wedge percentage back to an absolute value and format it.

    Args:
        pct: percentage of the total (0-100), as matplotlib passes to `autopct`.
        allvals: the values whose sum is the total behind `pct`.

    Returns:
        str: the rounded absolute value as an integer string.
    """
    total = np.sum(allvals)
    count = int(np.round(pct/100.*total))
    return format(count, "d")

def plot_pie(subfig):
    """Draw two donut charts from the notebook-level `cr` table plus a shared legend.

    The left donut uses `cr['affected_offsprings']` ("Non-consanguineous"), the
    right one `cr['affected_offsprings_first_cousins']` ("First cousins"). Each
    donut has a thin outer ring in which only the first row of `cr` is coloured,
    and an inner ring with one coloured, value-annotated wedge per row.

    Args:
        subfig: matplotlib (sub)figure that is split into a chart row and a legend row.
    """
    chart_row, legend_row = subfig.subfigures(nrows=2, ncols=1, height_ratios=[0.95, 0.1], wspace=0.01)
    left_ax, right_ax = chart_row.subplots(1, 2, gridspec_kw={'width_ratios': [1, 1]})

    # Diverging palette, red end first; one colour per wedge in row order of `cr`.
    colours = ["#d43d51", "#de6069", "#e67f83", "#ec9c9d", "#f0b8b8", "#f1d4d4", "#f1f1f1",
               "#d0d9e0", "#afc3cf", "#8fadbf", "#6d97af", "#49829f", "#176d8f"]

    # Outer ring: first wedge keeps its colour, all remaining wedges are white.
    id_colors = [colours[0]] + ['white'] * (cr.shape[0] - 1)

    def draw_donut(axis, column, title):
        """Draw outer and inner ring for `column` on `axis`; return the inner wedges."""
        axis.pie(cr[column], radius=1.1, colors=id_colors, counterclock=False, startangle=90,
                 wedgeprops=dict(width=0.4, edgecolor='w', linewidth=0.2))

        counts = cr[column].values
        wedges, _labels, _autotexts = axis.pie(
            counts, radius=1, colors=colours, counterclock=False, startangle=90,
            wedgeprops=dict(width=0.4, edgecolor='w', linewidth=0.2),
            autopct=lambda pct: print_vals(pct, counts), pctdistance=0.85)

        axis.set_title(title)
        return wedges

    patches = draw_donut(left_ax, 'affected_offsprings', "Non-consanguineous")
    draw_donut(right_ax, 'affected_offsprings_first_cousins', "First cousins")

    # Legend lives on its own invisible axis; handles come from the left inner ring.
    legend_ax = legend_row.subplots(1, 1)
    legend_ax.legend(
        patches, cr['panel'].values, loc="lower left",
        bbox_to_anchor=(0, 0.2, 1, .102),
        frameon=False,
        mode="expand",
        ncol=3,
        labelspacing=0.1,
        markerfirst=False,
        fontsize=plotting.SMALL_SIZE,
    )
    legend_ax.axis('off')

In [None]:
import matplotlib.ticker as ticker

# Figure canvas of 8.8 cm x 3 cm; matplotlib expects inches, hence the conversion
# factor. `k` scales both sides together.
cm = 1/2.54  # centimeters in inches
k = 1.
fig = plt.figure(constrained_layout=True, figsize=(8.8*cm*k, 3*cm*k))

# Two side-by-side subfigures; the left one is twice as wide as the right one.
# NOTE(review): `hspace` is given for a single-row layout — confirm `wspace` was not intended.
subfigs = fig.subfigures(nrows=1, ncols=2, width_ratios=[1,0.5],  hspace=0.01)

# The pie panel is disabled here, so the left subfigure stays empty in the saved file.
# plot_pie(subfigs[0])


ax = subfigs[1].subplots(1, 1)


# Earlier version of the histogram, binned on a 'gnomAD_bin' column:
# sns.histplot(ax=ax, data=plps,  x='gnomAD_bin', palette='viridis', linewidth=0.5, hue='panel',
#               multiple='dodge', shrink=0.8, binwidth=1, stat='percent', discrete=True, common_norm=False)

# One colour per hue level of 'panel'.
colors = [ '#d43d51', '#094074',]

# Dodged bars of AC_bin per panel; stat='percent' with common_norm=False normalises
# each panel separately. Sorting by panel and AC presumably fixes the order in which
# hue levels and x categories first appear — TODO confirm.
sns.histplot(ax=ax, data=plps.sort_values(by=['panel', 'AC']),  x='AC_bin', palette=colors, linewidth=0.5, hue='panel',
              multiple='dodge', shrink=0.8, binwidth=1, stat='percent', discrete=True, common_norm=False)
# plt.xticks(rotation=45)
# Project helper; called here with a y-range of 0-50.
plotting.configure_axis(ax, ylim=[0, 50],)
# plotting.configure_axis(ax, ylim=[0, 80],)

# Writes the current figure as PDF.
plt.savefig("../../../data/plots/figure_6.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Number of rows in `plps` for each value of 'panel'.
plps[['panel']].value_counts()

In [None]:
from scipy.stats import chi2_contingency

def chi2_test(plps, hets_col='hets_bin', hets_bin='1-1'):
    """Print a chi-square test comparing how often one bin occurs in two panel groups.

    Rows are split into `panel == 'ID-total'` and everything else. Within each
    group, missing values of `hets_col` are dropped and the rows equal to
    `hets_bin` are counted as successes. The resulting 2x2 table is passed to
    `scipy.stats.chi2_contingency`.

    Args:
        plps: DataFrame with a 'panel' column and the column named by `hets_col`.
        hets_col: name of the column that holds the bin labels.
        hets_bin: the bin label counted as a success.

    Returns:
        None. Counts, percentages, the table and the test results are printed.
    """
    def tally(mask):
        # (rows equal to the requested bin, rows with a non-missing bin) for one group
        observed = plps.loc[mask, hets_col].dropna().values
        return np.sum(observed == hets_bin), len(observed)

    hits_non_id, total_non_id = tally(plps['panel'] != 'ID-total')
    print ("Non-id:", hits_non_id, total_non_id, hits_non_id/total_non_id*100)

    hits_id, total_id = tally(plps['panel'] == 'ID-total')
    print ("ID:", hits_id, total_id, hits_id/total_id*100)
    print()

    # Rows: non-ID, ID. Columns: in the bin, not in the bin.
    contingency_table = np.array([
        [hits_non_id, total_non_id - hits_non_id],
        [hits_id, total_id - hits_id],
    ])

    print (contingency_table)
    print()

    statistic, p_value, dof, _expected = chi2_contingency(contingency_table)
    print("Chi-square statistic:", statistic)
    print("P-value:", p_value)
    print("Degrees of freedom:", dof)

In [None]:
# Compare the share of rows in AC bin '1' between the non-ID and ID-total groups.
chi2_test(plps, 'AC_bin', '1')

In [None]:
# Same comparison for the '> 5' AC bin.
chi2_test(plps, 'AC_bin', '> 5')

In [None]:
import pandas as pd
import plotly.graph_objects as go

def create_sankey(df):
    """Show a plotly Sankey figure with one node per row of `df['panel']`.

    For every row two links are added, one weighted by `affected_offsprings`
    and one by `affected_offsprings_first_cousins`.

    NOTE(review): source and target of every link are the same node, so the
    diagram consists of self-loops only — confirm this is the intended layout.

    Args:
        df: DataFrame with 'panel', 'affected_offsprings' and
            'affected_offsprings_first_cousins' columns.
    """
    node_labels = df['panel'].tolist()

    # Panel name -> node position; a repeated name resolves to its last position.
    node_of = {label: position for position, label in enumerate(node_labels)}

    sources, targets, values = [], [], []
    rows = zip(df['panel'], df['affected_offsprings'], df['affected_offsprings_first_cousins'])
    for panel, value_before, value_after in rows:
        node = node_of[panel]
        # Two links per panel, both starting and ending on the panel's own node.
        sources.extend([node, node])
        targets.extend([node, node])
        values.extend([value_before, value_after])

    sankey_trace = go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
        ),
        link=dict(source=sources, target=targets, value=values),
    )
    fig = go.Figure(sankey_trace)

    fig.update_layout(title_text="Sankey Diagram of Clients per Channel", font_size=10)
    fig.show()

# Render the Sankey figure for the `cr` table.
create_sankey(cr)

In [None]:
import pandas as pd
import plotly.graph_objects as go

def create_sankey(df):
    """Show a Sankey figure of per-panel shares: panels -> hub -> panels.

    Each panel appears twice. On the left it feeds a central hub with its share
    of `affected_offsprings`; on the right it receives from the hub its share of
    `affected_offsprings_first_cousins`. Both columns are converted to
    percentages of their own total, so the flow into and out of the hub is 100
    on each side.

    Args:
        df: DataFrame with 'panel', 'affected_offsprings' and
            'affected_offsprings_first_cousins' columns.

    Raises:
        ValueError: if a required column is missing, or if one of the two value
            columns does not have a positive total (shares would be undefined).
    """
    required = ['panel', 'affected_offsprings', 'affected_offsprings_first_cousins']
    if not all(col in df.columns for col in required):
        raise ValueError("DataFrame must contain 'panel', 'affected_offsprings', and 'affected_offsprings_first_cousins' columns")

    panels = [str(panel) for panel in df['panel']]
    n_panels = len(panels)

    before = df['affected_offsprings'].astype(float)
    after = df['affected_offsprings_first_cousins'].astype(float)
    if not (before.sum() > 0 and after.sum() > 0):
        raise ValueError("'affected_offsprings' and 'affected_offsprings_first_cousins' must each have a positive total")

    # Link values must be numeric; the panel names are node labels, never values.
    before_pct = (before / before.sum() * 100).tolist()
    after_pct = (after / after.sum() * 100).tolist()

    # Node layout: [0, n) left-hand panels, n the hub, (n, 2n] right-hand panels.
    hub = n_panels
    node_labels = panels + ["All panels"] + panels
    sources = list(range(n_panels)) + [hub] * n_panels
    targets = [hub] * n_panels + list(range(hub + 1, hub + 1 + n_panels))
    values = before_pct + after_pct

    # Create the Sankey diagram
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    ))

    # NOTE(review): the title is generic boilerplate and does not describe this data; kept unchanged.
    fig.update_layout(title_text="Sankey Diagram of Clients per Channel", font_size=10)
    fig.show()

# Render the Sankey figure for the `cr` table.
create_sankey(cr)

In [None]:
# NOTE(review): repeats the call from the previous cell with the same argument — likely a leftover.
create_sankey(cr)

In [None]:
import plotly.graph_objects as go

# Stand-alone demo with hard-coded numbers: three "before" nodes, each linked
# to its "after" counterpart. It does not read any of the notebook's data.
fig = go.Figure(data=[go.Sankey(
    node={
        "pad": 15,          # gap between nodes
        "thickness": 20,    # node thickness
        "line": {"color": "black", "width": 0.5},
        "label": ["Dimension A (Before)", "Dimension B (Before)", "Dimension C (Before)",
                  "Dimension A (After)", "Dimension B (After)", "Dimension C (After)"],
        # each "after" node reuses the colour of its "before" node
        "color": ["#008080", "#FF6347", "#2F4F4F",
                  "#008080", "#FF6347", "#2F4F4F"],
    },
    link={
        "source": [0, 1, 2],
        "target": [3, 4, 5],
        "value": [0.47, 0.16, 0.37],
        "color": ["#008080", "#FF6347", "#2F4F4F"],   # links take their source node's colour
        "hoverlabel": {"bgcolor": "white"},
    },
)])

# White canvas and a title
fig.update_layout(
    title_text="Proportion Flow: Before vs. After",
    font={"size": 14},
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()


In [None]:
import plotly.graph_objects as go

# Node labels "<panel> <value>" built from `cr`, once per side of the diagram.
# (The "before" labels previously ended in a stray comma that showed up in the figure.)
categories_before = [f"{panel} {round(val, 2)}" for panel, val in zip(cr['panel'].tolist(), cr['affected_offsprings'].tolist())]
categories_after = [f"{panel} {round(val, 2)}" for panel, val in zip(cr['panel'].tolist(), cr['affected_offsprings_first_cousins'].tolist())]

values_before = cr['affected_offsprings'].values
values_after = cr['affected_offsprings_first_cousins'].values

# Express each side as percentages of its own total, so both sides carry a flow of 100.
values_before = (values_before/values_before.sum()*100).tolist()
values_after = (values_after/values_after.sum()*100).tolist()


# Node order: all "before" categories, the intermediate node, all "after" categories
node_labels = categories_before + ["Intermediate Node"] + categories_after

# Index of intermediate node
intermediate_node_idx = len(categories_before)

# Source nodes of the first half of the links: every "before" category
sources = list(range(len(categories_before)))

# Target nodes of all links: the intermediate node for the first half,
# then each "after" category for the second half
targets = [intermediate_node_idx] * len(categories_before) + list(range(intermediate_node_idx + 1, intermediate_node_idx + 1 + len(categories_after)))

# Link values in the same order: before -> intermediate, then intermediate -> after
link_values = values_before + values_after

# Create the Sankey plot
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        color="blue"
    ),
    link=dict(
        # second half of the links starts at the intermediate node
        source=sources + [intermediate_node_idx] * len(categories_after),
        target=targets,
        value=link_values,
        # green for before -> intermediate, blue for intermediate -> after
        color=["rgba(44, 160, 44, 0.6)" if i < len(values_before) else "rgba(31, 119, 180, 0.6)" for i in range(len(link_values))]
    )
))

# Update the layout
fig.update_layout(
    title_text="Sankey Diagram: Before to After with Intermediate Node",
    font_size=12,
    plot_bgcolor='white',
    paper_bgcolor='white'
)

# Write the figure to a PDF file in the working directory (nothing is displayed inline).
fig.write_image("sankey_diagram.pdf")


In [None]:
# Display the full `cr` table.
cr

In [None]:
# Inspect the Sankey link targets built in the cell that writes sankey_diagram.pdf.
targets