## <center>Generation of density charts for AF and COV</center>


| **Label** | **start time** | **finish time** | **last modified** |
|:--------------:|:-----------:|:-----------:|:----------------:|
|   Project 1   |  2023-07-10 |  2023-07-19 |   2023-07-19     |

In [1]:
import os
import pandas as pd
import numpy as np

import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
from scipy.stats import gaussian_kde
from plotly.offline import iplot, plot

# 1. For Allele Frequency

In [2]:
# 0. Locations of the tab-separated tables that carry the 'AF' column
Folder_all = '/nfs/research/goldman/zihao/Datas/p1_errorsProject_NEW/Folder_dataProcessing/ANNOT_dataProcessing_allPos.txt'
Folder_err = '/nfs/research/goldman/zihao/Datas/p1_errorsProject_NEW/Folder_dataProcessing/ANNOT_dataProcessing_errorPos.txt'


def _load_af_table(path):
    """Read a tab-separated table and discard rows that cannot be used.

    Positive and negative infinity in the 'AF' column are first turned into
    NaN; afterwards every row holding a NaN in any column is dropped.
    """
    table = pd.read_csv(path, sep='\t')
    table['AF'] = table['AF'].replace([np.inf, -np.inf], np.nan)
    return table.dropna()


# 1. All positions: load, preview the first rows, report the remaining row count
df_all = _load_af_table(Folder_all)
print(df_all.head())
print('The number of data contained： ', df_all.shape[0])
print('\n'.join(['===================================================================================='] * 2))

# 2. Error positions: same treatment
df_err = _load_af_table(Folder_err)
print(df_err.head())
print('The number of data contained： ', df_err.shape[0])

In [6]:
# 3.Plotting
# Draws the AF distribution of error positions vs. all positions as a
# two-row figure (KDE density curves on top, horizontal box plots below)
# and saves it as a PNG.

# AF is converted to percent once; the same lists feed both subplots.
# (An earlier variant of this cell kept only values with AF * 100 < 1.)
af_pct_err = (df_err['AF'] * 100).tolist()
af_pct_all = (df_all['AF'] * 100).tolist()
hist_data = [af_pct_err, af_pct_all]

group_labels = ['Error Positions', 'All Positions'] # name of the dataset
colors = ['#835AF1', '#B8F7D4']

# Row 1 takes 75 % of the height (density), row 2 the rest (box plots);
# both rows share the x axis.
fig = make_subplots(rows=2, cols=1, row_heights=[0.75, 0.25], shared_xaxes=True)

# 1.Add Density Plot subplot
# NOTE(review): the KDE is evaluated on 1000 evenly spaced points between the
# data minimum and maximum, while the x axis below only shows 0-1. If the data
# extend far beyond 1 %, few of those points fall inside the visible window -
# confirm the curve resolution is adequate.
for data, color in zip(hist_data, colors):
    kde = gaussian_kde(data)
    x_vals = np.linspace(min(data), max(data), 1000)
    y_vals = kde(x_vals)

    fig.add_trace(go.Scatter(x=x_vals, y=y_vals, mode='lines', fill='tozeroy', showlegend=False, line_color=color),
                  row=1, col=1)

# 2.Add Box Plot subplots (same percent values as the density curves)
fig.add_trace(go.Box(x=af_pct_all,
                     orientation='h', name=group_labels[1]), row=2, col=1)
fig.add_trace(go.Box(x=af_pct_err,
                     orientation='h', name=group_labels[0]), row=2, col=1)

# 3.Setting up the graphic layout
fig.update_layout(
    title="Allele Frequency density map over sample positions",
    yaxis=dict(
        title='Density',
        showline=True, showgrid=False,
        linewidth=2, linecolor='gray', ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='black',
        )
    ),
    xaxis=dict(
        title='Allele Frequency %',
        range=[0, 1],  # visible window: 0 to 1 on the percent scale
        showline=True, showgrid=False,
        linewidth=2, linecolor='gray', ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='black',
        )
    ),
    legend=dict(
        traceorder="normal",
        font=dict(size=12),
    ),
    plot_bgcolor='white',
    yaxis_gridcolor='lightgray', yaxis_gridwidth=0.5,
    xaxis_gridcolor='lightgray', xaxis_gridwidth=0.5,
)

save_path = 'Figure/Plot_for_AF_1.png'
fig_path = os.path.abspath(save_path)

# Create the output directory up front so the export does not fail on a
# fresh checkout after the KDE work has already been done.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)

pio.write_image(fig, save_path)
print("AF's diagram has been saved at:", fig_path)

# 2. For Coverage

In [4]:
# 0. Locations of the tab-separated coverage tables
Folder_all = "/nfs/research/goldman/zihao/Datas/p1_errorsProject_NEW/Folder_dataProcessing/COV_dataProcessing_allPos.txt"
Folder_err = "/nfs/research/goldman/zihao/Datas/p1_errorsProject_NEW/Folder_dataProcessing/COV_dataProcessing_errorPos.txt"

# 1. All positions: load, preview the first rows, report the row count
df_all = pd.read_csv(Folder_all, sep='\t')
print(df_all.head())
print('The number of data contained： ', df_all.shape[0])
print('\n'.join(['===================================================================================='] * 2))

# 2. Error positions: same treatment (no cleaning is applied to either table)
df_err = pd.read_csv(Folder_err, sep='\t')
print(df_err.head())
print('The number of data contained： ', df_err.shape[0])

In [5]:
# 3.Plotting
# Draws the coverage-ratio distribution of error positions vs. all positions
# as a two-row figure (KDE density curves on top, horizontal box plots below)
# on a logarithmic x axis and saves it as a PNG.

# NOTE(review): the two tables are indexed with differently-cased column names
# ('COV_Ratio' vs 'Cov_RATIO'); confirm both match the actual file headers.
cov_err = df_err['COV_Ratio'].tolist()
cov_all = df_all['Cov_RATIO'].tolist()
hist_data = [cov_err, cov_all]
group_labels = ['Error Positions', 'All Positions'] # name of the dataset
colors = ['#835AF1', '#B8F7D4']

# Row 1 takes 75 % of the height (density), row 2 the rest (box plots);
# both rows share the x axis.
fig = make_subplots(rows=2, cols=1, row_heights=[0.75, 0.25], shared_xaxes=True)

# 1.Add Density Plot subplot
# NOTE(review): the KDE grid is evenly spaced on the linear scale but is shown
# on a log axis, so the low end of the axis is sampled sparsely - confirm the
# curve resolution there is adequate.
for data, color in zip(hist_data, colors):
    kde = gaussian_kde(data)
    x_vals = np.linspace(min(data), max(data), 1000)
    y_vals = kde(x_vals)
    fig.add_trace(go.Scatter(x=x_vals, y=y_vals, mode='lines', fill='tozeroy', showlegend=False, line_color=color), row=1, col=1)

# 2.Add Box Plot subplots (same values as the density curves)
fig.add_trace(go.Box(x=cov_all, orientation='h', name=group_labels[1]), row=2, col=1)
fig.add_trace(go.Box(x=cov_err, orientation='h', name=group_labels[0]), row=2, col=1)


fig.update_yaxes(range=[0, 2], row=1, col=1)

# Log-axis ranges are passed as log10 of the data values: from 0.01 up to the
# largest ratio found in either dataset. Computed once, applied to both rows.
log_x_range = [np.log10(0.01), np.log10(np.max(np.concatenate(hist_data)))]
for subplot_row in (1, 2):
    fig.update_xaxes(type='log', range=log_x_range, row=subplot_row, col=1)

# 3.Setting up the graphic layout
fig.update_layout(
    title="Coverage density map over sample positions",
    yaxis=dict(
        title='Density',
        showline=True, showgrid=False,
        linewidth=2, linecolor='gray', ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='black',
        )
    ),

    xaxis=dict(
        title='Ratio of coverage vs sample mean coverage',
        type='log',  # Setting the x-axis to a logarithmic scale
        showline=True, showgrid=False,
        linewidth=2, linecolor='gray', ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='black',
        ),
    ),
    legend=dict(
        traceorder="normal",
        font=dict(size=12),
    ),

    plot_bgcolor='white',
    yaxis_gridcolor='lightgray', yaxis_gridwidth=0.5,
    xaxis_gridcolor='lightgray', xaxis_gridwidth=0.5,
)
save_path = 'Figure/Plot_for_COV.png'
fig_path = os.path.abspath(save_path)

# Create the output directory up front so the export does not fail on a
# fresh checkout after the KDE work has already been done.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)

pio.write_image(fig, save_path)
print("COV's diagram has been saved at:", fig_path)