In [48]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd
import numpy as np

In [59]:
# Load the tab-separated input table.
df = pd.read_csv("output.txt", sep='\t')

# For each metric (AF, SB): take the per-position maximum, then drop the
# positions whose maximum is exactly zero.
columns = ["AF", "SB"]
results = {}
for i, col in enumerate(columns):
    # DataFrame.astype returns a new frame, so the float cast never assigns
    # into a slice of `df` (which can raise SettingWithCopyWarning).
    df_col = df.dropna(subset=[col])[["Position", col]].astype({col: float})
    result = df_col.groupby(by=["Position"], as_index=False).max(numeric_only=True)
    result = result[result[col] != 0]  # keep non-zero maxima only
    results[f"df_err_v{i+1}"] = result

# Expose the per-metric tables under the names the later cells use.
df_err_v1 = results["df_err_v1"]  # AF
df_err_v2 = results["df_err_v2"]  # SB

```python
# Load the tab-separated input table.
df = pd.read_csv("output.txt", sep='\t')

# For each metric (AF, SB): discard zero entries first, then average the
# remaining values per position.
columns = ["AF", "SB"]
results = {}
for i, col in enumerate(columns):
    # DataFrame.astype returns a new frame, so the float cast never assigns
    # into a slice of `df` (which can raise SettingWithCopyWarning).
    df_col = df.dropna(subset=[col])[["Position", col]].astype({col: float})
    df_col = df_col[df_col[col] != 0]  # keep non-zero values only
    result = df_col.groupby(by=["Position"], as_index=False).mean(numeric_only=True)
    results[f"df_err_v{i+1}"] = result

# Expose the per-metric tables under the names the later cells use.
df_err_v1 = results["df_err_v1"]  # AF
df_err_v2 = results["df_err_v2"]  # SB
```

In [60]:
# Display the per-position AF table (columns: Position, AF).
df_err_v1

Unnamed: 0,Position,AF
23,25,0.125
26,28,0.012959
33,35,0.000804
67,3218,0.015274
71,3731,0.002933
106,8651,0.001396
214,22502,0.000331
227,22995,0.018018
236,23604,0.031197
263,25317,0.006368


In [33]:
# List the distinct AF values present in the per-position AF table.
df_err_v1['AF'].unique()

array([0.125     , 0.012959  , 0.000804  , 0.015274  , 0.002933  ,
       0.001396  , 0.000331  , 0.018018  , 0.01145133, 0.006368  ,
       0.0019772 , 0.000972  , 0.02451   , 0.061293  ])

In [8]:
# Display the per-position SB table (columns: Position, SB).
df_err_v2

Unnamed: 0,Position,SB
0,3218,1.0
1,3731,65.0
2,8651,314.0
3,22502,2.0
4,25317,10.0
5,27800,9.0
6,28083,21.0
7,29867,1.0


In [62]:
# Distribution plot (histogram + fitted normal curve + rug) of AF at the
# error positions. AF is multiplied by 100 before plotting, so the x-axis is
# in percent (assuming AF is stored as a fraction -- TODO confirm).
hist_data = [(df_err_v1['AF'] * 100).tolist()]
group_labels = ['error position']  # legend name of the dataset
colors = ['#7FA6EE']

fig = ff.create_distplot(hist_data, group_labels, curve_type='normal', colors=colors)

# Styling shared by both axes.
axis_style = dict(
    showline=True, showgrid=False,
    linewidth=2, linecolor='gray', ticks='outside',
    tickfont=dict(family='Arial', size=12, color='black'),
)

# Figure layout.
fig.update_layout(
    title="Allele Frequency",
    yaxis=dict(title='Probability density', **axis_style),
    # The label carries the unit because the plotted values are AF * 100.
    xaxis=dict(title='Allele Frequency (%)', **axis_style),
    legend=dict(traceorder="normal", font=dict(size=12)),
    plot_bgcolor='white',
    # Grid colour/width only become visible if showgrid is switched on.
    yaxis_gridcolor='lightgray', yaxis_gridwidth=0.5,
    xaxis_gridcolor='lightgray', xaxis_gridwidth=0.5,
)
fig.show()

In [45]:
# Read the two tab-separated tables that later cells plot as the
# 'all position' series (AF_RATIO.txt for AF, SB_RATIO.txt for SB).
tsv_options = dict(sep='\t')
df_all_1 = pd.read_csv('AF_RATIO.txt', **tsv_options)
df_all_2 = pd.read_csv('SB_RATIO.txt', **tsv_options)

In [66]:
# Keep only the rows whose ERROR_all value is not exactly zero.
nonzero_mask = df_all_1['ERROR_all'].ne(0)
df_all_1 = df_all_1[nonzero_mask]

In [68]:
from plotly.offline import iplot, plot

# Distribution plot comparing AF at the error positions with ERROR_all over
# all positions. Both series are multiplied by 100 before plotting, so the
# x-axis is in percent (assuming the inputs are fractions -- TODO confirm).
hist_data = [(df_err_v1['AF'] * 100).tolist(), (df_all_1['ERROR_all'] * 100).tolist()]
group_labels = ['error position', 'all position']  # legend names of the datasets
colors = ['#7FA6EE', '#B8F7D4']

fig = ff.create_distplot(hist_data, group_labels, curve_type='normal', colors=colors)

# Styling shared by both axes.
axis_style = dict(
    showline=True, showgrid=False,
    linewidth=2, linecolor='gray', ticks='outside',
    tickfont=dict(family='Arial', size=12, color='black'),
)

# Figure layout.
fig.update_layout(
    title="Allele Frequency",
    yaxis=dict(title='Probability density', **axis_style),
    # The label carries the unit because the plotted values are scaled by 100.
    xaxis=dict(title='Allele Frequency (%)', **axis_style),
    legend=dict(traceorder="normal", font=dict(size=12)),
    plot_bgcolor='white',
    # Grid colour/width only become visible if showgrid is switched on.
    yaxis_gridcolor='lightgray', yaxis_gridwidth=0.5,
    xaxis_gridcolor='lightgray', xaxis_gridwidth=0.5,
)
# Write the figure to a standalone HTML file; the returned path is the cell output.
plot(fig, filename='my_plot.html')

'my_plot.html'

In [46]:
# Distribution plot comparing SB at the error positions with ERROR_all from
# the SB table over all positions.
hist_data = [df_err_v2['SB'].tolist(), df_all_2['ERROR_all'].tolist()]
group_labels = ['error position', 'all position']  # legend names of the datasets
colors = ['#7FA6EE', '#B8F7D4']

fig = ff.create_distplot(hist_data, group_labels, curve_type='normal', colors=colors, bin_size=.5)

# Styling shared by both axes.
axis_style = dict(
    showline=True, showgrid=False,
    linewidth=2, linecolor='gray', ticks='outside',
    tickfont=dict(family='Arial', size=12, color='black'),
)

# Figure layout.
fig.update_layout(
    title="Phred-scaled strand bias at this position",
    # create_distplot normalises the histogram to a probability density by
    # default, so the y-axis is a density rather than a raw count.
    yaxis=dict(title='Probability density', **axis_style),
    # The plotted values are SB values, so the axis is labelled accordingly.
    # NOTE(review): confirm that ERROR_all in SB_RATIO.txt is on the same scale.
    xaxis=dict(title='Phred-scaled strand bias (SB)', **axis_style),
    legend=dict(traceorder="normal", font=dict(size=12)),
    plot_bgcolor='white',
    # Grid colour/width only become visible if showgrid is switched on.
    yaxis_gridcolor='lightgray', yaxis_gridwidth=0.5,
    xaxis_gridcolor='lightgray', xaxis_gridwidth=0.5,
)
fig.show()

### Rug plot
The rug plot shows the original discrete distribution of the data.

In [47]:
# Force a full garbage-collection pass. The integer shown as the cell output
# is gc.collect()'s return value (the number of unreachable objects found).
import gc
gc.collect()

1570