In [14]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [7]:
# Load the tab-separated NER normalization results. The 'Unnamed: 0' column
# (presumably a saved row index -- confirm) is dropped right after reading.
ner_results_path = 'ner_norm_failure_20231017.csv'
df = pd.read_csv(ner_results_path, sep='\t').drop(columns='Unnamed: 0')
df.head(5)

Unnamed: 0,end,entity_group,score,start,word,match_score,concept_id,match_type
0,54,GENETIC,0.99999,37,HMG - CoA reductase,,,0
1,107,GENETIC,0.999983,100,OATP1B1,,normalize.gene:OATP1B1,60
2,127,GENETIC,0.527788,118,transport,,,0
3,318,GENETIC,0.999986,311,OATP1B1,,normalize.gene:OATP1B1,60
4,335,GENETIC,0.998929,321,SLCO1B1 521T > C,,,0


In [10]:
# Number of extracted entities per entity group, most frequent first.
entity_group_counts = df['entity_group'].value_counts()
entity_group_counts

entity_group
GENETIC     2300
CHEMICAL    1625
DISEASE      231
Name: count, dtype: int64

## Graph: extracted entities by type and normalization status

In [16]:
# Label each entity by normalization outcome:
# match_type == 0 -> 'Not Normalized', any other value -> 'Normalized'.
is_unmatched = df['match_type'].eq(0)
df['match_category'] = is_unmatched.map({True: 'Not Normalized', False: 'Normalized'})
df.head(10)

Unnamed: 0,end,entity_group,score,start,word,match_score,concept_id,match_type,match_category
0,54,GENETIC,0.99999,37,HMG - CoA reductase,,,0,Not Normalized
1,107,GENETIC,0.999983,100,OATP1B1,,normalize.gene:OATP1B1,60,Normalized
2,127,GENETIC,0.527788,118,transport,,,0,Not Normalized
3,318,GENETIC,0.999986,311,OATP1B1,,normalize.gene:OATP1B1,60,Normalized
4,335,GENETIC,0.998929,321,SLCO1B1 521T > C,,,0,Not Normalized
5,391,GENETIC,0.998284,376,SLCO1B1 521 C / C,,,0,Not Normalized
6,44,CHEMICAL,0.999914,37,HMG - CoA,,,0,Not Normalized
7,78,CHEMICAL,0.999996,77,r,,rxcui:1091,60,Normalized
8,89,CHEMICAL,0.987065,78,##osuvastatin,,,0,Not Normalized
9,173,CHEMICAL,0.999998,172,r,,rxcui:1091,60,Normalized


In [17]:
# Count entities per (entity group, normalization category) pair, then pivot
# the categories into columns; combinations with no rows are filled with 0.
category_counts = df.groupby(['entity_group', 'match_category']).size()
grouped = category_counts.unstack(fill_value=0).reset_index()
grouped.head(10)

match_category,entity_group,Normalized,Not Normalized
0,CHEMICAL,966,659
1,DISEASE,115,116
2,GENETIC,1129,1171


In [26]:
# Stacked bar chart: one bar per entity group, split by normalization status.
fig = go.Figure()

# One color per category, in the same order as the category list below.
colors = ['#5e3c99', '#b2abd2']

for category, bar_color in zip(['Normalized', 'Not Normalized'], colors):
    bar_trace = go.Bar(
        x=grouped['entity_group'],
        y=grouped[category],
        name=category,
        marker_color=bar_color,
    )
    fig.add_trace(bar_trace)

fig.update_layout(
    barmode='stack',
    # Titles and labels
    title='Total Number of Extracted Entities by Type and Normalization Status',
    xaxis_title='Entity Group',
    yaxis_title='Count',
    legend_title='Match Type Category',
    # Font sizes: `font` is the figure-wide base size; the title, axis and
    # legend entries below set their own sizes explicitly.
    title_font=dict(size=24),
    font=dict(size=18),
    xaxis=dict(title_font=dict(size=20), tickfont=dict(size=16)),
    yaxis=dict(title_font=dict(size=20), tickfont=dict(size=16)),
    legend=dict(font=dict(size=16)),
)

fig.show()

In [27]:
# Save the figure as a high-quality PNG: a 1200x900 layout rendered at 2x scale.
# `pio` (plotly.io) is imported in the notebook's top import cell rather than
# here, so every dependency is visible in one place.
# NOTE(review): static image export needs an export backend installed for
# plotly (e.g. kaleido) -- confirm the environment provides one.
image_path = "Entity_Group_Bar_Chart.png"
pio.write_image(fig, image_path, format='png', scale=2, width=1200, height=900)


In [35]:
# Stats: percentage of normalized vs. not-normalized entities per entity group.
entity_totals = grouped['Normalized'] + grouped['Not Normalized']
for count_col, pct_col in (
    ('Normalized', 'percentage_norm'),
    ('Not Normalized', 'percentage_notnorm'),
):
    grouped[pct_col] = (grouped[count_col] / entity_totals) * 100

In [36]:
# Display the per-group counts together with the percentage columns.
grouped

match_category,entity_group,Normalized,Not Normalized,percentage_norm,percentage_notnorm
0,CHEMICAL,966,659,59.446154,40.553846
1,DISEASE,115,116,49.78355,50.21645
2,GENETIC,1129,1171,49.086957,50.913043
