In [1]:
import pandas as pd
from string import punctuation

In [2]:
data = pd.read_csv("../data/NER_entities.csv")

In [3]:
stats = pd.DataFrame()
stats["anotatedByDoctors"] = data["label"].value_counts()
print(len(data))
stats

6034


Unnamed: 0_level_0,anotatedByDoctors
label,Unnamed: 1_level_1
NE symptom,2021
procedura,1466
medikace,1115
symptom,794
osobní anamnéza,442
NE osobní anamnéza,153
NE medikace,43


In [4]:
stats["withoutDuplicates"] = data.drop_duplicates()["label"].value_counts()
print(len(data.drop_duplicates()))
stats

3446


Unnamed: 0_level_0,anotatedByDoctors,withoutDuplicates
label,Unnamed: 1_level_1,Unnamed: 2_level_1
NE symptom,2021,1342
procedura,1466,699
medikace,1115,388
symptom,794,661
osobní anamnéza,442,246
NE osobní anamnéza,153,87
NE medikace,43,23


In [5]:
def clean(string):
    string = string.strip(" " + "".join(punctuation))
    if len(string) >= 2:
        string = string[0].lower() + string[1:] if string[1].islower() else string
    return " ".join(string.split())

def clean_table(db):
    db_copy = db.copy()
    db_copy["text"] = db_copy["text"].apply(clean)
    db_copy["text"] = db_copy["text"].drop_duplicates()
    return db_copy.dropna()

In [6]:
cleaned_table = clean_table(data)

stats["cleanedWithoutDuplicates"] = cleaned_table["label"].value_counts()
print(len(cleaned_table))
stats

2803


Unnamed: 0_level_0,anotatedByDoctors,withoutDuplicates,cleanedWithoutDuplicates
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NE symptom,2021,1342,1024
procedura,1466,699,596
medikace,1115,388,305
symptom,794,661,587
osobní anamnéza,442,246,214
NE osobní anamnéza,153,87,61
NE medikace,43,23,16


In [7]:
stats["anotatedByDoctors"] = stats["anotatedByDoctors"].astype("int64")
stats["withoutDuplicates"] = stats["withoutDuplicates"].astype("int64")
stats["cleanedWithoutDuplicates"] = stats["cleanedWithoutDuplicates"].astype("int64")

stats.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, NE symptom to NE medikace
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   anotatedByDoctors         7 non-null      int64
 1   withoutDuplicates         7 non-null      int64
 2   cleanedWithoutDuplicates  7 non-null      int64
dtypes: int64(3)
memory usage: 224.0+ bytes


In [8]:
stats.loc[["procedura", "medikace", "symptom",
           "osobní anamnéza", "NE medikace", "NE symptom",
           "NE osobní anamnéza"]].to_csv("../csv_output/records_stats.csv", index_label="category")

In [9]:
cleaned_table["text_len"] = cleaned_table["text"].apply(lambda x: len(x.split(" ")))

In [10]:
print(round(cleaned_table["text_len"].mean(), 2))
print(int(cleaned_table["text_len"].median()))
print(cleaned_table["text_len"].max())

2.99
3
18


In [11]:
stats_agg = pd.DataFrame()
stats_agg["mean"] = cleaned_table[["label", "text_len"]].groupby(by="label").mean().apply(lambda x: round(x, 2))
stats_agg["median"] = cleaned_table[["label", "text_len"]].groupby(by="label").median().astype("int64")
stats_agg["max"] = cleaned_table[["label", "text_len"]].groupby(by="label").max()
stats_agg

Unnamed: 0_level_0,mean,median,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NE medikace,2.06,2,4
NE osobní anamnéza,2.56,2,5
NE symptom,3.29,3,18
medikace,1.66,1,10
osobní anamnéza,2.87,3,9
procedura,2.76,2,14
symptom,3.5,3,14


In [12]:
stats_agg.loc[["procedura", "medikace", "symptom",
           "osobní anamnéza", "NE medikace", "NE symptom",
           "NE osobní anamnéza"]].to_csv("../csv_output/records_agg_stats.csv", index_label="category")

In [13]:
cleaned_table["num_abber"] = cleaned_table["text"].apply(lambda x: sum(1 for w in x.split(" ") if w.isupper()))
cleaned_table[(cleaned_table["num_abber"] != 0) & (cleaned_table["label"] != "medikace") & (cleaned_table["label"] != "NE medikace")]

Unnamed: 0,label,text,text_len,num_abber
1,procedura,neoadjuvantní CHT,2,1
3,symptom,označena SLU v levé axile,5,1
4,procedura,st.p. totální ME + SNB vlevo,6,2
6,procedura,založení TE l.sin,3,1
12,NE symptom,KI 100,2,1
...,...,...,...,...
5954,NE symptom,axila bez LAP,3,1
5955,symptom,vpravo prominující tu léze v rozhraní HKK,7,1
5985,symptom,solit drobné hypoechogení ložisko v S3 jater,7,1
6017,symptom,hypovit D,2,1


In [14]:
586 / len(cleaned_table)

0.20906171958615768

In [15]:
(cleaned_table[(cleaned_table["label"] != "medikace") & (cleaned_table["label"] != "NE medikace")]
             ["num_abber"].sum() /
 cleaned_table["text_len"].sum() * 100)

7.996180928511755

In [16]:
cleaned_table["num_shortcuts"] = cleaned_table["text"].apply(lambda x: sum(1 for w in x if "." == w))
cleaned_table[cleaned_table["num_shortcuts"] != 0]

Unnamed: 0,label,text,text_len,num_abber,num_shortcuts
4,procedura,st.p. totální ME + SNB vlevo,6,2,2
6,procedura,založení TE l.sin,3,1,1
18,symptom,mamma l.sin. dysplazie,3,0,2
22,osobní anamnéza,plastikou umb. kýly,3,0,1
25,NE symptom,dif bez palp.citl,3,0,1
...,...,...,...,...,...
5917,symptom,"hranič. zvýš. ALT, AST",4,2,2
5926,osobní anamnéza,st.p. HYE s AE bilat,5,2,2
5930,NE symptom,"pacientka KP komp., schopna výkonu v CA",7,2,1
5973,NE symptom,t.č. bez známek perit. dráždění,5,0,3


In [17]:
431 / len(cleaned_table) * 100

15.376382447377809

In [18]:
cleaned_table["num_shortcuts"].sum() / cleaned_table["text_len"].sum() * 100

7.1726936388590525

In [19]:
cleaned_table["short_total"] = cleaned_table["num_abber"] + cleaned_table["num_shortcuts"]

cleaned_table["short_total"] = cleaned_table.apply(lambda x: x.short_total if x.label != "medikace" and x.label != "NE medikace" else x.num_shortcuts, axis=1)
cleaned_table[cleaned_table["short_total"] != 0]

Unnamed: 0,label,text,text_len,num_abber,num_shortcuts,short_total
1,procedura,neoadjuvantní CHT,2,1,0,1
3,symptom,označena SLU v levé axile,5,1,0,1
4,procedura,st.p. totální ME + SNB vlevo,6,2,2,4
6,procedura,založení TE l.sin,3,1,1,2
12,NE symptom,KI 100,2,1,0,1
...,...,...,...,...,...,...
5973,NE symptom,t.č. bez známek perit. dráždění,5,0,3,3
5985,symptom,solit drobné hypoechogení ložisko v S3 jater,7,1,0,1
6008,symptom,občasně boelsti v obl. levého prsu,6,0,1,1
6017,symptom,hypovit D,2,1,0,1


In [20]:
904 / len(cleaned_table) * 100

32.25115947199429

In [21]:
cleaned_table["short_total"].sum() / cleaned_table["text_len"].sum() * 100

15.168874567370807