In [1]:
import sys
from importlib import reload

import polars as pl
from datatools import plotting as dtplot
from datatools import tabular as dttab

sys.path.append("..")

import plotting
import util
from src import text_process

# Re-import the project-local modules so that edits made on disk are picked up
# without restarting the kernel (same order as they were imported above).
for module in (util, plotting, text_process):
    reload(module)

# Presumably installs the shared plotly theme from datatools -- confirm there.
dtplot.set_plotly_template()

In [2]:
# Load the example table (one row per example, with parallel `tokens` / `tags`
# list columns) and preview its first five rows.
examples = util.load_examples()
examples.head(n=5)

name,lang,difficulty,tokens,tags,length
str,str,str,list[str],list[str],u32
"""shrt_pseudo""","""pseudo""","""easy""","[""x"", ""="", ""1""]","[""va"", ""opas"", ""nu""]",3
"""smplrr_json""","""json""","""easy""","[""["", ""2"", … ""]""]","[""brop"", ""nu"", … ""brcl""]",9
"""hllwrld_natural""","""natural""","""normal""","[""say"", "" "", """"Hello world""""]","[""kwio"", ""ws"", ""st""]",3
"""hllwrld_ruby""","""ruby""","""normal""","[""puts"", "" "", """"Hello World""""]","[""kwio"", ""ws"", ""st""]",3
"""smplndnttb_pseudo""","""pseudo""","""normal""","[""if"", "" "", … "")""]","[""kwfl"", ""ws"", … ""brcl""]",8


## tokens


In [3]:
# Flatten the per-example token lists into a single column, then tally how
# often each distinct token occurs.
all_tokens = examples["tokens"].explode()
token_counts = dttab.value_counts(all_tokens)
token_counts

tokens,tokens_count
str,u32
""" """,887
""" """,270
""",""",224
""")""",221
"""(""",221
…,…
"""""$x + $y = $z\n""""",1
"""""$a, $b""""",1
"""""""""",1
""" """,1


## which tags can each token have?


In [4]:
# Long format: one row per (token, tag) occurrence across all examples.
token_tag_pairs = examples.select(pl.col("tokens", "tags").explode())

# For every distinct token, gather the distinct tags it was labelled with and
# attach the token's overall frequency from `token_counts`.
tags_per_token = token_tag_pairs.group_by("tokens").agg(
    pl.col("tags").explode().unique()
)
token_to_tags = tags_per_token.join(token_counts, on="tokens")

# Number of distinct tags recorded for a token (reused by both filters below).
n_distinct_tags = pl.col("tags").list.len()

# Tokens that only ever appear with one tag, most frequent first.
single_tagged = (
    token_to_tags.filter(n_distinct_tags == 1)
    .sort("tokens_count", descending=True)
    .with_columns(pl.col("tags").list.get(0))  # unwrap the one-element list
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))  # exclude DET_TAGS
)

# Every remaining token, i.e. those whose tag list does not have length 1.
multi_tagged = token_to_tags.filter(n_distinct_tags != 1).sort(
    "tokens_count", descending=True
)

# Note: the single-tag count is taken after the DET_TAGS exclusion above.
print(f"found {len(single_tagged)} tokens with a single tag")
print(f"found {len(multi_tagged)} tokens with multiple tags")

# Preview the most frequent single-tagged tokens; display `multi_tagged`
# the same way to inspect the ambiguous tokens instead.
display(single_tagged.head(10))


found 548 tokens with a single tag
found 39 tokens with multiple tags


tokens,tags,tokens_count
str,str,u32
"""=""","""opas""",179
""".""","""sy""",129
""":""","""sy""",48
"""import""","""kwim""",25
"""i""","""va""",22
"""for""","""kwfl""",18
"""np""","""mo""",17
"""==""","""opcm""",17
"""if""","""kwfl""",16
"""return""","""kwfl""",15


## which tokens can each tag have?


In [5]:
# Tally how often each tag occurs over all examples.
all_tags = examples["tags"].explode()
tag_counts = dttab.value_counts(all_tags)

# Long format: one row per (token, tag) occurrence across all examples.
tag_token_pairs = examples.select(pl.col("tokens", "tags").explode())

# For every distinct tag, gather the distinct tokens that carried it
# (i.e. which tokens can each tag have?).
tokens_per_tag = tag_token_pairs.group_by("tags").agg(
    pl.col("tokens").explode().unique()
)

tags_to_tokens = (
    tokens_per_tag.join(tag_counts, on="tags")  # adds the per-tag frequency
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))  # exclude DET_TAGS
    .sort(pl.col("tokens").list.len())  # tags with the fewest distinct tokens first
)
display(tags_to_tokens.head(10))

tags,tokens,tags_count
str,list[str],u32
"""opas""","[""="", ""+="", ""<-""]",184
"""kwva""","[""let"", ""const"", … ""val""]",14
"""kwde""","[""class"", ""namespace"", … ""mod""]",8
"""opcm""","[""<"", "">="", … "">""]",22
"""bo""","[""False"", ""false"", … ""True""]",15
"""kwmo""","[""async"", ""pub"", … ""local""]",11
"""kwfn""","[""end"", ""function"", … ""fn""]",11
"""li""","[""on"", ""all"", … ""null""]",7
"""opun""","[""++"", ""-"", … ""+""]",23
"""kwio""","[""puts"", ""echo"", … ""clc""]",14


In [9]:
# Spot check: look up the row for the token "except" in the token -> tags table.
# (The recorded output below is empty, i.e. no such token was found.)
token_to_tags.filter(pl.col("tokens").eq("except"))

tokens,tags,tokens_count
str,list[str],u32
