In [10]:
import sys
from importlib import reload
import numpy as np

import polars as pl
from datatools import plotting as dtplot
from datatools import tabular as dttab

sys.path.append("..")

import plotting
import src.util as util
from src import text_process
from src.data_functions import make_example_groups, data_split

# Re-execute the local project modules so that source edits made since the
# kernel first imported them take effect without a kernel restart.
for _module in (util, plotting, text_process):
    reload(_module)

# Presumably installs the datatools plotly template as the default -- confirm
# against datatools.plotting.
dtplot.set_plotly_template()

In [11]:
# Load the labelled examples through the project's util helper.
examples = util.load_examples_json()
# Preview the first five rows.
examples.head(5)

Loaded 206 examples


difficulty,tokens,tags,name,lang,id,length
str,list[str],list[str],str,str,str,u32
"""easy""","[""x"", ""="", ""1""]","[""va"", ""opas"", ""nu""]","""shrt""","""pseudo""","""shrt_pseudo""",3
"""easy""","[""["", ""2"", … ""]""]","[""brop"", ""nu"", … ""brcl""]","""smplrr""","""json""","""smplrr_json""",9
"""normal""","[""say"", "" "", """"Hello world""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""natural""","""hllwrld_natural""",3
"""normal""","[""puts"", "" "", """"Hello World""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""ruby""","""hllwrld_ruby""",3
"""normal""","[""if"", "" "", … "")""]","[""kwfl"", ""ws"", … ""brcl""]","""smplndnttb""","""pseudo""","""smplndnttb_pseudo""",8


## tokens


In [12]:
# Flatten every example's token list into one long column and count how often
# each distinct token occurs across all examples.
token_counts = dttab.value_counts(examples.get_column("tokens").explode())
token_counts

tokens,tokens_count
str,u32
""" """,1495
""" """,474
""")""",395
"""(""",395
""",""",296
…,…
"""""$x + $y = $z\n""""",1
"""""$a, $b""""",1
"""""# questions loaded""""",1
"""!""",1


## which tags can each token have?


In [13]:
# For every distinct token, collect the set of tags it has been labelled with
# and attach the token's overall frequency from `token_counts`.
token_to_tags = (
    examples.select(pl.col("tokens", "tags").explode())  # one row per token occurrence
    .group_by("tokens")
    .agg(pl.col("tags").explode().unique())
    .join(token_counts, on="tokens")
)

# Tokens that were only ever seen with one tag, most frequent first. The
# one-element tag list is unwrapped to a plain string, and tokens whose tag is
# listed in text_process.DET_TAGS are dropped -- so the "single tag" count
# printed below does not include those tokens.
single_tagged = (
    token_to_tags.filter(pl.col("tags").list.len().eq(1))
    .sort(by="tokens_count", descending=True)
    .with_columns(pl.col("tags").list.first())
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))
)

# Every remaining token, i.e. those whose tag list does not have exactly one
# entry, again ordered by frequency.
multi_tagged = token_to_tags.filter(pl.col("tags").list.len().ne(1)).sort(
    by="tokens_count", descending=True
)

print(f"found {len(single_tagged)} tokens with a single tag")
print(f"found {len(multi_tagged)} tokens with multiple tags")

# Only the unambiguous tokens are shown here; multi_tagged.head(10) can be
# displayed the same way to inspect the ambiguous ones.
display(single_tagged.head(10))


found 796 tokens with a single tag
found 70 tokens with multiple tags


tokens,tags,tokens_count
str,str,u32
"""=""","""opas""",263
""".""","""sy""",206
""":""","""sy""",89
"""i""","""va""",48
"""if""","""kwfl""",38
"""return""","""kwfl""",34
"""import""","""kwim""",33
"""for""","""kwfl""",31
"""==""","""opcm""",29
"""::""","""sy""",29


## which tokens can each tag have?


In [14]:
# Overall frequency of every tag across all examples.
tag_counts = dttab.value_counts(examples.get_column("tags").explode())

# For every tag, the distinct tokens that carry it plus the tag's frequency.
# Tags listed in text_process.DET_TAGS are left out, and rows are ordered by
# the number of distinct tokens, smallest first.
tags_to_tokens = (
    examples.select(pl.col("tokens", "tags").explode())
    .group_by("tags")
    .agg(pl.col("tokens").explode().unique())  # which tokens can the tag have?
    .join(tag_counts, on="tags")
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))
    .sort(by=pl.col("tokens").list.len())
)
display(tags_to_tokens.head(10))

tags,tokens,tags_count
str,list[str],u32
"""coml""","[""/**  * Get thse thing  */""]",1
"""coil""","[""# array([2, 5], dtype=int32)"", ""# array([9, 5], dtype=int32)"", ""# array([[1, 9, 3], [4, 5, 6]], dtype=int32)""]",3
"""kwva""","[""val"", ""let"", … ""const""]",35
"""bo""","[""True"", ""False"", … ""FALSE""]",28
"""kwfn""","[""fn"", ""end"", … ""func""]",21
"""li""","[""null"", ""xy"", … ""tight""]",9
"""opas""","[""<-"", ""/="", … ""*=""]",284
"""kwim""","[""package"", ""echo"", … ""as""]",70
"""opcm""","[""=="", ""!="", … ""<""]",52
"""kwde""","[""extends"", ""throws"", … ""use""]",17


In [15]:
# Look up which tags were recorded for the token "with".
token_to_tags.filter(pl.col("tokens").eq("with"))

tokens,tags,tokens_count
str,list[str],u32
"""with""","[""kwfl""]",1


## Language-specific rules!


In [16]:
# in java, "System" is a class