In [None]:
import sys
from importlib import reload
import numpy as np

import polars as pl
from datatools import plotting as dtplot
from datatools import tabular as dttab

sys.path.append("..")

import src.util as util
from src import text_process

reload(util)
reload(text_process)
dtplot.set_plotly_template()

In [3]:
examples = util.load_examples_json(filter_lang=["rust"])
examples.head(5)

Loaded 15 examples
No duplicates found :)


difficulty,tokens,tags,name,lang,id,length
str,list[str],list[str],str,str,str,u32
"""normal""","[""pub"", "" "", … "";""]","[""kwmo"", ""ws"", … ""pu""]","""lbrsfl""","""rust""","""lbrsfl_rust""",20
"""normal""","[""let"", "" "", … "";""]","[""kwva"", ""ws"", … ""pu""]","""prsbgnt""","""rust""","""prsbgnt_rust""",13
"""ambiguous""","[""use"", "" "", … "";""]","[""kwim"", ""ws"", … ""pu""]","""sstmnts""","""rust""","""sstmnts_rust""",28
"""ambiguous""","[""use"", "" "", … "";""]","[""kwim"", ""ws"", … ""pu""]","""mprtfns""","""rust""","""mprtfns_rust""",20
"""normal""","[""// Move to the next pivot row"", "" "", … ""}""]","[""cofl"", ""nl"", … ""brcl""]","""stpssrwlp""","""rust""","""stpssrwlp_rust""",26


## tokens


In [4]:
token_counts = dttab.value_counts(examples["tokens"].explode())
token_counts

tokens,tokens_count
str,u32
""" """,109
""" """,32
""";""",22
""")""",22
"""(""",22
…,…
"""// Move to the next pivot row""",1
"""+=""",1
"""+""",1
""" """,1


## which tags can each token have?


In [5]:
token_to_tags = (
    examples.select(pl.col("tokens", "tags").explode())
    .group_by("tokens")
    .agg(pl.col("tags").explode().unique())  # what tags can the token have?
).join(token_counts, on="tokens")

single_tagged = (
    token_to_tags.filter(pl.col("tags").list.len() == 1)
    .sort("tokens_count", descending=True)
    .with_columns(pl.col("tags").list[0])
    .filter(pl.col("tags").is_in(text_process.DET_TAGS).not_())
)  # exclude DET TAGS

multi_tagged = token_to_tags.filter(pl.col("tags").list.len() != 1).sort(
    "tokens_count", descending=True
)
print(f"found {len(single_tagged)} tokens with a single tag")
print(f"found {len(multi_tagged)} tokens with multiple tags")

display(single_tagged.head(10))
# display(multi_tagged.head(10))


found 95 tokens with a single tag
found 5 tokens with multiple tags


tokens,tags,tokens_count
str,str,u32
"""::""","""sy""",14
""":""","""sy""",11
""".""","""sy""",10
"""=""","""opas""",9
""">""","""sy""",7
"""<""","""sy""",7
"""|""","""sy""",6
"""let""","""kwva""",6
"""HashSet""","""cl""",6
"""&""","""opun""",6


## which tokens can each tag have?


In [6]:
tag_counts = dttab.value_counts(examples["tags"].explode())

tags_to_tokens = (
    (
        examples.select(pl.col("tokens", "tags").explode())
        .group_by("tags")
        .agg(pl.col("tokens").explode().unique())  # what tags can the token have?
    )
    .join(tag_counts, on="tags")
    .filter(pl.col("tags").is_in(text_process.DET_TAGS).not_())
    .sort(pl.col("tokens").list.len())
)
display(tags_to_tokens.head(10))

tags,tokens,tags_count
str,list[str],u32
"""kwfn""","[""fn""]",4
"""an""","[""'a""]",3
"""opbi""","[""+""]",1
"""clco""","[""Some""]",1
"""at""","[""x""]",1
"""opas""","[""+="", ""=""]",10
"""kwva""","[""const"", ""let""]",8
"""kwmo""","[""pub"", ""mut""]",8
"""opun""","[""&"", ""*""]",7
"""kwim""","[""crate"", ""use""]",4


In [11]:
token_to_tags.filter(pl.col("tokens") == "!")

tokens,tags,tokens_count
str,list[str],u32


## duplicates


In [8]:
examples.glimpse()

Rows: 15
Columns: 7
$ difficulty       <str> 'normal', 'normal', 'ambiguous', 'ambiguous', 'normal', 'normal', 'normal', 'ambiguous', 'ambiguous', 'normal'
$ tokens     <list[str]> ['pub', ' ', 'mod', ' ', 'factorize', ';', '\n', 'pub', ' ', 'mod', ' ', 'linalg', ';', '\n', 'pub', ' ', 'mod', ' ', 'util', ';'], ['let', ' ', 'n', ' ', '=', ' ', 'BigUint', '::', 'from', '(', '92434447339770015548544881401_u128', ')', ';'], ['use', ' ', 'num', '::', '{', 'integer', '::', 'Roots', ',', ' ', 'BigUint', ',', ' ', 'One', ',', ' ', 'Zero', '}', ';', '\n', 'use', ' ', 'rayon', '::', 'prelude', '::', '*', ';'], ['use', ' ', 'crate', '::', '{', 'linalg', '::', 'left_null', ',', ' ', 'math_fun', '::', 'gcd_euclid_bigu', ',', ' ', 'util', '::', 'format_time', '}', ';'], ['// Move to the next pivot row', '\n', 'row', ' ', '+=', ' ', '1', ';', '\n\n', "// Stop if we've processed all rows", '\n', 'if', ' ', 'row', ' ', '>=', ' ', 'm', ' ', '{', '\n', '    ', 'break', ';', '\n', '}'], ['/// Maximum val

In [9]:
print("exact same tokens:")
display(examples.group_by("tokens").agg("id", pl.len()).filter(pl.col("len") > 1))
print("exact same tags:")
display(examples.group_by("tags").agg("id", pl.len()).filter(pl.col("len") > 1))
display(
    examples.group_by("tags")
    .agg("id", pl.len())
    .filter(pl.col("len") > 1)["id"]
    .explode()
    .to_list()
)

exact same tokens:


tokens,id,len
list[str],list[str],u32


exact same tags:


tags,id,len
list[str],list[str],u32


[]

## Language specific rules!


In [10]:
# in java, "System" is a class