In [212]:
import sys
from importlib import reload
import numpy as np

import polars as pl
from datatools import plotting as dtplot
from datatools import tabular as dttab

# Add the parent directory to the import path so the local `src` package below
# can be imported.
# NOTE(review): assumes the notebook is run from a folder one level below the
# project root — confirm.
sys.path.append("..")

import src.util as util
from src import text_process

# Re-import the local modules so that edits made on disk are picked up without
# restarting the kernel.
reload(util)
reload(text_process)
# NOTE(review): presumably installs a shared plotly theme for later figures —
# confirm in datatools.plotting.
dtplot.set_plotly_template()

In [213]:
# Load the example set. filter_lang=None presumably disables the per-language
# filter (the preview shows several different languages) — confirm in src.util.
examples = util.load_examples_json(filter_lang=None)
# Preview the first five rows to check the schema.
examples.head(n=5)

Loaded 296 examples
No duplicates found :)


difficulty,tokens,tags,name,lang,id,length
str,list[str],list[str],str,str,str,u32
"""easy""","[""x"", ""="", ""1""]","[""va"", ""opas"", ""nu""]","""shrt""","""pseudo""","""shrt_pseudo""",3
"""easy""","[""["", ""2"", … ""]""]","[""brop"", ""nu"", … ""brcl""]","""smplrr""","""json""","""smplrr_json""",9
"""normal""","[""say"", "" "", """"Hello world""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""natural""","""hllwrld_natural""",3
"""normal""","[""puts"", "" "", """"Hello World""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""ruby""","""hllwrld_ruby""",3
"""normal""","[""if"", "" "", … "")""]","[""kwfl"", ""ws"", … ""brcl""]","""smplndnttb""","""pseudo""","""smplndnttb_pseudo""",8


## tokens


In [214]:
# Flatten the per-example token lists into a single column (one row per token
# occurrence) and count how often each distinct token appears.
token_counts = dttab.value_counts(examples.get_column("tokens").explode())
token_counts

tokens,tokens_count
str,u32
""" """,1882
""" """,587
""")""",548
"""(""",548
"""=""",324
…,…
"""""""""Container""""""""",1
"""!""",1
""" """,1
""" """,1


## which tags can each token have?


In [215]:
# One row per token occurrence (tokens and tags are exploded side by side),
# then the distinct tags seen for each token, joined with the token frequency.
token_to_tags = (
    examples.select(pl.col("tokens", "tags").explode())
    .group_by("tokens")
    .agg(pl.col("tags").explode().unique())  # what tags can the token have?
    .join(token_counts, on="tokens")
)

# Tokens that only ever carry one tag, most frequent first. The one-element
# tag list is unwrapped into a plain string, after which tokens whose tag is
# listed in text_process.DET_TAGS are excluded.
single_tagged = (
    token_to_tags.filter(pl.col("tags").list.len() == 1)
    .sort("tokens_count", descending=True)
    .with_columns(pl.col("tags").list.get(0))
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))  # exclude DET TAGS
)

# Every remaining token (tag list length different from 1), most frequent first.
multi_tagged = (
    token_to_tags.filter(pl.col("tags").list.len() != 1)
    .sort("tokens_count", descending=True)
)

# Note: the single-tag count is taken after the DET_TAGS exclusion above.
print(f"found {single_tagged.height} tokens with a single tag")
print(f"found {multi_tagged.height} tokens with multiple tags")

display(single_tagged.head(10))
# display(multi_tagged.head(10))


found 1036 tokens with a single tag
found 110 tokens with multiple tags


tokens,tags,tokens_count
str,str,u32
""":""","""sy""",132
"""i""","""va""",59
"""if""","""kwfl""",47
"""return""","""kwfl""",44
"""::""","""sy""",42
"""for""","""kwfl""",40
"""in""","""kwop""",38
"""->""","""sy""",37
"""==""","""opcm""",33
"""import""","""kwim""",32


## which tokens can each tag have?


In [216]:
# How often each tag occurs across all examples.
tag_counts = dttab.value_counts(examples.get_column("tags").explode())

# Reverse mapping: for each tag, the distinct tokens it has been assigned to,
# joined with the tag frequency. Tags listed in text_process.DET_TAGS are
# skipped, and rows are ordered from the fewest distinct tokens to the most.
tags_to_tokens = (
    examples.select(pl.col("tokens", "tags").explode())
    .group_by("tags")
    .agg(pl.col("tokens").explode().unique())  # what tokens can the tag have?
    .join(tag_counts, on="tags")
    .filter(~pl.col("tags").is_in(text_process.DET_TAGS))
    .sort(pl.col("tokens").list.len())
)
display(tags_to_tokens.head(10))

tags,tokens,tags_count
str,list[str],u32
"""an""","[""@override"", ""'a""]",4
"""coml""","[""/**  * Double a number  *  * @param {number} x The number  * @return {number} x times two  */"", ""/**  * Get thse thing  */""]",2
"""kwva""","[""val"", ""const"", … ""let""]",46
"""shfl""","[""-nr"", ""--inodes"", … ""-d""]",4
"""kwfn""","[""def"", ""func"", … ""function""]",38
"""bo""","[""False"", ""false"", … ""true""]",35
"""opas""","[""="", ""*="", … ""<-""]",351
"""kwim""","[""as"", ""import"", … ""use""]",73
"""kwde""","[""mod"", ""php"", … ""class""]",20
"""coil""","[""// std::ops::RangeInclusive"", ""// std::ops::RangeToInclusive"", … ""// same as `const []`""]",7


## specific examples


In [217]:
# Look up which tags the token "as" has been given.
token_to_tags.filter(pl.col("tokens").eq("as"))

tokens,tags,tokens_count
str,list[str],u32
"""as""","[""kwop"", ""kwim""]",15


In [218]:
# List every distinct token that has been tagged "kwty".
tags_to_tokens.filter(pl.col("tags").eq("kwty")).get_column("tokens").to_list()

[['int',
  'long',
  'float',
  'string',
  'list',
  'u64',
  'code',
  'char',
  '_',
  'bool',
  'usize',
  'div',
  'tuple',
  'pre',
  'void',
  'i32',
  'double',
  'str']]

## search for a token


In [219]:
# Keep only the examples whose token list contains the searched token, then
# print each one: its id, the source text rebuilt from the tokens, and its tags.
token_hits = examples.filter(pl.col("tokens").list.contains("Constructor"))
for ex in token_hits.iter_rows(named=True):
    source_text = "".join(ex["tokens"])
    print("\n" + ex["id"], "-" * 30)
    print(source_text)
    print(ex["tags"])


cntxmn_python ------------------------------
>>> with mymod.my_contextmanager():
...     a = mymod.submod.Constructor(mymod.get(33))
['sy', 'ws', 'kwfl', 'ws', 'mo', 'sy', 'fnas', 'brop', 'brcl', 'sy', 'nl', 'sy', 'ws', 'va', 'ws', 'opas', 'ws', 'mo', 'sy', 'mo', 'sy', 'clco', 'brop', 'mo', 'sy', 'fnas', 'brop', 'nu', 'brcl', 'brcl']


## duplicates


In [220]:
# Compact overview of the frame: row/column counts plus, for each column, its
# dtype and the first few values.
examples.glimpse()

Rows: 296
Columns: 7
$ difficulty       <str> 'easy', 'easy', 'normal', 'normal', 'normal', 'normal', 'easy', 'normal', 'normal', 'normal'
$ tokens     <list[str]> ['x', '=', '1'], ['[', '2', ',', ' ', '3', ',', ' ', '4', ']'], ['say', ' ', '"Hello world"'], ['puts', ' ', '"Hello World"'], ['if', ' ', 'true', '\n', '\t', 'something', '(', ')'], ['if', ' ', 'true', '\n', '  ', 'something', '(', ')'], ['x', ' ', '=', ' ', '337', '\n', 'y', ' ', '=', ' ', '99', '\n', 'z', ' ', '=', ' ', 'x', ' ', '+', ' ', 'y'], ['var', ' ', 'zz', ',', ' ', 'xy', ' ', 'int', ' ', '=', ' ', '11', ',', ' ', '33'], ['if', ' ', 'x', ' ', '==', ' ', '5', ':', '\n', '    ', 'print', '(', '"five"', ')'], ['while', ' ', 'True', ':', '\n', '    ', 'print', '(', '"loop!"', ')']
$ tags       <list[str]> ['va', 'opas', 'nu'], ['brop', 'nu', 'pu', 'ws', 'nu', 'pu', 'ws', 'nu', 'brcl'], ['kwio', 'ws', 'st'], ['kwio', 'ws', 'st'], ['kwfl', 'ws', 'bo', 'nl', 'id', 'fnfr', 'brop', 'brcl'], ['kwfl', 'ws', 'bo', 'nl', 'id',

In [221]:
def rows_sharing(frame: pl.DataFrame, column: str) -> pl.DataFrame:
    """Return the groups of rows in `frame` that share an identical `column` value.

    Each output row holds one shared value, the list of example ids that carry
    it (`id`) and the size of the group (`len`). Values that occur only once
    are dropped. `maintain_order=True` keeps the groups in first-appearance
    order, so repeated runs produce the same row order.
    """
    return (
        frame.group_by(column, maintain_order=True)
        .agg("id", pl.len())
        .filter(pl.col("len") > 1)
    )


# Compute each grouping once and reuse it for every view below.
same_tokens = rows_sharing(examples, "tokens")
same_tags = rows_sharing(examples, "tags")

print("exact same tokens:")
display(same_tokens)
print("exact same tags:")
display(same_tags)
# Flat list of the ids involved, taken from the very table displayed just
# before it so both views list the groups in the same order.
display(same_tags["id"].explode().to_list())

exact same tokens:


tokens,id,len
list[str],list[str],u32


exact same tags:


tags,id,len
list[str],list[str],u32
"[""kwio"", ""ws"", ""st""]","[""hllwrld_natural"", ""hllwrld_ruby"", ""hlwrldd_bash""]",3
"[""kwfl"", ""ws"", … ""brcl""]","[""fhs_php"", ""rqsths_php""]",2
"[""kwfl"", ""ws"", … ""brcl""]","[""smplndnttb_pseudo"", ""smplndntsp_pseudo""]",2
"[""kwim"", ""ws"", … ""brcl""]","[""tldldxmplmprt_python"", ""crssmpr_python""]",2


['smplndnttb_pseudo',
 'smplndntsp_pseudo',
 'tldldxmplmprt_python',
 'crssmpr_python',
 'hllwrld_natural',
 'hllwrld_ruby',
 'hlwrldd_bash',
 'fhs_php',
 'rqsths_php']

## finding (tag) sequences


In [222]:
# Tag sequence to look for, written in the same "|"-delimited form as the
# tag string built below.
query = "|cl|brop"

# Serialise each example's tags as "|tag1|tag2|...|" so that a plain substring
# search finds consecutive tags; the match is literal, not a regex.
res = (
    examples.with_columns(
        ("|" + pl.col("tags").list.join("|") + "|").alias("tagstr")
    )
    .filter(pl.col("tagstr").str.contains(query, literal=True))
)
print(f"found {res.height} matches")
for i, ex in enumerate(res.iter_rows(named=True)):
    tagstr = ex["tagstr"]
    print(f"\n{i}: " + ex["id"], "-" * 30)
    print("".join(ex["tokens"]))
    print(tagstr)
    # Sanity check: every row that passed the filter should contain the query.
    print(query in tagstr)

found 3 matches

0: fntr_python ------------------------------
def make_item_list(ids: Iterable[int]) -> list[str]:
    return [f"item {x}" for x in ids]
|kwfn|ws|fnfr|brop|pa|sy|ws|cl|brop|kwty|brcl|brcl|ws|sy|ws|kwty|brop|kwty|brcl|sy|nl|id|kwfl|ws|brop|opun|st|ws|kwfl|ws|va|ws|kwop|ws|pa|brcl|
True

1: gntr_python ------------------------------
def gen(n: int) -> Iterator[int]:
    i = 0
    while i < n:
        yield i
        i += 1
|kwfn|ws|fnfr|brop|pa|sy|ws|kwty|brcl|ws|sy|ws|cl|brop|kwty|brcl|sy|nl|id|va|ws|opas|ws|nu|nl|id|kwfl|ws|va|ws|opcm|ws|pa|sy|nl|id|kwfl|ws|va|nl|id|va|ws|opas|ws|nu|
True

2: trnsfmcnst_python ------------------------------
class TransformerTagger(nn.Transformer):
    """Container"""

    def __init__(self, ntoken, n_embd, nhead):
        super(TransformerTagger, self).__init__(d_model=n_embd, nhead=nhead)
        # ...
|kwde|ws|cl|brop|mo|sy|cl|brcl|sy|nl|id|st|nl|id|kwfn|ws|fnfr|brop|pa|pu|ws|pa|pu|ws|pa|pu|ws|pa|brcl|sy|nl|id|fnfr|brop|cl|pu|ws|pa|b

## Language-specific rules!


In [223]:
# in java, "System" is a class