In [None]:
import sys
from importlib import reload
import numpy as np

import polars as pl

sys.path.append("..")

import src.util as util
from src import text_process
from src import _constants as c
from src import data_lint

# Re-execute the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
# NOTE: the last reload call is the final expression of the cell, so its
# return value (the module object) is what the cell displays.
reload(util)
reload(text_process)
# set_plotly_template()

In [None]:
from pathlib import Path

# Load the examples from the NDJSON file through the project loader, report how
# many were read, and convert them into the DataFrame (`df`) that the cells
# below operate on.
dataset_path = Path("../data/dataset.ndjson")
examples = util.load_dataset_parallel(
    dataset_path,
    # Uncomment to pass `filter_lang` (presumably restricts loading to the
    # listed languages -- confirm in src/util).
    # filter_lang=["rust"],
)
n_examples = len(examples)
print(f"loaded {n_examples} examples")

df = util.dataset_to_df(examples)
df.head()

In [None]:
# Re-import the linter module so on-disk edits apply, then lint the loaded frame.
reload(data_lint)
res = data_lint.lint_data_df(df)

# Show the overall lint result, then leave the report for step 1 as the cell's
# displayed output.
display(res)
step_report = res.get_step_report(1)
step_report

## tokens


In [None]:
# Flatten the per-example token lists into a single Series (one entry per token
# occurrence) and hand it to the project's value_counts helper.
# NOTE(review): the result is later joined on "tokens" and sorted by
# "tokens_count", so the helper presumably returns those two columns -- confirm
# in src/util.
all_tokens = df["tokens"].explode()
token_counts = util.value_counts(all_tokens)
token_counts

## which tags can each token have?


In [None]:
# Explode the `tokens` and `tags` lists side by side (assumes every example has
# exactly one tag per token -- TODO confirm), collect the distinct tags observed
# for each token, and attach the per-token row from `token_counts`.
token_to_tags = (
    df.select(pl.col("tokens", "tags").explode())
    .group_by("tokens")
    .agg(pl.col("tags").explode().unique())  # distinct tags seen for this token
    .join(token_counts, on="tokens")
)

# Number of distinct tags a token was seen with.
n_distinct_tags = pl.col("tags").list.len()

# Tokens seen with exactly one tag, ordered by `tokens_count` descending. The
# one-element tag list is unwrapped to a plain value, and tokens whose tag is
# listed in c.DET_TAGS are excluded.
single_tagged = (
    token_to_tags.filter(n_distinct_tags == 1)
    .sort("tokens_count", descending=True)
    .with_columns(pl.col("tags").list.get(0))
    .filter(~pl.col("tags").is_in(c.DET_TAGS))
)

# Every other token (tag-list length different from 1), same ordering.
multi_tagged = token_to_tags.filter(n_distinct_tags != 1).sort(
    "tokens_count", descending=True
)

print(f"found {len(single_tagged)} tokens with a single tag")
display(single_tagged.head(10))

print(f"found {len(multi_tagged)} tokens with multiple tags")
display(multi_tagged.head(10))


## which tokens can each tag have?


In [None]:
# Occurrence counts per tag, from the flattened tag lists.
all_tags = df["tags"].explode()
tag_counts = util.value_counts(all_tags)

# Explode tokens and tags side by side, then collect the distinct tokens that
# were seen under each tag.
tokens_per_tag = (
    df.select(pl.col("tokens", "tags").explode())
    .group_by("tags")
    .agg(pl.col("tokens").explode().unique())  # what tokens can the tag have?
)

# Attach the tag counts, drop the tags listed in c.DET_TAGS, and order the rest
# by how many distinct tokens they cover (fewest first).
tags_to_tokens = (
    tokens_per_tag.join(tag_counts, on="tags")
    .filter(~pl.col("tags").is_in(c.DET_TAGS))
    .sort(pl.col("tokens").list.len())
)
display(tags_to_tokens.head(10))

## specific examples


In [None]:
# Look up the row of the token -> tags table for the literal token "f".
is_token_f = pl.col("tokens") == "f"
token_to_tags.filter(is_token_f)

In [None]:
# Pull the token lists recorded for the tag "kwty" out of the tag -> tokens
# table as plain Python lists (one inner list per matching row).
kwty_rows = tags_to_tokens.filter(pl.col("tags") == "kwty")
kwty_rows["tokens"].to_list()

## search for a token


In [None]:
# Print every example whose token list contains the literal token "implements":
# a header with the example's name, the text obtained by concatenating its
# tokens, and its tag list.
hits = df.filter(pl.col("tokens").list.contains("implements"))
for ex in hits.iter_rows(named=True):
    header = "\n" + ex["name"]
    print(header, "-" * 30)
    joined_tokens = "".join(ex["tokens"])
    print(joined_tokens)
    print(ex["tags"])

## duplicates


In [None]:
# Print a compact preview of `df` so its columns are visible before the
# duplicate checks below group on "tokens", "tags" and collect "id".
df.glimpse()

In [None]:
# Examples with identical token sequences: group on the whole `tokens` list,
# collect the ids in each group together with the group size ("len"), and keep
# only groups with more than one member.
same_tokens = df.group_by("tokens").agg("id", pl.len()).filter(pl.col("len") > 1)
print("exact same tokens:")
display(same_tokens)

# Same check on the tag sequences. The aggregation is computed once and reused
# for both the displayed table and the id list: group_by does not guarantee the
# order of the groups, so recomputing it could yield the ids in a different
# order than the table shows.
same_tags = df.group_by("tags").agg("id", pl.len()).filter(pl.col("len") > 1)
print("exact same tags:")
display(same_tags)

# Flat list of the ids of all examples that share their tag sequence with at
# least one other example.
ids_same_tags = same_tags["id"].explode().to_list()

## finding (tag) sequences


In [None]:
# Tag sequence to search for, written in the same "|"-delimited form as the
# `tagstr` column built below. There is no trailing "|", so the last tag in the
# query is matched as a prefix ("brop...") rather than as a whole tag.
query = "|cl|brop"


# Render every example's tag list as "|t1|t2|...|tn|" so that tag boundaries are
# explicit, then keep the examples whose rendered string contains `query` as a
# literal substring (no regex interpretation).
res = df.with_columns(
    ("|" + pl.col("tags").list.join("|") + "|").alias("tagstr")
).filter(pl.col("tagstr").str.contains(query, literal=True))
print(f"found {len(res)} matches")

# For each match print an index/id header, the concatenated tokens and the
# rendered tag string. The header is built with an f-string so a non-string id
# is formatted instead of raising TypeError. No per-row `query in tagstr` check
# is printed: the filter above already applied that exact substring test, so it
# would always be True.
for i, ex in enumerate(res.iter_rows(named=True)):
    print(f"\n{i}: {ex['id']}", "-" * 30)
    print("".join(ex["tokens"]))
    print(ex["tagstr"])