In [None]:
from lz_embed.classical import BasicNGramSpectrum, AlphabetInfo, NGramSpectrumEmbedding
import matplotlib.pyplot as plt
import mteb

In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked
# up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution (per the IPython
# autoreload documentation).
%autoreload 2

In [None]:
# Build a BasicNGramSpectrum with n=2 over the 26 lowercase ASCII letters.
lowercase_alphabet = AlphabetInfo(
    valid_character_string="abcdefghijklmnopqrstuvwxyz",
)
model = BasicNGramSpectrum(alpha_info=lowercase_alphabet, n=2)

In [None]:
# Encode one repetitive sentence with the current `model` and plot the first
# (only) row of the result on a wide, short figure.
repetitive_text = "Hello world hello world I am a hello of the world hello hello world world wow wow hello hello"
repetitive_spectrum = model.encode([repetitive_text])[0]

plt.figure(figsize=(12, 3))
plt.plot(repetitive_spectrum)

In [None]:
# Encode a Lorem-ipsum paragraph with the current `model` and plot the first
# (only) row of the result on a wide, short figure.
lorem_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
lorem_spectrum = model.encode([lorem_text])[0]

plt.figure(figsize=(12, 3))
plt.plot(lorem_spectrum)

## Try on some of the MTEB tasks

This does surprisingly well for a character 2-gram model (3-grams are slow and produce prohibitively large embeddings). It performs quite poorly on some tasks, but the overall result is still remarkable for something that is completely model-free.

**Next steps**:
- Do PCA to reduce the dimension. Does it help or hurt accuracy?
- Try the same approach with token counts instead of character n-grams. Surely this has been tried before?

In [None]:
# Rebind `model` to an NGramSpectrumEmbedding with n=2 over the 26 lowercase
# ASCII letters; this is the object handed to the MTEB evaluation cells.
# NOTE(review): lowercase=True presumably lowercases input text before
# counting n-grams — confirm against lz_embed.classical.
embedding_alphabet = AlphabetInfo(
    valid_character_string="abcdefghijklmnopqrstuvwxyz",
)
model = NGramSpectrumEmbedding(
    alpha_info=embedding_alphabet,
    n=2,
    lowercase=True,
)

In [None]:
# Evaluate the current `model` on the MTEB "DBpediaClassification" task and
# print its main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["DBpediaClassification"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)

In [None]:
# Evaluate the current `model` on the MTEB "AILAStatutes" task and print its
# main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["AILAStatutes"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 12.725")

In [None]:
# Evaluate the current `model` on the MTEB "TweetTopicSingleClassification"
# task and print its main score, scaled by 100.
tasks = mteb.get_tasks(tasks=["TweetTopicSingleClassification"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Unlike the other cells, the scores here are read from the "test_2021" split
# key rather than "test".
print("SCORE: ", results[0].scores["test_2021"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 46.21")

In [None]:
# Evaluate the current `model` on the MTEB "PoemSentimentClassification" task
# and print its main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["PoemSentimentClassification"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 35.7")

In [None]:
# Evaluate the current `model` on the MTEB "ArXivHierarchicalClusteringP2P"
# task and print its main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["ArXivHierarchicalClusteringP2P"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 51.5")

In [None]:
# Evaluate the current `model` on the MTEB "ArXivHierarchicalClusteringS2S"
# task and print its main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["ArXivHierarchicalClusteringS2S"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 44.15")

In [None]:
# Evaluate the current `model` on the MTEB "ArguAna" task and print its main
# score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["ArguAna"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 29.18")

In [None]:
# Evaluate the current `model` on the MTEB "WikiCitiesClustering" task and
# print its main score on the "test" split, scaled by 100.
tasks = mteb.get_tasks(tasks=["WikiCitiesClustering"])
evaluation = mteb.MTEB(tasks=tasks)

results = evaluation.run(
    model,
    output_folder="results/test",
    show_progress_bar=True,
    overwrite_results=True,
)
# Only one task was requested, so its result is results[0]; the first score
# entry of the "test" split is reported.
print("SCORE: ", results[0].scores["test"][0]["main_score"] * 100)
# Hard-coded reference number printed for side-by-side comparison
# (presumably the M2V_base_output score on this task — confirm the source).
print("M2V_base_output (potion but worse): 57.81")