In [None]:
from lz_embed.classical import BasicNGramSpectrum, AlphabetInfo, NGramSpectrumEmbedding
import matplotlib.pyplot as plt
import mteb
from sys import stdout
import numpy as np

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Build a BasicNGramSpectrum with n=2 over the 26 lowercase Latin letters.
lowercase_alphabet = AlphabetInfo(valid_character_string="abcdefghijklmnopqrstuvwxyz")
model = BasicNGramSpectrum(alpha_info=lowercase_alphabet, n=2)

In [None]:
# Display the model's fixed_len attribute
# (presumably the length of each embedding vector — TODO confirm in lz_embed.classical).
model.fixed_len

In [None]:
# Plot the model's encoding of a short, highly repetitive sentence.
plt.figure(figsize=(12,3))
hello_text = "Hello world hello world I am a hello of the world hello hello world world wow wow hello hello"
# encode() takes a list of texts; keep the first (only) result.
hello_embedding = model.encode([hello_text])[0]
plt.plot(hello_embedding)

In [None]:
# Plot the model's encoding of a longer passage (the standard "Lorem ipsum" filler text).
plt.figure(figsize=(12,3))
lorem_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
# encode() takes a list of texts; keep the first (only) result.
lorem_embedding = model.encode([lorem_text])[0]
plt.plot(lorem_embedding)

## Try on some of the MTEB tasks

The 2-gram model does surprisingly well (the 3-gram variant is slow and its embeddings are prohibitively large). It performs poorly on some tasks, but the overall result is still remarkable for an approach that is completely model-free.

Results are recorded in [this spreadsheet](https://docs.google.com/spreadsheets/d/11CBoRJ33tmXQqd50IVaWbFn915fD90Het3DKSTW8F78/edit?usp=sharing).

**Next steps**:
- Apply PCA to reduce the embedding dimension. Does it help or hurt accuracy?
- Try the same approach with token counts instead of character n-grams. Surely this has been tried before?

In [None]:
# Rebind `model` to an NGramSpectrumEmbedding (n=2, lowercase=True) over the
# 26 lowercase Latin letters; this is the object handed to MTEB below.
lowercase_alphabet = AlphabetInfo(valid_character_string="abcdefghijklmnopqrstuvwxyz")
model = NGramSpectrumEmbedding(alpha_info=lowercase_alphabet, n=2, lowercase=True)

In [None]:
# Alternative (disabled): read the task names from tasks.txt, one per line.
# with open("tasks.txt", "r") as f:
#     task_list = [x.strip() for x in f.readlines()]
# Hand-picked task names; each is passed to mteb.get_tasks in the evaluation loop.
task_list = [
    "ArXivHierarchicalClusteringP2P",
    "FinancialPhrasebankClassification",
    "PoemSentimentClassification"
]

In [None]:
# Evaluate `model` on each task separately and collect one score per task.
outputs = []

for task in task_list:
    # Resolve the single task name and wrap it in an MTEB evaluation.
    tasks = mteb.get_tasks(tasks=[task])
    evaluation = mteb.MTEB(tasks=tasks)
    results = evaluation.run(
        model,
        output_folder=f"results/test",
        show_progress_bar=False,
        overwrite_results=True,
    )

    # results[0] is presumably the result for the one requested task — confirm
    # against the mteb version in use.
    task_scores = results[0].scores
    # For every key in `scores`, take the first entry's main_score as a percentage.
    main_scores_pct = [task_scores[key][0]["main_score"] * 100 for key in task_scores]
    score = np.mean(main_scores_pct)

    outputs.append(score)
    print("SCORE: ", score)
    # Flush so the score shows up immediately during long runs.
    stdout.flush()

In [None]:
# Emit the per-task scores as one tab-separated line (easy to paste into a
# spreadsheet). `outputs` holds numeric values from np.mean, and str.join
# only accepts strings, so each value is converted explicitly.
print("\t".join(str(value) for value in outputs))