In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from pympler import asizeof
import numpy as np
# Silence every warning raised after this point, whatever its category.
# NOTE(review): this also hides pandas/matplotlib deprecation and
# chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")


# Matplotlib defaults applied to every figure created after this cell.
ieee_rc_params = {
    # Typography: IEEE uses Times New Roman-like (serif) fonts; text is
    # rendered through LaTeX.
    "text.usetex": True,
    "font.family": "serif",
    # Font sizes (pt): general text and axis labels, then legend and ticks.
    "font.size": 12,
    "axes.labelsize": 12,
    "legend.fontsize": 11,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    # Geometry: figure height is 0.8 times its width.
    "figure.figsize": (5, 5*0.8),
    # DPI used when saving, for high-quality PNG/JPEG output.
    "savefig.dpi": 1200,
}
plt.rcParams.update(ieee_rc_params)

def latex_table(df, caption='CAPTION CAPTION', label='table:LABEL_LABEL', column_format=None):
    """Print `df` as a LaTeX table environment, ready to paste into a paper.

    The frame is rendered without its index and without escaping cell
    contents, then the environment is pinned with ``[H]`` and centred.
    NOTE(review): ``[H]`` placement normally needs the LaTeX ``float``
    package in the document preamble.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to render. Only the columns are emitted (``index=False``).
    caption : str, optional
        Caption text; defaults to the original placeholder.
    label : str, optional
        Label used for cross-references; defaults to the original placeholder.
    column_format : str, optional
        LaTeX column specification. When omitted, one left-aligned column
        (``l``) is generated per column of `df`, so the specification always
        matches the number of cells in each row.

    Returns
    -------
    None
        The LaTeX source is written to standard output.
    """
    if column_format is None:
        # A fixed 'll' only fits two-column frames; wider frames would give
        # a tabular whose rows have more cells than declared columns.
        column_format = 'l' * len(df.columns)

    table_tex = df.to_latex(
        index=False,
        escape=False,
        column_format=column_format,
        caption=caption,
        label=label,
    )
    # Force the float placement and centre the tabular inside the environment.
    table_tex = table_tex.replace("\\begin{table}", "\\begin{table}[H]\n\\centering")
    print(table_tex)

# Two parallel palettes: entry i of colors_plot is a lighter tint of entry i
# of colors_scatter (presumably markers vs. lines/fills in later plots).
colors_scatter = [
    "#ff7f0e",  # orange
    "green",
    "#1f77b4",  # blue
    "red",
]
colors_plot = [
    "#ffbb78",  # light orange
    "lightgreen",
    "#aec7e8",  # light blue
    "#FFB6B6",  # light red
]

---
---
---

In [15]:
def _read_counts(path):
    """Load one results workbook.

    The first sheet column becomes the index and the first two sheet rows
    become a two-level column header (level 0 is used as the language code
    by the cells below).
    """
    return pd.read_excel(path, index_col=0, header=[0, 1])


# Reference counts.
exact_counts = _read_counts("results/exact_counts.xlsx")

# Approximate-counter results.
approximate_counts = _read_counts("results/approximate_counts.xlsx")

# "SS" results for three configurations (10, 70 and 150).
SS10_counts = _read_counts("results/SS10_counts.xlsx")
SS70_counts = _read_counts("results/SS70_counts.xlsx")
SS150_counts = _read_counts("results/SS150_counts.xlsx")

## Memória

In [16]:
# Sizes measured with pympler's asizeof, collected per language; "alg" holds
# the row label of each algorithm in the order the sizes are appended.
table = {key: [] for key in ("alg", "EN", "IT", "FI")}

# Exact counter: measure each language's sub-frame after dropping rows that
# contain NaN.
table["alg"].append("Exact")
for col in exact_counts.columns.levels[0]:
    sub_df = exact_counts[col].dropna()
    size = asizeof.asizeof(sub_df)
    table[col].append(size)

# Label for the approximate-counter sizes, which are appended further down.
table["alg"].append("Approximate")
def get_approximate_counts(idioma):
    """Return the per-word mean approximate count of one language, divided by 16.

    Reads the module-level ``approximate_counts`` frame, keeps the columns of
    language `idioma` (level 0 of the column header), pairs every column
    whose name contains ``word`` with the matching ``count`` column, averages
    the counts of each word and floor-divides the mean by 16.
    NOTE(review): 16 is presumably the scale at which the approximate counter
    stores its values ("that's how it is counted") — confirm.

    Parameters
    ----------
    idioma : str
        Language key present in level 0 of ``approximate_counts.columns``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``mean`` (integer), one row per word, ordered by
        word, with a fresh 0..n-1 index. Words without any valid count are
        omitted.
    """
    lang_df = approximate_counts.xs(key=idioma, level=0, axis=1)

    # Flatten row by row and keep NaN cells, so the i-th word always stays
    # paired with the i-th count even when some cells are missing.
    words = lang_df.filter(like='word').to_numpy().ravel()
    counts = lang_df.filter(like='count').to_numpy().ravel()
    pairs = pd.DataFrame({'word': words, 'count': counts})

    result = pairs.groupby('word')['count'].agg(['mean']).reset_index()
    # Drop words whose mean is NaN *before* the integer conversion: converting
    # NaN to int raises, so dropping afterwards could never take effect.
    result = result.dropna(subset=['mean']).reset_index(drop=True)
    result = result.assign(mean=(result['mean'] // 16).astype('int64'))
    return result
# Approximate counter: size of the per-language table returned by
# get_approximate_counts (its "Approximate" label is already in table["alg"]).
for col in approximate_counts.columns.levels[0]:
    sub_df = get_approximate_counts(col)
    size = asizeof.asizeof(sub_df)
    table[col].append(size)

# The three "SS" result sets are measured exactly like the exact counter:
# per language, after dropping rows that contain NaN.
for alg_name, alg_counts in (
    ("SS10", SS10_counts),
    ("SS70", SS70_counts),
    ("SS150", SS150_counts),
):
    table["alg"].append(alg_name)
    for col in alg_counts.columns.levels[0]:
        sub_df = alg_counts[col].dropna()
        size = asizeof.asizeof(sub_df)
        table[col].append(size)

# Transpose so that languages are rows and algorithms are columns: the first
# transposed row (the "alg" labels) becomes the header and is then removed.
df = pd.DataFrame(table).T
df.columns = df.iloc[0]
df = df[1:]

# NOTE(review): latex_table renders with index=False, so the language labels
# held in this frame's index would not appear in the printed table.
#latex_table(df)

## Erro

- por língua, por palavra:

    - comparar os ranks: palavras nas linhas, algoritmos nas colunas

    - comparar os valores de contagem: palavras nas linhas, algoritmos nas colunas

- língua na linha, algoritmo na coluna, precisão na célula

In [17]:
def aprox_count(idioma):
    """Return the per-word mean approximate count of one language, ranked.

    Reads the module-level ``approximate_counts`` frame, keeps the columns of
    language `idioma` (level 0 of the column header), pairs every column
    whose name contains ``word`` with the matching ``count`` column and
    averages the counts of each word. The mean is truncated to an integer
    and is NOT divided by 16 here (unlike ``get_approximate_counts``).

    Parameters
    ----------
    idioma : str
        Language key present in level 0 of ``approximate_counts.columns``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``mean`` (integer), sorted by ``mean`` in
        descending order with a fresh 0..n-1 index, so ``index + 1`` is the
        rank of the word. Words without any valid count are omitted.
    """
    lang_df = approximate_counts.xs(key=idioma, level=0, axis=1)

    # Flatten row by row and keep NaN cells, so the i-th word always stays
    # paired with the i-th count even when some cells are missing.
    words = lang_df.filter(like='word').to_numpy().ravel()
    counts = lang_df.filter(like='count').to_numpy().ravel()
    pairs = pd.DataFrame({'word': words, 'count': counts})

    result = pairs.groupby('word')['count'].agg(['mean']).reset_index()
    # Drop words whose mean is NaN *before* the integer conversion: converting
    # NaN to int raises, so dropping afterwards could never take effect.
    result = result.dropna(subset=['mean'])
    result = result.assign(mean=result['mean'].astype('int64'))
    result = result.sort_values(by='mean', ascending=False).reset_index(drop=True)
    return result


def _rank_of(table, word):
    """Return index label + 1 of the first row of `table` holding `word`, or 0 if absent."""
    hits = table.index[(table["word"] == word).to_numpy()]
    return int(hits[0]) + 1 if len(hits) else 0


def ranking_table(idioma, top_n=10):
    """Compare the rank each algorithm gives to the first words of the exact table.

    The first `top_n` rows of ``exact_counts[idioma]`` are taken as the
    reference ranking (1, 2, ...). For every other algorithm the rank of a
    word is the index label of its row plus one, which assumes each table is
    already ordered by rank with a 0-based integer index. A word that an
    algorithm does not contain gets rank 0, for every algorithm alike.

    Parameters
    ----------
    idioma : str
        Language key present in level 0 of every counts frame.
    top_n : int, optional
        Number of reference words to compare (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``exact``, ``aprox``, ``ss10``, ``ss70``, ``ss150``.
    """
    # Copy the slice so the new columns are added to an independent frame.
    ranks = exact_counts[idioma][["word"]].iloc[:top_n].copy()
    # Use the real row count so shorter tables do not break the assignment.
    ranks["exact"] = range(1, len(ranks) + 1)

    # The approximate ranking is the same for every word: build it once.
    sources = {
        "aprox": aprox_count(idioma),
        "ss10": SS10_counts[idioma],
        "ss70": SS70_counts[idioma],
        "ss150": SS150_counts[idioma],
    }
    for column, source in sources.items():
        ranks[column] = [_rank_of(source, word) for word in ranks["word"]]
    return ranks


#latex_table(ranking_table("EN"))
#latex_table(ranking_table("IT"))
#latex_table(ranking_table("FI"))

In [18]:
def _first_value(table, word, column):
    """Return `column` of the first row of `table` holding `word`, or 0 if absent."""
    hits = table.loc[(table["word"] == word).to_numpy(), column].tolist()
    return hits[0] if hits else 0


def count_table(idioma, top_n=10):
    """Compare the count each algorithm reports for the first words of the exact table.

    The first `top_n` rows of ``exact_counts[idioma]`` provide the reference
    words and their exact counts. For every other algorithm the first count
    recorded for the word is reported; a word that an algorithm does not
    contain gets 0, for every algorithm alike.

    Parameters
    ----------
    idioma : str
        Language key present in level 0 of every counts frame.
    top_n : int, optional
        Number of reference words to compare (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``exact`` (integer), ``aprox``, ``ss10``, ``ss70``,
        ``ss150``.
    """
    ranks = exact_counts[idioma].iloc[:top_n].rename(columns={'count': 'exact'})
    ranks["exact"] = ranks["exact"].map(int)

    # (output column, source frame, column of the source holding the count).
    # The approximate table is the same for every word: build it once.
    sources = (
        ("aprox", aprox_count(idioma), "mean"),
        ("ss10", SS10_counts[idioma], "count"),
        ("ss70", SS70_counts[idioma], "count"),
        ("ss150", SS150_counts[idioma], "count"),
    )
    for column, source, value_column in sources:
        ranks[column] = [
            _first_value(source, word, value_column) for word in ranks["word"]
        ]
    return ranks

# Build the count comparison for each language. A notebook cell renders only
# the value of its last expression, so just the "FI" table is displayed.
count_table("EN")
count_table("IT")
count_table("FI")

# Absolute and relative errors

# Em vez de mostrar os valores exatos, mostrar o erro absoluto (± diferença) e o erro relativo (%)?

Unnamed: 0,word,exact,aprox,ss10,ss70,ss150
0,pinocchio,443,445,0,443,443
1,sanoa,258,266,0,266,261
2,saada,143,160,0,255,143
3,alkaa,134,104,0,255,136
4,tehdä,134,136,0,255,135
5,marionetti,131,132,1807,256,138
6,poika,81,70,0,0,118
7,huutaa,81,86,0,0,116
8,nähdä,80,71,0,0,118
9,kysyä,77,72,0,0,118


In [19]:
# Mean relative error |estimate - exact| / exact of every algorithm, averaged
# over the rows returned by count_table, one row per language. The "exact"
# column compares the reference with itself.
error_rows = []
for idioma in ["EN", "IT", "FI"]:
    counts_t10 = count_table(idioma)
    row = {"idioma": idioma}
    for key in ("exact", "aprox", "ss10", "ss70", "ss150"):
        deviation = (counts_t10[key] - counts_t10["exact"]).abs()
        row[key] = (deviation / counts_t10["exact"]).mean()
    error_rows.append(row)

df = pd.DataFrame(error_rows).round(3)
#latex_table(df)