To justify the use of q-grams over traditional word-level tokenization, a two-stage experiment was conducted. In the first stage, a classifier was trained using Bag-of-Words (BoW) features derived from standard word-level tokens. In the second stage, the same experimental setup was applied using q-gram tokenization instead.

The results are visualized in a scatter plot, where each point represents the F1-score of a model evaluated on a specific dataset. The x-axis shows performance using word-level tokenization (baseline), while the y-axis shows performance using q-grams. The dashed diagonal line (y = x) marks equal performance for both methods.

Points above the diagonal indicate performance improvement with q-grams, while points below indicate a drop in performance. The overall distribution of points reveals a clear trend favoring q-grams, supporting their effectiveness in handling informal language variations and sparsity in short-text NLP tasks such as tweet classification.

In [None]:
try:
    import microtc
except ImportError:
    !pip install microtc

In [None]:
from microtc import TextModel
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, recall_score, precision_score
from glob import glob
import warnings
warnings.filterwarnings('ignore')
from microtc.utils import tweet_iterator
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

In [None]:
def performance(y, hy, n_bootstrap=500):
    """Score predictions and estimate the bootstrap standard error of each score.

    F1, recall and precision (in that order) are computed with their
    scikit-learn default settings on the full data, and again on
    ``n_bootstrap`` resamples drawn with replacement.

    Args:
        y: True labels, a 1-D array-like.
        hy: Predicted labels, aligned element by element with ``y``.
        n_bootstrap: Number of bootstrap resamples (500 by default).

    Returns:
        A pair ``(medida, error_estandar)``: the three scores on the full
        data, and the standard deviation of each score across the resamples,
        both as lists ordered ``[f1, recall, precision]``.
    """
    funcs = [f1_score, recall_score, precision_score]
    # Work on NumPy arrays so y[s] is always positional: a list cannot be
    # indexed with an index array, and a pandas Series would look the
    # indices up by label instead of by position.
    y = np.asarray(y)
    hy = np.asarray(hy)
    n = y.shape[0]
    B = []
    for _ in range(n_bootstrap):
        # s holds n indices drawn with replacement (repeats allowed); the same
        # indices are applied to y and hy so each pair stays aligned.
        s = np.random.randint(n, size=n)
        B.append([func(y[s], hy[s]) for func in funcs])
    B = np.array(B)
    medida = [func(y, hy) for func in funcs]
    # Spread of each metric over the resamples (one column per metric).
    error_estandar = B.std(axis=0)
    return medida, error_estandar.tolist()

In [None]:
# Disabled scratch cell: the `if False:` guard keeps this tokenizer experiment
# in the notebook without ever executing it.
if False:
    # Model built with token_list=[-1] and q_grams_words=True; the tokens it
    # produces for a sample sentence are printed for inspection.
    text_model_words = TextModel(token_list=[ -1], del_diac=False, num_option='delete', del_punc=False, url_option='delete', del_dup=True, lc=True, hashtag_option=None, q_grams_words=True)
    print(text_model_words.tokenize('¿Hola Mundo Cruel villáno?'))

    # Second model: adds the sizes 2, 3, 4 and a (2, 1) tuple entry.
    # NOTE(review): presumably positive sizes are character q-grams and the
    # tuple is a skip-gram spec — confirm against the microtc documentation.
    token_list = [-1, 2, 3, 4, (2, 1)]
    #token_list = [-1, 2, 3, 4]
    text_model_grams = TextModel(token_list=token_list, del_diac=True, num_option='delete', del_punc=True, url_option='delete', del_dup=False, lc=True, hashtag_option=None, q_grams_words=False)
    print(text_model_grams.tokenize('Hola Mundo cruel villáno?'))

    # Classifier configuration kept for reference only (commented out).
    #linearSVC = LinearSVC(penalty='l1', C=1.0, dual=False, max_iter=10000, random_state = 42)


In [None]:
# Make NLTK's stopword corpus available before it is read below.
# NOTE(review): nltk.download presumably needs network access the first time
# it runs in a fresh environment — confirm for offline use.
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

# Preview of the first ten Spanish stopwords; as the last expression of the
# cell, the notebook shows it as the cell output.
stopwords.words('spanish')[:10]

In [None]:
# Spanish stopwords gathered into a set (duplicates dropped, fast membership tests).
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
stop_words_sp = {*stopwords.words('spanish')}

In [None]:
# Baseline model: token_list holds only -1.
# NOTE(review): presumably -1 requests word-level tokens and positive entries
# request character q-grams of that size — confirm against the microtc docs.
text_model_words = TextModel(
    token_list=[-1],
    q_grams_words=True,
    lc=True,
    del_diac=False,
    del_punc=False,
    del_dup=False,
    num_option=None,
    url_option='delete',
    hashtag_option=None,
)

# q-gram model: the same -1 entry plus the sizes 2, 3 and 4.
# Variant noted by the author: token_list = [2, 3, 4]
# NOTE(review): besides token_list, this model also differs from the baseline
# in del_diac, del_punc, del_dup, num_option and q_grams_words, so the two
# models do not differ in tokenization alone.
token_list = [-1, 2, 3, 4]
text_model_grams = TextModel(
    token_list=token_list,
    q_grams_words=False,
    lc=True,
    del_diac=True,
    del_punc=True,
    del_dup=True,
    num_option='delete',
    url_option='delete',
    hashtag_option=None,
)


In [None]:
# Horizontal bar chart of the per-dataset F1 difference stored in column F1_d.
# NOTE(review): F1_d is presumably (q-gram F1 - baseline F1) — confirm where df is built.
import matplotlib.pyplot as plt
import pandas as pd

# `df` is expected from an earlier cell. When it is missing, fall back to an
# empty frame with the expected columns so the cell still runs (empty plot).
if 'df' not in globals():
    print("DataFrame 'df' not defined. Creating an empty DataFrame for plot generation.")
    df = pd.DataFrame(columns=['Dataset', 'F1_d', 'Precision_d', 'Recall_d', 'F1', 'F1_q'])

# Explicit copy: the edits below must land on dff only, never on df, and must
# not depend on whether pandas hands back a view or a copy for the selection.
dff = df[["Dataset", "F1_d", "Precision_d", "Recall_d"]].copy()
dff["Dataset"] = dff["Dataset"].str.replace("grams/", "", regex=False)
dff = dff.sort_values(by='F1_d', ascending=True)
print(dff)

# Green marks a non-negative F1 difference, orange a negative one.
colors = ['green' if delta >= 0 else 'orange' for delta in dff['F1_d']]

# 1-based position of each dataset in the sorted order, stored as a string.
dff['Acronym'] = [str(rank) for rank in range(1, len(dff) + 1)]

plt.figure(figsize=(6.5, 4.5))
plt.barh(dff["Dataset"], dff["F1_d"], color=colors)

plt.xlabel("Differences in Metric Scores")
plt.ylabel("Dataset")
plt.title("Impact of q-gram tokenization in performance metrics\nacross multiple datasets", fontsize=11)
plt.tight_layout()

plt.show()

print(len(dff))

In [None]:
# Remove every "grams/" substring from the dataset names stored in df.
import pandas as pd

# `df` is expected from an earlier cell. When it is missing, start from an
# empty frame with the expected columns so the cell still runs.
if 'df' not in globals():
    print("DataFrame 'df' not defined. Creating an empty DataFrame.")
    df = pd.DataFrame(columns=['Dataset', 'F1_d', 'Precision_d', 'Recall_d', 'F1', 'F1_q'])

# regex=False: "grams/" is matched as a literal string, not as a pattern.
df['Dataset'] = df['Dataset'].str.replace('grams/', '', regex=False)
print("Processed df['Dataset']")

In [None]:
# Scatter plot of per-dataset F1: baseline without q-grams on the x-axis,
# F1 with q-grams on the y-axis, with the y = x diagonal as reference.
import matplotlib.pyplot as plt
import pandas as pd

# `df` is expected from an earlier cell. When it is missing, fall back to an
# empty frame with the expected columns so the cell still runs (empty plot).
if 'df' not in globals():
    print("DataFrame 'df' not defined. Creating an empty DataFrame for plot generation.")
    df = pd.DataFrame(columns=['Dataset', 'F1_d', 'Precision_d', 'Recall_d', 'F1', 'F1_q'])

try:
    # Resolve every required column up front with [] indexing. A missing
    # column then raises KeyError before any figure is created; attribute
    # access (df.F1) would raise AttributeError and bypass the handler below.
    f1_delta = df['F1_d']
    base_f1 = df['F1']
    qgram_f1 = df['F1_q']
    dataset_names = df['Dataset']
except KeyError as e:
    print(f"KeyError: {e} - This likely means the DataFrame 'df' is empty or missing expected columns. Plotting will be skipped.")
else:
    # Green marks a non-negative F1 difference, orange a negative one.
    colors = ['green' if delta >= 0 else 'orange' for delta in f1_delta]

    # 1-based row position, stored as a string and used as the point label.
    df['Acronym'] = [str(rank) for rank in range(1, len(df) + 1)]
    acronyms = df['Acronym']

    plt.figure(figsize=(10, 7))
    plt.scatter(base_f1, qgram_f1, color=colors)
    for f1_x, f1_y, acronym in zip(base_f1, qgram_f1, acronyms):
        # Small horizontal offset so the label does not sit on the marker.
        plt.text(f1_x + 0.002, f1_y, acronym)

    # Reference diagonal: points above it score higher with q-grams.
    plt.plot([0.5, 0.9], [0.5, 0.9], linestyle='--', color='gray', label='y = x')
    # NOTE(review): rotation=34 was presumably hand-tuned to follow the
    # diagonal at this figure size — re-check if figsize or limits change.
    plt.text(0.62, 0.61, 'Baseline (y = x)', color='gray', fontsize=11, rotation=34)
    plt.xlabel("F1-score without q-grams (baseline)")
    plt.ylabel("F1-score with q-grams")
    plt.title("Effect of q-grams on F1-score across datasets")
    plt.grid(True)

    # Text box that maps each numeric label back to its dataset name.
    labels = [f"{acronym}: {name}" for acronym, name in zip(acronyms, dataset_names)]
    legend_text = "Datasets:\n" + "\n".join(labels)
    props = dict(boxstyle='round', facecolor='white', alpha=0.8)
    plt.gcf().text(0.70, 0.10, legend_text, fontsize=10, bbox=props)
    plt.tight_layout()
    plt.show()