In [40]:
import pandas as pd
from transformers import AutoTokenizer

In [41]:
# Module-level tokenizer shared by calculate_metrics for its token counts.
tokenizer = AutoTokenizer.from_pretrained(
    "melll-uff/bertweetbr",
)

In [42]:
def load_and_combine_csvs(csv_files):
    """Read each CSV in ``csv_files`` and stack them into a single DataFrame.

    Args:
        csv_files: Iterable of paths (or anything ``pd.read_csv`` accepts).

    Returns:
        One DataFrame holding the rows of every file, in the order the files
        were given, re-indexed from 0 (the per-file indexes are discarded).
    """
    frames = []
    for path in csv_files:
        frames.append(pd.read_csv(path))
    return pd.concat(frames, ignore_index=True)

In [43]:
def calculate_metrics(sentences, tokenize=None):
    """Compute average length statistics over a collection of sentences.

    Missing entries (``None`` / NaN, as detected by ``pd.isna``) are ignored
    entirely: they contribute neither to the totals nor to the number of
    sentences the averages are divided by.

    Args:
        sentences: Iterable of strings; may contain missing values. Any
            iterable works (list, Series, generator) since it is only
            iterated once.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to the ``tokenize`` method of the module-level
            ``tokenizer``.

    Returns:
        Tuple ``(avg_chars, avg_words, avg_tokens)`` where words are
        whitespace-separated chunks. Returns ``(0.0, 0.0, 0.0)`` when there
        is no non-missing sentence, instead of dividing by zero.
    """
    if tokenize is None:
        # Resolved at call time so the notebook-level tokenizer is only
        # required when no explicit tokenize callable is supplied.
        tokenize = tokenizer.tokenize

    total_chars = 0
    total_words = 0
    total_tokens = 0
    num_sentences = 0  # counts only the sentences that were actually measured

    for sentence in sentences:
        if pd.isna(sentence):
            continue
        num_sentences += 1
        total_chars += len(sentence)
        total_words += len(sentence.split())
        total_tokens += len(tokenize(sentence))

    if num_sentences == 0:
        return 0.0, 0.0, 0.0

    avg_chars = total_chars / num_sentences
    avg_words = total_words / num_sentences
    avg_tokens = total_tokens / num_sentences

    return avg_chars, avg_words, avg_tokens


### HateBR

In [44]:
# HateBR CSV files to combine. Forward slashes keep these relative paths
# valid on Windows as well as Linux/macOS (backslashes only work on Windows).
csv_files = [
    "hatebr/hatebr_train_umb_aux.csv",
    "hatebr/hatebr_val.csv",
    "hatebr/hatebr_test.csv",
]

In [45]:
# Stack all files listed in csv_files into a single frame.
combined_df = load_and_combine_csvs(csv_files=csv_files)

In [46]:
# Keep only the rows that have a value in the 'text' column, as a plain list.
texts = (
    combined_df['text']
    .dropna()
    .tolist()
)

In [48]:
# Per-text averages: characters, whitespace-separated words, tokenizer tokens.
avg_chars, avg_words, avg_tokens = calculate_metrics(sentences=texts)
print(avg_chars, avg_words, avg_tokens)

85.65564575645756 14.593210332103322 15.8309963099631


### OLIDBR

In [49]:
# OLID-BR CSV files to combine. Forward slashes keep these relative paths
# valid on Windows as well as Linux/macOS (backslashes only work on Windows).
csv_files = [
    "olidbr/olidbr_train_balanced.csv",
    "olidbr/olidbr_val.csv",
    "olidbr/olidbr_test.csv",
]

In [50]:
# Stack all files listed in csv_files into a single frame.
combined_df = load_and_combine_csvs(csv_files=csv_files)

In [51]:
# Keep only the rows that have a value in the 'text' column, as a plain list.
texts = (
    combined_df['text']
    .dropna()
    .tolist()
)

In [52]:
# Per-text averages: characters, whitespace-separated words, tokenizer tokens.
avg_chars, avg_words, avg_tokens = calculate_metrics(sentences=texts)
print(avg_chars, avg_words, avg_tokens)

134.16826086956522 26.199565217391303 28.39108695652174


### ToLD-BR

In [53]:
# ToLD-BR CSV files to combine. Forward slashes keep these relative paths
# valid on Windows as well as Linux/macOS (backslashes only work on Windows).
csv_files = [
    "toldbr/toldbr_train_balanced.csv",
    "toldbr/toldbr_val.csv",
    "toldbr/toldbr_test.csv",
]

In [54]:
# Stack all files listed in csv_files into a single frame.
combined_df = load_and_combine_csvs(csv_files=csv_files)

In [55]:
# Keep only the rows that have a value in the 'text' column, as a plain list.
texts = (
    combined_df['text']
    .dropna()
    .tolist()
)

In [56]:
# Per-text averages: characters, whitespace-separated words, tokenizer tokens.
avg_chars, avg_words, avg_tokens = calculate_metrics(sentences=texts)
print(avg_chars, avg_words, avg_tokens)

90.93240442192538 17.395555043758637 18.780285582680794
