In [6]:
import pandas as pd
import numpy as np

In [2]:
# Relative path to the zipped CSV dataset.
dataset_filepath = "../data/folha_2013_2023_clean_cats.zip"

# Read the whole file into a DataFrame.
folha_filtered = pd.read_csv(dataset_filepath)

# Sanity check: column labels first, then the (rows, columns) shape on its own line.
print(folha_filtered.columns, folha_filtered.shape, sep="\n")

Index(['title', 'text', 'date', 'category', 'subcategory', 'link'], dtype='object')
(354534, 6)


In [8]:
def calculate_metrics(df):
    """Summarise text and title lengths per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the string columns ``'category'``, ``'text'`` and
        ``'title'``.

    Returns
    -------
    pandas.DataFrame
        One row per category, sorted by ``count`` in descending order, with
        the columns:

        * ``category``
        * ``count`` -- number of non-null texts in the category (int)
        * ``proportion`` -- share of the rows with a non-null category that
          belong to this category
        * ``mean_len_title`` -- mean title length in characters, rounded to
          the nearest integer
        * ``25%_len_text``, ``50%_len_text``, ``75%_len_text``,
          ``max_len_text``, ``mean_len_text``, ``min_len_text``,
          ``std_len_text`` -- ``describe()`` statistics of the text length
          in characters

        The index holds each category's position in the sorted list of
        categories; it is deliberately not reset after sorting by ``count``.
    """
    categories = df['category']

    # Character counts are computed once, vectorised, rather than inside a
    # per-group lambda.
    text_len = df['text'].str.len()
    title_len = df['title'].str.len()

    # GroupBy.describe() already returns one row per category and one column
    # per statistic, so no apply/pivot round trip is needed.
    text_stats = (
        text_len.groupby(categories)
        .describe()
        .add_suffix("_len_text")
        .rename(columns={"count_len_text": "count"})
    )

    # Name the series explicitly: value_counts(normalize=True) only labels its
    # result "proportion" from pandas 2.0 on; older releases reuse the name of
    # the counted column, which would break the column selection below.
    proportion = categories.value_counts(normalize=True).rename("proportion")

    # Mean title length, rounded to a whole number of characters.
    mean_len_title = (
        title_len.groupby(categories)
        .mean()
        .round()
        .astype(int)
        .rename("mean_len_title")
    )

    # All three tables are indexed by category, so plain index joins suffice.
    metrics = (
        text_stats
        .join(proportion)
        .join(mean_len_title)
        .reset_index()
    )
    # describe() reports the count as float; expose it as an integer.
    metrics['count'] = metrics['count'].astype(int)

    column_order = [
        "category", "count", "proportion", "mean_len_title", "25%_len_text",
        "50%_len_text", "75%_len_text", "max_len_text",
        "mean_len_text", "min_len_text", "std_len_text"
    ]
    return metrics[column_order].sort_values(by = "count", ascending = False)

In [9]:
# Build the per-category summary for the loaded dataset; as the cell's last
# expression, the returned DataFrame is rendered as the cell output.
calculate_metrics(folha_filtered)

Unnamed: 0,category,proportion,count,mean_len_title,25%_len_text,50%_len_text,75%_len_text,max_len_text,mean_len_text,min_len_text,std_len_text
12,mercado,0.175647,62273,69,2053.0,3126.0,4566.0,102232.0,3606.385223,1.0,2348.604586
15,poder,0.145944,51742,72,2251.0,3389.5,5175.0,215128.0,4009.416296,4.0,3003.991967
11,internacional,0.14593,51737,71,1679.0,2866.0,4485.0,50032.0,3433.154512,95.0,2414.763556
5,cotidiano,0.115569,40973,70,1844.0,2837.0,4291.0,37019.0,3279.664828,85.0,2030.777917
9,esporte,0.100405,35597,68,1431.0,2266.0,3406.0,57574.0,2703.179088,4.0,1890.974193
1,celebridades,0.091794,32544,74,888.0,1336.0,1864.0,20678.0,1571.867072,88.0,1204.813902
17,televisao,0.074188,26302,73,710.0,1231.0,2033.0,34127.0,1612.796936,4.0,1417.049321
8,equilibrio-e-saude,0.038623,13693,74,2763.0,4213.0,5920.0,31998.0,4690.730519,228.0,2801.268556
3,cinema-e-series,0.017211,6102,76,1291.0,1988.0,3587.0,52737.0,3109.221075,172.0,3512.347886
6,educacao,0.014583,5170,70,2455.0,3624.5,5020.0,33713.0,3926.679497,160.0,2092.07335
