In [1]:
import pandas as pd
import sys
import glob
import os

from tqdm import tqdm

sys.path.append("../../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [2]:
# Display settings: never truncate rows, and print floats with 4 decimals.
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", "{:.4f}".format)

In [3]:
# Dataset whose regression results are inspected.
# Other values previously used here: "youtube", "coraal_vs_buckeye".
DATASET = "twitter"

# Selection settings used by the next cell: the p-value cutoff and how many
# of the largest-magnitude coefficients to keep per model. "twitter" gets a
# looser cutoff and a larger top-k than the other datasets.
pvalue, n_largest = (0.05, 50) if DATASET == "twitter" else (0.01, 25)

# Directory resolved from the project's filepaths config for this dataset.
input_path = AbsolutePathBuilder.get_path(
    f"06_{DATASET}_regression",
    filepaths="../../../../config/filepaths.yaml",
)

In [4]:
# Build one wide table: one row per feature ("column"), one column per scoring
# model holding that model's coefficient (NaN where the model did not select
# the feature).
df = None
for SCORE_MODEL in tqdm(
    [
        "perspective",
        "flair",
        "textblob",
        "vader",
        "detoxify_original",
        "detoxify_unbiased",
    ]
):
    coefs_file = os.path.join(input_path, f"{SCORE_MODEL}_score_coefs.csv")
    df_model = pd.read_csv(coefs_file)

    # Keep rows under the p-value cutoff, rank them by absolute coefficient
    # (largest first) and retain only the top `n_largest`.
    significant = df_model[df_model.pvalue < pvalue]
    ranked = significant.sort_values("coef", key=abs, ascending=False)
    top_coefs = ranked.head(n_largest)[["column", "coef"]]

    # Name the coefficient column after the model so the merged frame has one
    # column per model.
    df_model = top_coefs.rename(columns={"coef": SCORE_MODEL})

    # Outer join: a feature selected by any model stays in the table.
    df = df_model if df is None else df.merge(df_model, on="column", how="outer")

100%|██████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 133.83it/s]


In [5]:
# `thresh` is the minimum number of non-null cells a row needs to be kept.
# The "column" key cell counts towards it, so 4 means the feature has a
# coefficient in at least 3 of the model columns.
min_non_null = 4
df_result = (
    df
    .dropna(thresh=min_non_null)
    .reset_index(drop=True)
)

In [6]:
# Display the filtered table: one row per retained feature, one coefficient
# column per scoring model.
df_result

Unnamed: 0,column,perspective,flair,textblob,vader,detoxify_original,detoxify_unbiased
0,LIWC_SWEAR,0.8567,,,,0.9492,1.2792
1,LIWC_SEXUAL,0.4657,-1.3448,-0.3354,,0.5942,0.5609
2,LIWC_NETSPEAK,-0.4496,-2.6379,,,-0.8121,-0.9239
3,LIWC_INFORMAL,0.4386,2.5228,,,0.7988,0.946
4,POS_X,-0.3857,,0.5655,-0.1636,-0.4255,-0.3066
5,AAE_TERMS_COUNT,0.2238,,,0.0934,0.1779,
6,LIWC_NEGATE,0.2075,-0.8331,,,,0.1855
7,POS_DET,0.1623,,,,0.3369,0.3268
8,LIWC_ASSENT,-0.1599,,,,-0.2614,-0.1985
9,LIWC_MALE,-0.1505,,-0.3237,,-0.1968,-0.191


In [7]:
# Names of the retained features as a NumPy array.
# NOTE(review): the saved output of this cell lists 12 features while the saved
# table output lists 10 rows; the outputs look like they come from different
# runs. Re-run the notebook top to bottom to confirm.
df_result["column"].values

array(['LIWC_SWEAR', 'LIWC_SEXUAL', 'LIWC_NETSPEAK', 'LIWC_INFORMAL',
       'POS_X', 'AAE_TERMS_COUNT', 'LIWC_NEGATE', 'POS_DET',
       'LIWC_ASSENT', 'LIWC_MALE', 'race', 'LIWC_FILLER'], dtype=object)