In [1]:
import pandas as pd
import sys
import glob
import os

from tqdm import tqdm

sys.path.append("../../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [2]:
# Notebook-wide pandas display settings: never truncate the rows of a displayed
# frame, and render floats with four digits after the decimal point.
def _four_decimals(value):
    """Return `value` formatted with exactly four decimal places."""
    return '%.4f' % value


pd.options.display.max_rows = None
pd.options.display.float_format = _four_decimals

In [3]:
# Dataset whose regression outputs are summarised in this notebook.
# Alternatives that have been used here: "coraal_vs_buckeye", "twitter".
DATASET = "youtube"

# Selection settings used further down: `pvalue` is the significance cut-off
# and `n_largest` the number of coefficients kept per model. Only "twitter"
# gets the looser cut-off and the longer list; every other dataset shares the
# stricter pair.
is_twitter = DATASET == "twitter"
pvalue = 0.05 if is_twitter else 0.01
n_largest = 50 if is_twitter else 25

# Folder from which the per-model coefficient CSVs are read below.
# Presumably resolved from the matching entry in filepaths.yaml -- verify
# against AbsolutePathBuilder.
input_path = AbsolutePathBuilder.get_path(
    f"06_{DATASET}_regression",
    filepaths="../../../../config/filepaths.yaml",
)

In [4]:
# For each scoring model, load its coefficient table and keep the strongest
# statistically significant coefficients; the per-model selections are then
# joined into one wide table keyed by the `column` label.
model_frames = []
for SCORE_MODEL in tqdm(
    [
        "perspective",
        "flair",
        "textblob",
        "vader",
        "detoxify_original",
        "detoxify_unbiased",
    ]
):
    coefs_path = os.path.join(input_path, f"{SCORE_MODEL}_score_coefs.csv")
    all_coefs = pd.read_csv(coefs_path)

    # Keep rows under the p-value cut-off, order them by coefficient magnitude
    # (largest first) and retain at most `n_largest` of them.
    significant = all_coefs[all_coefs.pvalue < pvalue]
    ranked = significant.sort_values("coef", key=abs, ascending=False)
    strongest = ranked.iloc[:n_largest][["column", "coef"]]

    # Name the coefficient column after the model so that every model keeps
    # its own column once the frames are merged.
    df_model = strongest.rename(columns={"coef": SCORE_MODEL})
    model_frames.append(df_model)

# Outer join on the label: an entry selected for only some of the models gets
# NaN in the columns of the others.
df, *other_frames = model_frames
for other_frame in other_frames:
    df = df.merge(other_frame, on="column", how="outer")

100%|██████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 152.53it/s]


In [5]:
# Drop the sparsely populated rows. `thresh` counts every non-null cell of a
# row, and the `column` label is one of them, so 4 keeps the entries that have
# a coefficient for at least 3 of the models (assuming labels are never missing).
# NOTE(review): if "at least 4 models" was intended, this should be 5 -- confirm.
min_non_null_cells = 4
df_frequent = df.dropna(thresh=min_non_null_cells)
df_result = df_frequent.reset_index(drop=True)

In [6]:
# Render the filtered table; no rows are truncated because display.max_rows
# was set to None at the top of the notebook.
df_result

Unnamed: 0,column,perspective,flair,textblob,vader,detoxify_original,detoxify_unbiased
0,LIWC_SWEAR,1.4529,,,0.2137,2.102,2.1819
1,LIWC_DEATH,1.0922,-3.1559,0.3438,1.8787,1.5901,1.3711
2,LIWC_SEXUAL,0.7733,-2.1333,-0.4023,0.0712,0.7051,0.8716
3,LIWC_BODY,0.6038,-2.415,-1.679,0.0782,0.5397,0.4376
4,LIWC_NEGEMO,0.5267,-3.3423,-1.2651,0.9376,0.2902,0.3222
5,LIWC_ANGER,0.3485,,0.2931,0.3694,0.2662,0.4039
6,LIWC_FILLER,-0.2525,-1.4227,,,-0.3458,-0.2979
7,AAE_TERMS_COUNT,0.2488,,,0.0617,,0.1334
8,LIWC_ASSENT,-0.2394,-0.7194,0.279,-0.0938,-0.3079,-0.2824
9,LIWC_NONFLU,-0.2303,,,-0.0686,-0.3301,-0.2987


In [7]:
# List the labels that survived the filtering step.
# NOTE(review): the saved output of this cell lists 30 labels while the table
# rendered by the previous cell shows 10 rows -- confirm whether that table was
# truncated on export or one of the outputs is stale (Restart & Run All).
df_result["column"].values

array(['LIWC_SWEAR', 'LIWC_DEATH', 'LIWC_SEXUAL', 'LIWC_BODY',
       'LIWC_NEGEMO', 'LIWC_ANGER', 'LIWC_FILLER', 'AAE_TERMS_COUNT',
       'LIWC_ASSENT', 'LIWC_NONFLU', 'LIWC_INFORMAL', 'LIWC_SAD',
       'LIWC_INGEST', 'LIWC_RELIG', 'LIWC_NETSPEAK', 'LIWC_SOCIAL',
       'LIWC_WORK', 'LIWC_FEEL', 'LIWC_HEALTH', 'LIWC_FOCUSPAST',
       'LIWC_DISCREP', 'LIWC_AFFILIATION', 'LIWC_NEGATE', 'LIWC_RISK',
       'LIWC_ACHIEV', 'LIWC_POSEMO', 'LIWC_LEISURE', 'LIWC_ANX',
       'LIWC_QUANT', 'LIWC_COGPROC'], dtype=object)