In [1]:
import numpy as np
import sys
import glob
import os
import pandas as pd
import warnings
import yaml

from statsmodels.regression.linear_model import GLS
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [2]:
# Disable this for any new modification to make sure everything is ok
warnings.filterwarnings("ignore")

In [3]:
# Seed NumPy's legacy global RNG so any random draw made later is repeatable.
np.random.seed(1007)

# Path registry handed to AbsolutePathBuilder; kept in one place so the three
# lookups below cannot drift apart.
FILEPATHS_CONFIG = "../../../config/filepaths.yaml"

# Directories whose entries are read as per-file feature CSVs in the next cell.
coraal_path = AbsolutePathBuilder.get_path(
    "05_coraal_features",
    filepaths=FILEPATHS_CONFIG,
)
buckeye_path = AbsolutePathBuilder.get_path(
    "05_buckeye_features",
    filepaths=FILEPATHS_CONFIG,
)

# Directory that receives one "<score>_coefs.csv" table per score column.
output_path = AbsolutePathBuilder.get_path(
    "06_coraal_vs_buckeye_regression",
    filepaths=FILEPATHS_CONFIG,
)

# NOTE(review): it is not visible here whether AbsolutePathBuilder creates the
# directory; creating it idempotently keeps the final `to_csv` calls from
# failing on a fresh checkout.
os.makedirs(output_path, exist_ok=True)

In [4]:
def _read_feature_frames(directory):
    """Read every entry directly under ``directory`` with ``pd.read_csv``.

    Parameters
    ----------
    directory : str
        Folder to scan (non-recursive, pattern ``*``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per matched path, in lexicographic path order.

    Notes
    -----
    ``glob`` already yields full paths, so they are passed to ``read_csv``
    as-is instead of being split on "/" and re-joined (which breaks on
    platforms whose separator is not "/"). The listing is sorted because
    ``glob`` gives no ordering guarantee, and the row order of the
    concatenated frame should not depend on the filesystem.
    """
    paths = sorted(glob.glob(os.path.join(directory, "*")))
    return [pd.read_csv(path) for path in tqdm(paths)]


# CoRAAL frames first, Buckeye frames second.
dfs = _read_feature_frames(coraal_path) + _read_feature_frames(buckeye_path)

100%|██████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 208.29it/s]
100%|████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 200.18it/s]


---

In [5]:
# Score columns to model; each one is regressed separately on the same
# design matrix and written to its own coefficient table.
SCORE_MODELS = [
    "perspective_score",
    "flair_score",
    "textblob_score",
    "vader_score",
    "detoxify_original_score",
    "detoxify_unbiased_score",
    "detoxify_multilingual_score",
]

# ---------------------------------------------------------------------------
# Build the design matrix once. Nothing below depends on which score is being
# modelled, so it does not belong inside the per-score loop.
# ---------------------------------------------------------------------------
df = pd.concat(dfs).reset_index(drop=True)

# Predictors: every column whose name contains "LIWC_" or "POS_", plus the
# AAE term count.
liwc_cols = df.columns[df.columns.str.contains("LIWC_")].tolist()
pos_cols = df.columns[df.columns.str.contains("POS_")].tolist()
feature_cols = liwc_cols + pos_cols + ["AAE_TERMS_COUNT"]

# Binary indicator: 0 when race == "White", 1 for every other value
# (missing values included, since NaN != "White").
df["race"] = (df["race"] != "White").astype(int)

# NOTE(review): this fills NaNs in *every* column, the score columns included,
# so rows with a missing score enter the regression with a target of 0 --
# confirm this is intended.
df = df.fillna(0)
df["intercept"] = 1

# Rescale the predictors to [0, 1]; the race indicator and the intercept are
# added afterwards and therefore left untouched.
scaler = MinMaxScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])

use_cols = feature_cols + ["race", "intercept"]

for SCORE_MODEL in tqdm(SCORE_MODELS):
    # NOTE(review): no `sigma` is passed to GLS, which should make this an
    # ordinary least-squares fit -- confirm against the statsmodels docs.
    model = GLS(df[SCORE_MODEL], df[use_cols]).fit()

    # One row per regressor, largest absolute coefficient first.
    df_coefs = pd.DataFrame(
        {
            "column": model.pvalues.index,
            "pvalue": model.pvalues.values,
            "coef": model.params.values,
        }
    ).sort_values("coef", key=abs, ascending=False)

    df_coefs.to_csv(os.path.join(output_path, f"{SCORE_MODEL}_coefs.csv"), index=False)

100%|███████████████████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.14s/it]
