In [None]:
import numpy as np
import sys
import glob
import os
import pandas as pd
import yaml

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from statsmodels.discrete.discrete_model import Logit
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [None]:
# Seed NumPy's legacy global RNG so any NumPy-based randomness in this
# notebook is repeatable across runs.
np.random.seed(1007)

# Dataset identifier; it is interpolated into the path keys looked up below.
DATASET = "twitter"

# Single source of truth for the filepaths config handed to AbsolutePathBuilder.
FILEPATHS_CONFIG = "../../../config/filepaths.yaml"

# NOTE(review): AbsolutePathBuilder.get_path is a project helper not visible
# here; presumably it resolves a key from the filepaths YAML to a usable
# path -- confirm against utils/absolute_path_builder.py.
input_path = AbsolutePathBuilder.get_path(
    f"05_{DATASET}_features",
    filepaths=FILEPATHS_CONFIG
)

output_path = AbsolutePathBuilder.get_path(
    f"06_{DATASET}_regression",
    filepaths=FILEPATHS_CONFIG
)

cols_to_remove_path = AbsolutePathBuilder.get_path(
    "00_cols_to_remove",
    filepaths=FILEPATHS_CONFIG
)

# Read the YAML inside a context manager so the file handle is closed as soon
# as parsing finishes instead of being left for the garbage collector.
with open(cols_to_remove_path) as cols_to_remove_file:
    # Value stored under the top-level "cols_to_remove" key of the YAML file.
    cols_to_remove = yaml.safe_load(cols_to_remove_file)["cols_to_remove"]

In [None]:
# Read every entry found directly under `input_path` as a CSV and collect the
# resulting frames in `dfs` (one DataFrame per file).
dfs = []

# glob.glob returns matches in arbitrary order, so sort the names to make the
# order of `dfs` (and of any frame concatenated from it) reproducible.
# os.path.basename is used instead of splitting on "/" so the name extraction
# also works with non-POSIX path separators.
filenames = sorted(
    os.path.basename(file) for file in glob.glob(os.path.join(input_path, "*"))
)
for file in tqdm(filenames):
    dfs.append(pd.read_csv(os.path.join(input_path, file)))

---

In [None]:
# Fit one statsmodels Logit per score column on a shared design matrix and
# write each model's coefficients and p-values to
# `<output_path>/<score>_coefs.csv`.

# --- Design matrix ---------------------------------------------------------
# None of the steps below depend on which score is being regressed, so the
# frame is built once here rather than being rebuilt on every loop iteration.
df = pd.concat(dfs).reset_index(drop=True)
df = df.drop(columns=cols_to_remove)

# Base predictors: every column whose name contains "LIWC_" or "POS_", plus
# the AAE term count.
liwc_cols = df.columns[df.columns.str.contains("LIWC_")].tolist()
pos_cols = df.columns[df.columns.str.contains("POS_")].tolist()

use_cols = liwc_cols + pos_cols + ["AAE_TERMS_COUNT"]

# Recode race numerically: -1 for "White", +1 for any other value.
df["race"] = df.race.apply(lambda race: -1 if race == "White" else 1)

# Interaction terms: each base predictor multiplied row-wise by the race code.
combined_cols = [f"RACE_VS_{col}" for col in use_cols]
df[combined_cols] = df[use_cols].multiply(df["race"], axis="index")

use_cols += combined_cols

# Missing values anywhere in the frame (predictors and scores) become 0.
df = df.fillna(0)
# statsmodels does not add a constant on its own; supply it explicitly.
df["intercept"] = 1

use_cols.append("race")
use_cols.append("intercept")

# Make sure the destination exists so the first to_csv call cannot fail on a
# missing directory.
os.makedirs(output_path, exist_ok=True)

# --- One regression per score ----------------------------------------------
for SCORE_MODEL in tqdm(
    [
        "perspective_score",
        "flair_score",
        "textblob_score",
        "vader_score",
        "detoxify_original_score",
        "detoxify_unbiased_score",
        "detoxify_multilingual_score"
    ]
):
    # NOTE(review): Logit presumably needs the dependent variable to lie in
    # [0, 1]; confirm that every score column listed above is scaled that way.
    model = Logit(df[SCORE_MODEL], df[use_cols]).fit()

    # pvalues and params are paired positionally, one row per fitted term.
    df_coefs = pd.DataFrame(
        {
            "column": model.pvalues.index,
            "pvalue": model.pvalues.values,
            "coef": model.params.values,
        }
    )

    # Largest-magnitude coefficients first.
    df_coefs = df_coefs.sort_values("coef", key=abs, ascending=False)

    df_coefs.to_csv(os.path.join(output_path, f"{SCORE_MODEL}_coefs.csv"), index=False)