In [1]:
import numpy as np
import sys
import glob
import os
import pandas as pd

from statsmodels.discrete.discrete_model import Logit
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

Available score models (valid values for `SCORE_MODEL`):
 - perspective_score
 - flair_score
 - textblob_score
 - vader_score
 - detoxify_original_score
 - detoxify_unbiased_score
 - detoxify_multilingual_score

In [2]:
# Fix the NumPy global seed and choose which score column is the regression target.
np.random.seed(1007)
SCORE_MODEL = "detoxify_multilingual_score"

# All three locations are looked up in the same filepaths configuration file.
FILEPATHS_CONFIG = "../../../config/filepaths.yaml"

# Input directories holding the per-file feature tables of each corpus.
coraal_path = AbsolutePathBuilder.get_path("05_coraal_features", filepaths=FILEPATHS_CONFIG)
buckeye_path = AbsolutePathBuilder.get_path("05_buckeye_features", filepaths=FILEPATHS_CONFIG)

# Destination directory for the regression coefficients.
output_path = AbsolutePathBuilder.get_path(
    "06_coraal_vs_buckeye_regression", filepaths=FILEPATHS_CONFIG
)

---
### Concatenate every intermediate file
**CORAAL vs. Buckeye**

In [3]:
# Read every feature file of both corpora and stack them into a single frame.
dfs = []

for features_dir in (coraal_path, buckeye_path):
    # glob returns entries in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the concatenated frame reproducible. The matched
    # paths are read directly, so no "/"-specific splitting is needed.
    filepaths = sorted(glob.glob(os.path.join(features_dir, "*")))
    for filepath in tqdm(filepaths):
        dfs.append(pd.read_csv(filepath))

# Drop the per-file indices in favour of one continuous RangeIndex.
df = pd.concat(dfs).reset_index(drop=True)

100%|██████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 240.40it/s]
100%|████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 220.10it/s]


---
### Build a list with the columns to use

In [4]:
# Select feature columns by the marker substring in their name.
# NOTE: this is a substring match, so re-running this cell after the
# "RACE_VS_" interaction columns exist would pick those up as well.
liwc_cols = [col for col in df.columns if "LIWC_" in col]
pos_cols = [col for col in df.columns if "POS_" in col]

# Full list of base regressors: LIWC features, POS features and the AAE term count.
use_cols = [*liwc_cols, *pos_cols, "AAE_TERMS_COUNT"]

In [5]:
# Encode race as a signed indicator: -1 for "White", 1 for every other value
# (missing values also compare unequal to "White" and therefore become 1).
# NOTE: not idempotent - once encoded, no value equals "White", so running
# this cell a second time would map every row to 1.
is_white = df["race"].eq("White")
df["race"] = is_white.map({True: -1, False: 1})

df["race"].value_counts()

 1    64493
-1    19304
Name: race, dtype: int64

In [6]:
# Interaction terms: each base feature multiplied row-wise by the signed race
# indicator, stored under the same name with a "RACE_VS_" prefix.
combined_cols = ["RACE_VS_" + col for col in use_cols]
df[combined_cols] = df[use_cols].mul(df["race"], axis=0)

# Extend the regressor list in place with the new interaction columns.
use_cols.extend(combined_cols)

In [7]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [8]:
# Constant column of ones, used as the intercept regressor.
df = df.assign(intercept=1)

---
### Rescale data

In [9]:
# Min-max rescale every column currently listed in use_cols (base features and
# their race interactions), fitting the scaler on the full frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit(df[use_cols]).transform(df[use_cols])
df[use_cols] = scaled_values

In [10]:
# Add the race indicator and the intercept to the regressor list. They are
# added only now, after the rescaling cell, so they were not min-max scaled.
use_cols.extend(["race", "intercept"])

In [11]:
# Summary statistics of the numeric columns, shown as the cell output.
summary_stats = df.describe()
summary_stats

Unnamed: 0,perspective_score,flair_score,textblob_score,vader_score,detoxify_original_score,detoxify_unbiased_score,detoxify_multilingual_score,POS_NOUN,POS_PROPN,POS_PUNCT,...,RACE_VS_POS_ADV,RACE_VS_POS_ADJ,RACE_VS_POS_PART,RACE_VS_POS_CCONJ,RACE_VS_POS_SCONJ,RACE_VS_POS_X,RACE_VS_POS_SYM,RACE_VS_POS_SPACE,RACE_VS_AAE_TERMS_COUNT,intercept
count,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,...,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0,83797.0
mean,0.090938,0.415459,0.034443,0.036336,0.039442,0.034809,0.0368,0.055398,0.00951,0.036985,...,0.498503,0.509775,0.449011,0.523871,0.509679,0.669216,0.500006,2.4e-05,0.603434,1.0
std,0.128353,0.460111,0.110672,0.103496,0.150772,0.149402,0.152243,0.070967,0.031105,0.055167,...,0.044938,0.041211,0.037767,0.039959,0.044191,0.023262,0.00457,0.004885,0.022176,0.0
min,0.0,0.0,0.0,0.0,0.000506,0.000286,0.000162,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.026722,0.0,0.0,0.0,0.000822,0.000485,0.000545,0.0,0.0,0.0,...,0.487179,0.5,0.44,0.515152,0.5,0.666667,0.5,0.0,0.6,1.0
50%,0.044405,0.0,0.0,0.0,0.001268,0.000753,0.000866,0.03125,0.0,0.026316,...,0.487179,0.5,0.44,0.515152,0.5,0.666667,0.5,0.0,0.6,1.0
75%,0.094948,0.969123,0.0,0.0,0.005014,0.002369,0.0027,0.0625,0.0,0.052632,...,0.512821,0.535714,0.44,0.545455,0.5,0.666667,0.5,0.0,0.6,1.0
max,0.986308,0.999999,1.0,1.0,0.999107,0.998033,0.999007,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


---
### Build a logistic regression model

In [12]:
# Fit a logit model of the selected score on the regressors in use_cols
# (which already contains the explicit "intercept" column).
# NOTE(review): the target looks like a continuous score in [0, 1] rather than
# a 0/1 label - confirm the p-values of this fit are interpreted accordingly.
endog = df[SCORE_MODEL]
exog = df[use_cols]
model = Logit(endog, exog).fit()

Optimization terminated successfully.
         Current function value: 0.051178
         Iterations 11


In [13]:
# One row per regressor: its name, p-value and fitted coefficient.
df_coefs = pd.DataFrame(
    {
        "column": model.pvalues.index,
        "pvalue": model.pvalues.values,
        "coef": model.params.values,
    }
)

# Order the rows by coefficient magnitude, largest absolute value first.
df_coefs = df_coefs.sort_values("coef", key=abs, ascending=False)

In [14]:
# Persist the coefficient table, one file per score model. The directory is
# created first because to_csv does not create missing parent directories.
os.makedirs(output_path, exist_ok=True)
df_coefs.to_csv(os.path.join(output_path, f"{SCORE_MODEL}_coefs.csv"), index=False)