In [1]:
import numpy as np
import sys
import glob
import os
import pandas as pd

from statsmodels.discrete.discrete_model import Logit
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

Available score models (valid values for `SCORE_MODEL`):
 - perspective_score
 - flair_score
 - textblob_score
 - vader_score
 - detoxify_original_score
 - detoxify_unbiased_score
 - detoxify_multilingual_score

In [2]:
# Seed NumPy's global RNG so reruns of the notebook are repeatable.
np.random.seed(1007)

# Dataset to analyse and the score column used as the regression target.
DATASET = "twitter"
SCORE_MODEL = "detoxify_multilingual_score"

# Both directory lookups read the same filepaths.yaml.
FILEPATHS_CONFIG = "../../../config/filepaths.yaml"

input_path = AbsolutePathBuilder.get_path(f"05_{DATASET}_features", filepaths=FILEPATHS_CONFIG)
output_path = AbsolutePathBuilder.get_path(f"06_{DATASET}_regression", filepaths=FILEPATHS_CONFIG)

---
### Concatenate every intermediate file
**Dataset**

In [3]:
# Read every intermediate feature file found directly under `input_path`
# and stack them into a single frame.
#
# `glob` returns matches in arbitrary order, so the names are sorted to keep
# the row order of `df` stable between runs. `os.path.basename` replaces the
# manual split on "/" so the cell does not depend on the POSIX separator.
filenames = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(input_path, "*"))
)

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not filenames:
    raise FileNotFoundError(f"No feature files found in {input_path!r}")

dfs = []
for file in tqdm(filenames):
    dfs.append(pd.read_csv(os.path.join(input_path, file)))

# Discard the per-file indices and renumber the rows 0..n-1.
df = pd.concat(dfs, ignore_index=True)

100%|██████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 884.12it/s]


---
### Build a list with the columns to use

In [4]:
# Pick the feature columns by name: every column whose name contains
# "LIWC_" or "POS_", plus the AAE term count.
liwc_cols = [name for name in df.columns if "LIWC_" in name]
pos_cols = [name for name in df.columns if "POS_" in name]

use_cols = [*liwc_cols, *pos_cols, "AAE_TERMS_COUNT"]

In [5]:
# Encode race as -1 for "White" and +1 for every other value.
#
# The guard keeps the cell idempotent: once the column is numeric it has
# already been encoded, and re-applying the string comparison would silently
# turn every row into +1.
if not pd.api.types.is_numeric_dtype(df["race"]):
    df["race"] = df["race"].eq("White").map({True: -1, False: 1})

df.race.value_counts()

 1    250
-1    250
Name: race, dtype: int64

In [6]:
# One interaction feature per base column: the base value multiplied row-wise
# by the -1/+1 race code, stored under the "RACE_VS_" prefix.
race_code = df["race"]
combined_cols = [f"RACE_VS_{col}" for col in use_cols]
df[combined_cols] = df[use_cols].mul(race_code, axis=0)

# Register the interaction columns as model features (extends the list in place).
use_cols.extend(combined_cols)

In [7]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

In [8]:
# Constant column of ones; it is later passed to the model as the intercept regressor.
df = df.assign(intercept=1)

---
### Rescale data

In [9]:
# Min-max scale every feature column; the scaler is fitted on the full frame.
scaler = MinMaxScaler()
scaler.fit(df[use_cols])
df[use_cols] = scaler.transform(df[use_cols])

In [10]:
# Added only after rescaling, so neither column went through the min-max scaler.
use_cols.extend(["race", "intercept"])

In [11]:
# Summary statistics of the prepared frame, as a sanity check before fitting.
df.describe()

Unnamed: 0,perspective_score,flair_score,textblob_score,vader_score,detoxify_original_score,detoxify_unbiased_score,detoxify_multilingual_score,POS_DET,POS_NOUN,POS_PROPN,...,RACE_VS_POS_PUNCT,RACE_VS_POS_SYM,RACE_VS_POS_SCONJ,RACE_VS_POS_X,RACE_VS_POS_NUM,RACE_VS_POS_CCONJ,RACE_VS_POS_SPACE,RACE_VS_POS_INTJ,RACE_VS_AAE_TERMS_COUNT,intercept
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.256115,0.521471,0.075709,0.101636,0.248371,0.231455,0.226906,0.182,0.253778,0.133667,...,0.390435,0.4785,0.5924,0.636364,0.491333,0.564571,0.251,0.498,0.511,1.0
std,0.270612,0.473903,0.188765,0.146928,0.369404,0.371412,0.372891,0.217303,0.182328,0.194187,...,0.100418,0.104569,0.119544,0.056975,0.083299,0.099248,0.06129,0.108488,0.098226,0.0
min,0.0,0.0,0.0,0.0,0.000524,0.000365,0.000188,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.047753,0.0,0.0,0.0,0.001908,0.001316,0.000816,0.0,0.111111,0.0,...,0.347826,0.5,0.6,0.636364,0.5,0.571429,0.25,0.5,0.5,1.0
50%,0.148221,0.780417,0.0,0.0,0.017683,0.010144,0.004699,0.25,0.222222,0.0,...,0.391304,0.5,0.6,0.636364,0.5,0.571429,0.25,0.5,0.5,1.0
75%,0.389726,0.994382,0.0,0.19025,0.431326,0.283278,0.349252,0.25,0.333333,0.166667,...,0.434783,0.5,0.6,0.636364,0.5,0.571429,0.25,0.5,0.5,1.0
max,0.985082,0.999995,1.0,0.838,0.998765,0.997513,0.998888,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


---
### Build a logistic regression model

In [12]:
# Fit a logit model of the selected score column on the prepared features.
# NOTE: per the describe() output, the score columns are continuous values
# within [0, 1], not binary labels.
logit_spec = Logit(endog=df[SCORE_MODEL], exog=df[use_cols])
model = logit_spec.fit()

Optimization terminated successfully.
         Current function value: 0.102837
         Iterations 14


In [13]:
# One row per regressor with its p-value and fitted coefficient.
df_coefs = pd.DataFrame(
    {
        "column": model.pvalues.index,
        "pvalue": model.pvalues.values,
        "coef": model.params.values,
    }
)

# Rank by absolute coefficient, largest first.
df_coefs = df_coefs.sort_values("coef", key=np.abs, ascending=False)

In [14]:
# Persist the coefficient table for this score model.
# `to_csv` does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(output_path, exist_ok=True)
df_coefs.to_csv(os.path.join(output_path, f"{SCORE_MODEL}_coefs.csv"), index=False)