In [None]:
import numpy as np
import sys
import glob
import os
import pandas as pd

from statsmodels.discrete.discrete_model import Logit
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [None]:
# Dataset key (interpolated into the features path key) and the name of the
# column that is passed to the model as the dependent variable.
DATASET = "youtube"
SCORE_MODEL = "perspective_score"

# Seed NumPy's legacy global RNG so any random draws are repeatable.
np.random.seed(seed=1007)

In [None]:
# Resolve the location of this dataset's feature files. The lookup key is built
# from DATASET and handed to the project helper together with the path of the
# filepaths YAML config.
# NOTE(review): presumably this returns a directory path (it is globbed with "*"
# below) — confirm against AbsolutePathBuilder.
data_path = AbsolutePathBuilder.get_path(
    f"05_{DATASET}_features",
    filepaths="../../../config/filepaths.yaml"
)

---
### Concatenate every intermediate file

In [None]:
# List every entry directly under data_path.
#   * sorted(...) makes the row order of the concatenated frame reproducible;
#     glob returns entries in arbitrary, filesystem-dependent order.
#   * os.path.basename strips the directory portably, instead of splitting on
#     "/" (which does not work with backslash-separated paths).
filenames = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(data_path, "*"))
)
if not filenames:
    # pd.concat would otherwise fail with a generic "No objects to concatenate".
    raise FileNotFoundError(f"No feature files found in {data_path!r}")

# Read each intermediate file as CSV, with a progress bar.
dfs = []
for file in tqdm(filenames):
    dfs.append(pd.read_csv(os.path.join(data_path, file)))

# Stack the partial frames under a fresh 0..n-1 index.
df = pd.concat(dfs).reset_index(drop=True)
# Missing values become 0 in every column, not only in the numeric features.
df = df.fillna(0)

---
### Build a list with the columns to use

In [None]:
# Feature columns are picked by a literal, case-sensitive substring of the
# column name (regex=False: the patterns are plain text, not regular expressions).
liwc_cols = df.columns[df.columns.str.contains("LIWC", regex=False)].tolist()
pos_cols = df.columns[df.columns.str.contains("POS", regex=False)].tolist()

# dict.fromkeys drops repeated names while keeping first-seen order. Without it,
# a column whose name contains both "LIWC" and "POS" would be listed twice, and
# df[use_cols] would then hold two identical (perfectly collinear) columns.
use_cols = list(dict.fromkeys(liwc_cols + pos_cols + ["AAE_TERMS_COUNT"]))

In [None]:
# Constant column of ones, later used as the model's intercept regressor.
df["intercept"] = 1

# Encode race as -1 for "White" and +1 for every other value.
# Values that are already -1 are kept at -1 so re-running this cell is a no-op;
# testing only `== "White"` would turn every row into +1 on a second run,
# because the recoded integers no longer equal the string "White".
# NOTE(review): df.fillna(0) earlier means rows with a missing race are coded
# +1 — confirm that this is intended.
race_is_white = (df["race"] == "White") | (df["race"] == -1)
df["race"] = race_is_white.map({True: -1, False: 1})

# Show how many rows fall in each group.
df["race"].value_counts()

---
### Rescale data

In [None]:
# Min-max scale every selected feature column independently: learn the
# per-column range first, then write the transformed values back into df.
scaler = MinMaxScaler().fit(df[use_cols])
df[use_cols] = scaler.transform(df[use_cols])

In [None]:
# Add the race indicator and the constant term to the list of regressors.
# The membership check keeps this cell idempotent: appending unconditionally
# would duplicate both names on a re-run, and df[use_cols] would then contain
# repeated columns.
for extra_col in ("race", "intercept"):
    if extra_col not in use_cols:
        use_cols.append(extra_col)

In [None]:
# Display summary statistics of df as a sanity check before fitting the model.
df.describe()

---
### Build a logistic regression model

In [None]:
# Fit a logit with the SCORE_MODEL column as the dependent variable and the
# columns in use_cols as regressors. statsmodels does not add a constant by
# itself, which is why use_cols carries an explicit "intercept" column.
model = Logit(
    endog=df[SCORE_MODEL],
    exog=df[use_cols],
).fit()

In [None]:
# One row per regressor: its name, its p-value and its fitted coefficient.
df_coefs = pd.DataFrame(
    {
        "column": model.pvalues.index.tolist(),
        "pvalue": model.pvalues.values,
        "coef": model.params.values,
    }
)

# Order by coefficient magnitude (largest first), then display only the rows
# whose p-value is below 0.05.
df_coefs = df_coefs.sort_values("coef", key=lambda coef: coef.abs(), ascending=False)
df_coefs.loc[df_coefs["pvalue"] < 0.05]