### Quick Description

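This notebook calculates the LIWC categories for each sentence within a document. We count how often each category occurs in a sentence, drop the categories configured for removal, normalize the remaining counts so that each row sums to 1, and save the resulting vector as features.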

In case of doubt, read the [LIWC 2007 Manual](https://www.liwc.net/LIWC2007LanguageManual.pdf) and/or the [LIWC 2015 Manual](https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf).

In [1]:
import sys
import liwc
import glob
import os
import pandas as pd
import yaml

from collections import Counter
from tqdm import tqdm

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [2]:
# Dataset whose feature files are processed; it selects the
# "05_<DATASET>_features" entry of the filepaths registry.
DATASET = "twitter"

# Path-registry file consulted by every lookup below (kept in one place so the
# three lookups cannot drift apart).
FILEPATHS_CONFIG = "../../../config/filepaths.yaml"

# Location of the feature files that will be read and rewritten.
data_path = AbsolutePathBuilder.get_path(
    f"05_{DATASET}_features",
    filepaths=FILEPATHS_CONFIG,
)

# Location of the LIWC dictionary handed to `liwc.load_token_parser`.
liwc_path = AbsolutePathBuilder.get_path(
    "00_liwc_dict",
    filepaths=FILEPATHS_CONFIG,
)

# Feature columns to discard, read from the "cols_to_remove" key of a YAML file.
# The context manager guarantees the file handle is closed once parsing is done
# (a bare `open(...)` passed to `safe_load` leaves it open until garbage
# collection).
with open(
    AbsolutePathBuilder.get_path(
        "00_cols_to_remove",
        filepaths=FILEPATHS_CONFIG,
    )
) as cols_to_remove_file:
    cols_to_remove = yaml.safe_load(cols_to_remove_file)["cols_to_remove"]

In [3]:
def calculate_liwc_features(input_path, liwc_path, cols_to_remove):
    """Append normalized LIWC category features to every CSV in a directory.

    Each file directly under ``input_path`` is read as a CSV with a ``text``
    column. For every row, punctuation is stripped from the text, the text is
    split on whitespace, and each token is looked up with the LIWC parser. The
    per-row category counts become ``LIWC_<CATEGORY>`` columns, the columns
    listed in ``cols_to_remove`` are discarded, and the remaining counts are
    divided by their row sum. The file is then overwritten in place with the
    original columns plus the LIWC columns.

    Args:
        input_path: Directory whose files are processed and rewritten.
        liwc_path: Path of the LIWC dictionary passed to
            ``liwc.load_token_parser``.
        cols_to_remove: Iterable of ``LIWC_<CATEGORY>`` column names to drop.
            Names that do not occur in a given file are ignored. The argument
            itself is never modified.

    Returns:
        None. The result is the rewritten files on disk.

    Raises:
        KeyError: If a file has no ``text`` column.
    """
    parser, _category_names = liwc.load_token_parser(liwc_path)

    # Freeze the removal list once. Narrowing it per file (by rebinding the
    # parameter inside the loop) would let later files keep categories that
    # simply did not occur in an earlier file.
    unwanted_cols = set(cols_to_remove)

    # Skip sub-directories and sort so the processing order is deterministic.
    file_paths = sorted(
        path
        for path in glob.glob(os.path.join(input_path, "*"))
        if os.path.isfile(path)
    )

    for file_path in tqdm(file_paths):
        df = pd.read_csv(file_path)

        # The file is overwritten below, so a re-run would otherwise stack a
        # second copy of every LIWC column next to the first one.
        stale_cols = [col for col in df.columns if str(col).startswith("LIWC_")]
        df = df.drop(columns=stale_cols)

        # Strip punctuation (apostrophes are kept) so that tokens such as
        # "happy!" can match the lexicon; missing text is treated as empty.
        # NOTE(review): the liwc package documentation states the lexicon
        # matches lowercase strings; the text is not lowercased here --
        # confirm it is lowercased upstream.
        clean_text = (
            df["text"]
            .fillna("")
            .astype(str)
            .str.replace(r"[^\w\d'\s]+", "", regex=True)
        )

        # `split()` without arguments also separates on tabs/newlines, which
        # the regex above leaves in place, and never yields empty tokens.
        category_counts = [
            Counter(
                category
                for token in sentence.split()
                for category in parser(token)
            )
            for sentence in clean_text
        ]
        df_liwc = pd.DataFrame(category_counts, index=df.index).fillna(0)

        df_liwc.columns = [f"LIWC_{col.upper()}" for col in df_liwc.columns]
        df_liwc = df_liwc.drop(
            columns=[col for col in df_liwc.columns if col in unwanted_cols]
        )

        # Convert counts to per-row proportions. Rows without any remaining
        # category hit have a zero sum and therefore end up as NaN.
        df_liwc = df_liwc.div(df_liwc.sum(axis=1), axis=0)

        pd.concat([df, df_liwc], axis=1).to_csv(file_path, index=False)

In [4]:
# Compute the LIWC features for every file under `data_path`; the files are
# overwritten in place with the feature columns appended.
calculate_liwc_features(
    input_path=data_path,
    liwc_path=liwc_path,
    cols_to_remove=cols_to_remove,
)

100%|██████████████████████████████████████████████████████████████| 500/500 [00:01<00:00, 330.65it/s]
