In [1]:
import pandas as pd
import sys
import yaml 
import glob 
import os

from tqdm import tqdm

sys.path.append("../../../utils")
from absolute_path_builder import AbsolutePathBuilder

In [2]:
# Resolve the location of the AAE terms YAML file from the shared filepaths config.
aae_terms_path = AbsolutePathBuilder.get_path(
    "00_aae_terms",
    filepaths="../../../config/filepaths.yaml"
)

# Use a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(aae_terms_path) as terms_file:
    terms = yaml.safe_load(terms_file)["terms"]

In [3]:
# Name of the dataset whose feature files are analysed below.
DATASET = "twitter"

# Look up the feature directory for that dataset in the shared filepaths config.
data_path = AbsolutePathBuilder.get_path(
    f"05_{DATASET}_features", filepaths="../../../config/filepaths.yaml"
)

### Load data into a single dataframe

In [4]:
dfs = []

# glob returns entries in arbitrary, filesystem-dependent order; sorting keeps the
# FILE_ID assigned to each file stable across runs and machines.
# os.path.basename replaces splitting on "/" so Windows path separators also work.
filenames = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(data_path, "*"))
)
for i, file in enumerate(tqdm(filenames)):
    # A dedicated name avoids shadowing the combined `df` built after the loop.
    file_df = pd.read_csv(os.path.join(data_path, file))
    # Unique index to group information of a single file
    file_df["FILE_ID"] = i

    dfs.append(file_df)

100%|██████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 589.78it/s]


In [5]:
# Every per-file frame is read with its own default 0..n-1 index, so a plain concat
# would repeat row labels across files. ignore_index=True builds one unique
# RangeIndex for the combined frame.
df = pd.concat(dfs, ignore_index=True)

### Calculate the number of AAE terms and files for each race

In [6]:
# Whitespace-delimited token count per row. The vectorised .str accessor yields NaN
# for missing text instead of raising AttributeError (as calling .split() on a NaN
# float would); such rows are counted as 0 words.
df["n_words"] = df["text"].str.split().str.len().fillna(0).astype("int64")

In [7]:
# Per-race totals (sentences, words, AAE terms, distinct files) followed by the
# average number of AAE terms per file and per sentence.
df_grouped = (
    df.groupby("race")
    .agg(
        n_sentences=pd.NamedAgg(column="text", aggfunc="count"),
        n_words=pd.NamedAgg(column="n_words", aggfunc="sum"),
        n_aae_terms=pd.NamedAgg(column="AAE_TERMS_COUNT", aggfunc="sum"),
        n_files=pd.NamedAgg(column="FILE_ID", aggfunc="nunique"),
    )
    .assign(
        avg_terms_per_file=lambda totals: totals["n_aae_terms"] / totals["n_files"],
        avg_terms_per_sentence=lambda totals: totals["n_aae_terms"] / totals["n_sentences"],
    )
)

In [8]:
# Show the per-race summary table as the cell output.
df_grouped

Unnamed: 0_level_0,n_sentences,n_words,n_aae_terms,n_files,avg_terms_per_file,avg_terms_per_sentence
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Black,250,3051,372,250,1.488,1.488
White,250,3489,259,250,1.036,1.036


### Calculate the number of AAE terms in each file

In [None]:
# One row per file: total AAE terms in the file and the race label taken from
# the file's first row.
df_grouped = df.groupby("FILE_ID").agg(
    n_aae_terms=pd.NamedAgg(column="AAE_TERMS_COUNT", aggfunc="sum"),
    race=pd.NamedAgg(column="race", aggfunc="first"),
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Explicit figure/axes handles so both distributions are drawn on the same axes.
fig, ax = plt.subplots(figsize=(12, 7))

# sns.distplot is deprecated and removed in recent seaborn releases.
# histplot(kde=True, stat="density") is its documented replacement and draws the
# same density-normalised histogram with a KDE overlay.
for race_label in ("Black", "White"):
    sns.histplot(
        df_grouped.loc[df_grouped.race == race_label, "n_aae_terms"],
        kde=True,
        stat="density",
        label=race_label,
        ax=ax,
    )

ax.set(
    title="Distribution of Term Usage",
    xlabel="n_aae_terms",
    ylabel="Density",
)
ax.legend()
plt.show()