In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import os
import matplotlib.pyplot as plt
import yaml
from liwc import Liwc
import spacy
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the project's path configuration; later cells index it by key
# ('utils', 'liwc_dict', the per-dataset feature key) to obtain paths.
# `safe_load` is used because calling `yaml.load` without an explicit Loader
# is deprecated since PyYAML 5.1 and is an error in PyYAML 6; the `with`
# block also makes sure the file handle is closed.
# NOTE(review): the absolute path ties the notebook to one machine — consider
# making it configurable.
with open("/home/luiznery/locus/dissertation/config/filepaths.yaml") as config_file:
    filepaths = yaml.safe_load(config_file)

  filepaths = yaml.load(open("/home/luiznery/locus/dissertation/config/filepaths.yaml"))


In [3]:
import sys
# Extend the module search path with the directory stored under the 'utils'
# key of the path configuration, so the imports below can be resolved.
sys.path.append(filepaths['utils'])
# Helper modules used by the rest of the notebook (presumably located in the
# directory appended above — verify).
import data as data_loader
import liwc_utils
import vocab_tools

In [4]:
import warnings
# Silence every warning from here on.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

#### Dataset

In [27]:
# Dataset to featurize.
# NOTE(review): 'interview' is listed as an option but has no entry in the
# table below, so selecting it raises.
DATASET = 'twitter' # youtube | coraal-buckeye | twitter | interview

# Key into `filepaths` under which the feature table is saved, per dataset.
# `.get` leaves FILEPATH_KEY as None for an unknown dataset.
FILEPATH_KEY = {
    'youtube': '05_youtube_features',
    'coraal-buckeye': '05_buckeye_corall_features',
    'twitter': '05_twitter_features',
}.get(DATASET)
if FILEPATH_KEY is None:
    raise Exception('Dataset does not exists')

# Load the selected dataset and preview its first row.
data = data_loader.load_dataset(DATASET)
data.head(1)

Unnamed: 0,text,perspective_score,flair_score,textblob_score,vader_score,detoxify_original_score,detoxify_unbiased_score,detoxify_multilingual_score,has_swearing,file,group
0,@user wat r u doin boy,0.037538,0.0,0.0,0.0,0.048113,0.014581,0.146005,False,aa_112.csv,aa


#### Race

In [28]:
# Build the 'race' column from 'group'.
if DATASET != 'twitter':
    # Other datasets: 'aa' when the group string contains 'Black', else 'wh'.
    data['race'] = data['group'].apply(
        lambda group_label: 'aa' if 'Black' in group_label else 'wh'
    )
else:
    # Twitter: the group value is copied unchanged.
    data['race'] = data['group']

#### LIWC VARS

In [29]:
# LIWC parser built from the dictionary path configured under 'liwc_dict'.
liwc_parser = Liwc(filepaths['liwc_dict'])

# Remove every run of characters that is not a word character, digit,
# apostrophe or whitespace (e.g. punctuation and '@').
# The pattern is a raw string: in a plain literal, \w, \d and \s are invalid
# string escapes (DeprecationWarning, SyntaxWarning from Python 3.12). The
# compiled regex is unchanged.
data['clean_text'] = data['text'].apply(
    lambda raw_text: re.sub(r"[^\w\d'\s]+", '', raw_text)
)

In [30]:
# Apply the LIWC parser to each cleaned text; the per-row result is stored
# temporarily in 'liwc_count'.
data['liwc_count'] = data['clean_text'].apply(
    liwc_utils.liwc_sentence_parse, args=(liwc_parser,)
)

# Expand each per-row result into its own columns, prefixed with 'liwc_'.
liwc_count_df = (
    data['liwc_count']
    .apply(pd.Series)
    .rename(columns=lambda category: 'liwc_' + category)
)

# Swap the temporary column for the expanded LIWC columns.
data = pd.concat([data.drop(columns=['liwc_count']), liwc_count_df], axis=1)
del liwc_count_df

# TODO: verify.
# NOTE(review): this fills NaN in every column of `data`, not only in the
# liwc_ columns, so missing values in other columns also become 0.
data = data.fillna(0)
data.head(1)

Unnamed: 0,text,perspective_score,flair_score,textblob_score,vader_score,detoxify_original_score,detoxify_unbiased_score,detoxify_multilingual_score,has_swearing,file,...,liwc_body,liwc_money,liwc_hear,liwc_friend,liwc_family,liwc_motion,liwc_sexual,liwc_we,liwc_relig,liwc_number
0,@user wat r u doin boy,0.037538,0.0,0.0,0.0,0.048113,0.014581,0.146005,False,aa_112.csv,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### AAE Terms Count

In [31]:
# Vocabulary passed to vocab_tools.count_terms_in_list: a plain string is a
# single-word term, a list of strings is a multi-word term.
# NOTE(review): 'dime' and 'paper' each appear twice (kept exactly as in the
# original list) — confirm whether they are meant to be counted twice.
terms = [
    # 'bad',
    'bougie', ['busting', 'out'], 'freak', 'fresh', 'homie',
    'jones', 'mondo', 'rednecks', 'bopper', 'dime',

    'honey', ['hot', 'girl'], 'ma', 'shorty', 'wifey',

    'balla', 'cat', 'cuz', 'dawg', 'dog', 'fool', 'homes',
    'kinfolk', 'mark', 'money', 'player', 'playa', 'scrub', 'slick',

    'benjis', 'benjamins', 'benjamin', 'cabbage', 'cheese', 'cream',
    'duckets', 'franklins', 'franklin', 'paper', 'scrilla',

    'bucks', ['dead', 'presidents'], 'dime', 'paper', ['cash', 'money'],
    'dividends', 'dough', 'knot', 'bounce', ['push', 'off'], 'murk',

    ['playa', 'hatin'], 'hatin', 'hating', ['hatin', 'on'], ['balla', 'blockin'],

    'feel', ["we're", 'here'],

    ['push', 'up', 'on'], ['get', 'wit'], ['get', 'with'],
    ['holler', 'at', 'that'], 'sweatin',

    'sweating', ['off', 'the', 'hook'], ['off', 'the', 'chain'],
    'krunk', 'banging', ['too', 'stupid'],

    'wanna', 'gotta', 'finna', 'bouta', 'tryna', 'gonna',
]
len(terms)

75

In [32]:
# Count AAE vocabulary hits per cleaned text.
# Tokens come from the default whitespace split (`str.split()` with no
# pattern) rather than a split on a single ' '. The old form produced empty
# tokens on repeated spaces and left tab/newline-joined words glued together
# (`clean_text` keeps all whitespace), so terms next to them could be missed.
data['aae_terms_count'] = data['clean_text'].str.split().apply(
    lambda tokens: vocab_tools.count_terms_in_list(tokens, terms)
)

#### POS

In [33]:
# Load spaCy's "en_core_web_sm" pipeline; it is handed to the POS-counting
# helper in the next cell.
nlp = spacy.load("en_core_web_sm")

In [34]:
# Run vocab_tools.count_pos_classes on every row, with a tqdm progress bar.
# Note that this uses the raw `text` column, not `clean_text`.
pos = data.text.progress_apply(vocab_tools.count_pos_classes, args=(nlp,) )

100%|██████████| 500/500 [00:03<00:00, 145.89it/s]


In [35]:
# Expand the per-row POS results into one column per class, prefix each with
# 'pos_', and replace missing entries with 0.
pos_count_df = (
    pos
    .apply(pd.Series)
    .rename(columns=lambda pos_class: 'pos_' + pos_class)
    .fillna(0)  # TODO: verify this
)

# Append the POS columns to the main table.
data = pd.concat([data, pos_count_df], axis=1)

### Columns

In [36]:
# LIWC feature columns. A prefix test is used instead of a substring test so
# that only columns that start with 'liwc_' are picked up, not any column
# that merely contains that text.
liwc_cols = [col for col in data.columns if col.startswith('liwc_')]
# Sanity check: the two numbers printed below are expected to agree.
print('liwc categories:',len(liwc_parser.categories.keys()))
print('liwc columns:',len(liwc_cols))


liwc categories: 73
liwc columns: 73


In [37]:
# AAE terms: sanity check that the count column was added (displays True/False).
'aae_terms_count' in data.columns

True

In [38]:
# POS feature columns. A prefix test is used instead of a substring test so
# that a column which merely contains 'pos_' somewhere in its name is not
# mistaken for a POS count.
pos_cols = [col for col in data.columns if col.startswith('pos_')]
print(pos_cols)
print(len(pos_cols))

['pos_ADJ', 'pos_NOUN', 'pos_INTJ', 'pos_ADV', 'pos_VERB', 'pos_ADP', 'pos_DET', 'pos_PUNCT', 'pos_SCONJ', 'pos_PRON', 'pos_AUX', 'pos_PROPN', 'pos_SYM', 'pos_CCONJ', 'pos_PART', 'pos_NUM', 'pos_X', 'pos_SPACE']
18


In [39]:
# Race: sanity check that the column was added (displays True/False).
'race' in data.columns

True

In [40]:
# Target columns: every column whose name contains '_score'.
y_cols = [name for name in data.columns if '_score' in name]
print('Y columns:', y_cols)
print()

# Regression inputs: the AAE term count, all LIWC and POS columns, and race.
features = ['aae_terms_count', *liwc_cols, *pos_cols, 'race']
print('Number of regression features:', len(features))
print()

# Columns that are neither a feature nor a target.
print('Extra columns:', list(set(data.columns) - set(features) - set(y_cols)))

Y columns: ['perspective_score', 'flair_score', 'textblob_score', 'vader_score', 'detoxify_original_score', 'detoxify_unbiased_score', 'detoxify_multilingual_score']

Number of regression features: 93

Extra columns: ['text', 'file', 'has_swearing', 'group', 'clean_text']


## Saving

In [41]:
# Write the full table (features, targets and extra columns) as CSV to the
# path configured for the selected dataset; the index is not written.
data.to_csv(filepaths[FILEPATH_KEY], index=False)