# Imports, styles, etc

In [4]:
import numpy as np
import pandas as pd
from liwc import Liwc
from matplotlib import pyplot as plt
import seaborn as sns
import json
from tqdm import tqdm
import scipy

import scipy.stats as stats

import sys
sys.path.append('../utils/')
from data import *
from liwc import *
from analisys import *

In [5]:
# Font sizes match the ones used in the paper template.
plt.rcParams.update({
    'axes.labelsize': 10,
    'axes.titlesize': 10,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'font.family': 'serif',
    # 'text.usetex': True,
    'lines.linewidth': 2,
})

In [6]:
plt.style.use('tableau-colorblind10') # use a colorblind-friendly style
plt.ion() # switch matplotlib to interactive mode

---

# Reading and processing data

In [7]:
def liwc_sentence_parse(s,liwc):
    """Split `s` on single spaces and return `liwc.parse` applied to the tokens."""
    tokens = s.split(' ')
    return liwc.parse(tokens)

def get_key_counts(counter,key):
    """Return `counter[key]`, or 0 when `key` is not contained in `counter`."""
    return counter[key] if key in counter else 0

LIWC_FILEPATH = '../data/LIWC2015_English_Flat.dic'
liwc = Liwc(LIWC_FILEPATH)

df = get_movies_dataset()
# LIWC category counts for every cleaned text.
df['liwc_count'] = df['text_clean'].apply(lambda text: liwc_sentence_parse(text, liwc))
# Number of ' __ ' entries recorded in `tags_count`
# (presumably masked curse words -- TODO confirm against the data utils).
df['bad_words_count'] = df['tags_count'].apply(lambda tags: get_key_counts(tags, ' __ '))
# Keep only rows without such entries, then drop the helper columns.
df = df.loc[df['bad_words_count'] == 0].drop(columns=['tags_count', 'bad_words_count'])

In [8]:
# Read every scored file and tag its rows with the name of the file they came from.
SCORES_DIR = '../data/03_scored/with_curse_words'
score_frames = []
for file in os.listdir(SCORES_DIR):
    tmp = pd.read_csv(os.path.join(SCORES_DIR, file))
    tmp['id'] = file
    score_frames.append(tmp)
# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied the
# whole frame on every iteration. Row order is unchanged (directory listing order).
scores = pd.concat(score_frames, ignore_index=True)
# Number of space-separated tokens in each scored text.
scores['len'] = scores.text.str.split(' ').str.len()

# Per-file score: summed `score` divided by summed token count, highest first.
total_score = (scores.groupby('id')['score'].sum()/scores.groupby('id')['len'].sum()).sort_values()[::-1]

FileNotFoundError: [Errno 2] No such file or directory: '../data/03_scored/with_curse_words'

In [None]:
# Align `total_score` to the rows through the `id` index, then restore `id` as a column.
df = df.set_index('id').assign(score=total_score).reset_index()
# Drop rows for which no score was found.
df = df.loc[df['score'].notna()]

In [None]:
# Fresh, unfiltered copy of the dataset with the ' __ ' tag count attached.
d = get_movies_dataset()
d['bad_words_count'] = d['tags_count'].apply(lambda tags: get_key_counts(tags, ' __ '))
d

In [None]:
# Non-null counts per group, restricted to rows whose ' __ ' tag count is zero.
d.loc[d['bad_words_count'] == 0].groupby('group').count()

---

# Toxic phrases examples

In [None]:
# Attach movie/age/group metadata to every scored phrase via the shared `id` column.
phrases = pd.merge(scores, df[['movie', 'age', 'group', 'id']], on='id')

In [None]:
# Hand-picked row positions used as examples (positions depend on the load order of `scores`).
example_rows = [31463, 30591, 38070, 34814]
phrases[['text', 'score', 'movie', 'group']].iloc[example_rows]

In [None]:
# severe_toxicity

In [None]:
# print(scores.iloc[[31463,30591,38070,34814]][['text','score','movie','group']].to_latex(index=False,))

---

# Processing data with LIWC

In [None]:
# LIWC data

LIWC_FILEPATH = '../data/LIWC2015_English_Flat.dic'
liwc = Liwc(LIWC_FILEPATH)

liwc_categories = list(liwc.categories.values())

# One row per text (indexed by `id`), one column per LIWC category seen in the data.
liwc_df = pd.DataFrame.from_records(df['liwc_count'], index=df['id'])
liwc_df = liwc_df.fillna(0)

# Normalising: each row is divided by its total number of category hits.
# TODO (translated from the original note): is this the best normalisation?
# Dividing by the number of words may be a better choice.
# NOTE: rows with zero category hits become NaN (0 / 0).
category_counts = liwc_df[liwc_categories]
liwc_df_norm = category_counts.div(category_counts.sum(axis=1), axis=0)

liwc_df_norm = liwc_df_norm.reset_index()
# Assign by position, not by index label: `liwc_df_norm` now has a fresh 0..n-1 index,
# whereas `df` keeps the labels left over from its earlier filtering, so label-based
# assignment would attach the wrong group/age (or NaN) to most rows.
liwc_df_norm['group'] = df['group'].to_numpy()
liwc_df_norm['age'] = df['age'].to_numpy()

---

## Kruskal-Wallis Test

In [None]:
# Replace every LIWC category column by the rank of its values (each column ranked on its own).
category_columns = list(liwc.categories.values())
liwc_df_rank = liwc_df_norm.copy()
liwc_df_rank[category_columns] = liwc_df_rank[category_columns].rank()

In [None]:
old_experiments = []

# Kruskal-Wallis H-test across the four groups, run once per LIWC category.
group_names = ('White Man', 'White Woman', 'Black Man', 'Black Woman')
records = []
for feature in liwc.categories.values():
    samples = [
        liwc_df_rank.loc[liwc_df_rank['group'] == name, feature]
        for name in group_names
    ]
    result = stats.kruskal(*samples)
    records.append({'feature': feature, 'statistic': result[0], 'pvalue': result[1]})
experiments = pd.DataFrame(records)

## Relevant features

https://www.liwc.net/LIWC2007LanguageManual.pdf

https://repositories.lib.utexas.edu/bitstream/handle/2152/31333/LIWC2015_LanguageManual.pdf

In [None]:
# Categories whose Kruskal-Wallis p-value is below 0.05, most significant first.
experiments.loc[experiments['pvalue'] < 0.05].sort_values('pvalue')

In [None]:
sns.displot(liwc_df_rank, x="cogproc", hue="group", kind="ecdf",complementary=True) # complementary ECDF of the "cogproc" column of liwc_df_rank, one curve per group
plt.savefig('cogproc.pdf') # save the current figure as cogproc.pdf

## Black people use more informalities

In [None]:
sns.displot(liwc_df_rank, x="informal", hue="group", kind="ecdf",complementary=True) # complementary ECDF of the "informal" column of liwc_df_rank, one curve per group
plt.savefig('informal.pdf') # save the current figure as informal.pdf

In [None]:
sns.displot(liwc_df_rank, x="netspeak", hue="group", kind="ecdf",complementary=True, legend=False) # complementary ECDF of the "netspeak" column of liwc_df_rank, one curve per group, legend hidden
plt.savefig('netspeak.pdf') # save the current figure as netspeak.pdf

In [None]:
sns.displot(liwc_df_rank, x="assent", hue="group", kind="ecdf",complementary=True, legend=False) # complementary ECDF of the "assent" column of liwc_df_rank, one curve per group, legend hidden
plt.savefig('assent.pdf') # save the current figure as assent.pdf

In [None]:
sns.displot(liwc_df_rank, x="filler", hue="group", kind="ecdf",complementary=True, legend=False) # complementary ECDF of the "filler" column of liwc_df_rank, one curve per group, legend hidden
plt.savefig('filler.pdf') # save the current figure as filler.pdf

## Other stats

### swear - fuck, damn, shit

In [None]:
sns.displot(liwc_df_rank, x="swear", hue="group", kind="ecdf",complementary=True, legend=False) # LIWC swear-word category: complementary ECDF of the "swear" column, one curve per group, legend hidden
plt.savefig('swear.pdf') # save the current figure as swear.pdf

### prep - prepositions - to, with, above 

In [None]:
sns.displot(liwc_df_rank, x="prep", hue="group", kind="ecdf",complementary=True) # complementary ECDF of the "prep" column of liwc_df_rank, one curve per group
plt.savefig('prep.pdf') # save the current figure as prep.pdf

### anx - worried, fearful

In [None]:
sns.displot(liwc_df_rank, x="anx", hue="group", kind="ecdf",complementary=True) # complementary ECDF of the "anx" column of liwc_df_rank, one curve per group

### percept - Perceptual processes - look, heard, feeling 

In [9]:
sns.displot(liwc_df_rank, x="percept", hue="group", kind="ecdf",complementary=True) # complementary ECDF of the "percept" column of liwc_df_rank, one curve per group

NameError: name 'liwc_df_rank' is not defined

### conj - Conjunctions - and, but, whereas

In [10]:
sns.displot(liwc_df_rank, x="conj", hue="group", kind="ecdf",complementary=True, legend=False) # complementary ECDF of the "conj" column of liwc_df_rank, one curve per group, legend hidden
plt.savefig('conj.pdf') # save the current figure as conj.pdf

NameError: name 'liwc_df_rank' is not defined

### function - Total function words - it, to, no, very

In [11]:
sns.displot(liwc_df_rank, x="function", hue="group", kind="ecdf",complementary=True, legend=False) # complementary ECDF of the "function" column of liwc_df_rank, one curve per group, legend hidden
plt.savefig('function.pdf') # save the current figure as function.pdf

NameError: name 'liwc_df_rank' is not defined

---
# radar plot

In [None]:
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections.polar import PolarAxes
from matplotlib.projections import register_projection
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D


def radar_factory(num_vars, frame='circle'):
    """
    Create a radar chart with `num_vars` axes.

    This function creates a RadarAxes projection and registers it under the
    projection name 'radar'.

    Parameters
    ----------
    num_vars : int
        Number of variables for radar chart.
    frame : {'circle', 'polygon'}
        Shape of frame surrounding axes.

    Returns
    -------
    numpy.ndarray
        The `num_vars` evenly spaced axis angles (radians, 2*pi excluded).

    Raises
    ------
    ValueError
        Raised when an axes is built with a `frame` other than 'circle' or
        'polygon'.
    """
    # calculate evenly-spaced axis angles
    theta = np.linspace(0, 2*np.pi, num_vars, endpoint=False)

    class RadarAxes(PolarAxes):

        name = 'radar'
        # use 1 line segment to connect specified points
        RESOLUTION = 1

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # rotate plot such that the first axis is at the top
            self.set_theta_zero_location('N')

        def fill(self, *args, closed=True, **kwargs):
            """Override fill so that line is closed by default"""
            return super().fill(closed=closed, *args, **kwargs)

        def plot(self, *args, **kwargs):
            """Override plot so that line is closed by default"""
            lines = super().plot(*args, **kwargs)
            for line in lines:
                self._close_line(line)
            # Hand the lines back, as the base-class plot does, so callers
            # can keep using the return value.
            return lines

        def _close_line(self, line):
            x, y = line.get_data()
            # FIXME: markers at x[0], y[0] get doubled-up
            if x[0] != x[-1]:
                x = np.append(x, x[0])
                y = np.append(y, y[0])
                line.set_data(x, y)

        def set_varlabels(self, labels):
            """Label the spokes with `labels`, one per axis angle."""
            self.set_thetagrids(np.degrees(theta), labels)

        def _gen_axes_patch(self):
            # The Axes patch must be centered at (0.5, 0.5) and of radius 0.5
            # in axes coordinates.
            if frame == 'circle':
                return Circle((0.5, 0.5), 0.5)
            elif frame == 'polygon':
                return RegularPolygon((0.5, 0.5), num_vars,
                                      radius=.5, edgecolor="k")
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

        def _gen_axes_spines(self):
            if frame == 'circle':
                return super()._gen_axes_spines()
            elif frame == 'polygon':
                # spine_type must be 'left'/'right'/'top'/'bottom'/'circle'.
                spine = Spine(axes=self,
                              spine_type='circle',
                              path=Path.unit_regular_polygon(num_vars))
                # unit_regular_polygon gives a polygon of radius 1 centered at
                # (0, 0) but we want a polygon of radius 0.5 centered at (0.5,
                # 0.5) in axes coordinates.
                spine.set_transform(Affine2D().scale(.5).translate(.5, .5)
                                    + self.transAxes)
                return {'polar': spine}
            else:
                raise ValueError("Unknown value for 'frame': %s" % frame)

    register_projection(RadarAxes)
    return theta


def example_data():
    """Return the hard-coded example data set for the radar chart.

    The returned list starts with the nine spoke labels, followed by four
    `(title, rows)` tuples; each `rows` is a list of five lists holding
    nine floats (one value per spoke label).
    """
    # The following data is from the Denver Aerosol Sources and Health study.
    # See doi:10.1016/j.atmosenv.2008.12.017
    #
    # The data are pollution source profile estimates for five modeled
    # pollution sources (e.g., cars, wood-burning, etc) that emit 7-9 chemical
    # species. The radar charts are experimented with here to see if we can
    # nicely visualize how the modeled source profiles change across four
    # scenarios:
    #  1) No gas-phase species present, just seven particulate counts on
    #     Sulfate
    #     Nitrate
    #     Elemental Carbon (EC)
    #     Organic Carbon fraction 1 (OC)
    #     Organic Carbon fraction 2 (OC2)
    #     Organic Carbon fraction 3 (OC3)
    #     Pyrolized Organic Carbon (OP)
    #  2)Inclusion of gas-phase specie carbon monoxide (CO)
    #  3)Inclusion of gas-phase specie ozone (O3).
    #  4)Inclusion of both gas-phase species is present...
    data = [
        ['Sulfate', 'Nitrate', 'EC', 'OC1', 'OC2', 'OC3', 'OP', 'CO', 'O3'],
        ('Basecase', [
            [0.88, 0.01, 0.03, 0.03, 0.00, 0.06, 0.01, 0.00, 0.00],
            [0.07, 0.95, 0.04, 0.05, 0.00, 0.02, 0.01, 0.00, 0.00],
            [0.01, 0.02, 0.85, 0.19, 0.05, 0.10, 0.00, 0.00, 0.00],
            [0.02, 0.01, 0.07, 0.01, 0.21, 0.12, 0.98, 0.00, 0.00],
            [0.01, 0.01, 0.02, 0.71, 0.74, 0.70, 0.00, 0.00, 0.00]]),
        ('With CO', [
            [0.88, 0.02, 0.02, 0.02, 0.00, 0.05, 0.00, 0.05, 0.00],
            [0.08, 0.94, 0.04, 0.02, 0.00, 0.01, 0.12, 0.04, 0.00],
            [0.01, 0.01, 0.79, 0.10, 0.00, 0.05, 0.00, 0.31, 0.00],
            [0.00, 0.02, 0.03, 0.38, 0.31, 0.31, 0.00, 0.59, 0.00],
            [0.02, 0.02, 0.11, 0.47, 0.69, 0.58, 0.88, 0.00, 0.00]]),
        ('With O3', [
            [0.89, 0.01, 0.07, 0.00, 0.00, 0.05, 0.00, 0.00, 0.03],
            [0.07, 0.95, 0.05, 0.04, 0.00, 0.02, 0.12, 0.00, 0.00],
            [0.01, 0.02, 0.86, 0.27, 0.16, 0.19, 0.00, 0.00, 0.00],
            [0.01, 0.03, 0.00, 0.32, 0.29, 0.27, 0.00, 0.00, 0.95],
            [0.02, 0.00, 0.03, 0.37, 0.56, 0.47, 0.87, 0.00, 0.00]]),
        ('CO & O3', [
            [0.87, 0.01, 0.08, 0.00, 0.00, 0.04, 0.00, 0.00, 0.01],
            [0.09, 0.95, 0.02, 0.03, 0.00, 0.01, 0.13, 0.06, 0.00],
            [0.01, 0.02, 0.71, 0.24, 0.13, 0.16, 0.00, 0.50, 0.00],
            [0.01, 0.03, 0.00, 0.28, 0.24, 0.23, 0.00, 0.44, 0.88],
            [0.02, 0.00, 0.18, 0.45, 0.64, 0.55, 0.86, 0.00, 0.16]])
    ]
    return data



N = 9
theta = radar_factory(N, frame='polygon')

data = example_data()
spoke_labels = data.pop(0)
# After removing the labels, `data` holds (title, case_data) pairs. This figure has
# a single axes, so it shows the first scenario. (`title` and `case_data` were
# previously never defined, which made the cell fail with NameError.)
title, case_data = data[0]

fig, ax = plt.subplots(figsize=(9, 9), nrows=1, ncols=1,
                        subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.25, hspace=0.20, top=0.85, bottom=0.05)

colors = ['b', 'r', 'g', 'm', 'y']
# Plot every factor of the selected scenario as a closed, filled outline

ax.set_rgrids([0.2, 0.4, 0.6, 0.8])
ax.set_title(title, weight='bold', size='medium', position=(0.5, 1.1),
             horizontalalignment='center', verticalalignment='center')
for d, color in zip(case_data, colors):
    ax.plot(theta, d, color=color)
    ax.fill(theta, d, facecolor=color, alpha=0.25)
ax.set_varlabels(spoke_labels)

# add legend near the top-right corner of the axes
labels = ('Factor 1', 'Factor 2', 'Factor 3', 'Factor 4', 'Factor 5')
legend = ax.legend(labels, loc=(0.9, .95),
                          labelspacing=0.1, fontsize='small')

fig.text(0.5, 0.965, '5-Factor Solution Profiles Across Four Scenarios',
         horizontalalignment='center', color='black', weight='bold',
         size='large')

plt.show()


---

# Differences between groups on toxicity score

### Kruskal-wallis test

In [12]:
# Kruskal-Wallis H-test on the toxicity score across the four groups.
group_names = ('White Man', 'White Woman', 'Black Man', 'Black Woman')
result = stats.kruskal(*(df.loc[df['group'] == name, 'score'] for name in group_names))
result

KeyError: 'score'

In [None]:
sns.displot(df, x="score", hue="group", kind="ecdf",complementary=True) # complementary ECDF of `score`, one curve per group. TODO (translated from the original note): check what the adjectives are

---

# Bad Words

In [13]:
# Load the word -> score mapping; the context manager closes the file handle,
# which the bare `open(...)` call left open.
with open('../data/bad_words_scored.json') as scores_file:
    data = json.load(scores_file)
word_scores = pd.DataFrame({'words': list(data.keys()), 'score': list(data.values())})
# Words scoring above 0.2 are treated as bad words.
bad_words = word_scores[word_scores.score>0.2]

In [14]:
def get_word_count(text, word):
    """Count how many space-separated tokens of `text` are exactly equal to `word`.

    Parameters
    ----------
    text : str
        Text to inspect; it is split on single spaces.
    word : str
        Token to look for (exact, case-sensitive match).

    Returns
    -------
    int
        Number of matching tokens.
    """
    # A plain list count gives the same number without building a pandas
    # Series on every call (this helper runs inside nested loops).
    return text.split(' ').count(word)

def get_bad_words_count(text, bad_words):
    """Return the total number of tokens of `text` that match any entry of `bad_words`.

    Parameters
    ----------
    text : str
        Text to inspect; it is split on single spaces.
    bad_words : iterable of str
        Words to count. An entry listed twice is counted twice.

    Returns
    -------
    int
        Sum, over `bad_words`, of the exact-match token counts.
    """
    from collections import Counter

    # Tokenise and count once, rather than re-splitting the text for every bad word.
    token_counts = Counter(text.split(' '))
    return sum(token_counts[bad_word] for bad_word in bad_words)

In [15]:
# Number of bad-word tokens in every cleaned text.
flagged_words = bad_words.words
df['bad_words_count'] = df['text_clean'].apply(lambda text: get_bad_words_count(text, flagged_words))

In [16]:
# Linear fit of score against bad-word count: one panel per group, then all data together.
display(
    sns.lmplot(
        data=df,
        x='bad_words_count',
        y='score',
        col="group",
        hue="group",
    )
)
display(
    sns.lmplot(
        data=df,
        x='bad_words_count',
        y='score',
    )
)

KeyError: "['score'] not in index"

## Pearson correlation

In [None]:
# Pearson correlation between bad-word count and score: overall, then per group.
print('For all data:', scipy.stats.pearsonr(df['bad_words_count'], df['score']))

for group in df['group'].unique():
    in_group = df[df['group'] == group]
    print('For {}:'.format(group), scipy.stats.pearsonr(in_group['bad_words_count'], in_group['score']))

## Spearman correlation

In [None]:
# Spearman correlation between bad-word count and score: overall, then per group.
print('For all data:', scipy.stats.spearmanr(df['bad_words_count'], df['score']))

for group in df['group'].unique():
    in_group = df[df['group'] == group]
    print('For {}:'.format(group), scipy.stats.spearmanr(in_group['bad_words_count'], in_group['score']))

### Kruskal-Wallis on bad words count

In [None]:
# Kruskal-Wallis H-test on the bad-word count across the four groups.
group_names = ('White Man', 'White Woman', 'Black Man', 'Black Woman')
result = stats.kruskal(*(df.loc[df['group'] == name, 'bad_words_count'] for name in group_names))
result

In [None]:
sns.displot(df, x="bad_words_count", hue="group", kind="ecdf",complementary=True) # complementary ECDF of `bad_words_count`, one curve per group. TODO (translated from the original note): check what the adjectives are

---

# Words relation - bad words

In [None]:
# Bad words (score > 0.2) indexed by word, with one occurrence-count column per group.
bad_words = word_scores[word_scores.score>0.2]

bad_words = bad_words.set_index('words')
bad_words['White Man'] = np.nan
bad_words['Black Man'] = np.nan
bad_words['White Woman'] = np.nan
bad_words['Black Woman'] = np.nan

# Count every space-separated token once per group and look the bad words up in that
# table. This gives the same exact-token counts as calling get_word_count for every
# (word, group, sentence) triple, without the triple Python loop.
for group in df.group.unique():
    token_counts = (
        df.loc[df.group == group, 'text_clean']
        .str.split(' ')
        .explode()
        .value_counts()
    )
    bad_words[group] = (
        token_counts.reindex(bad_words.index, fill_value=0).astype(float).to_numpy()
    )
            
# bad_words = bad_words.reset_index()

In [None]:
# Twenty most frequent bad words for each group, in a fixed group order.
for group_name in ('White Man', 'White Woman', 'Black Man', 'Black Woman'):
    display(bad_words[group_name].sort_values(ascending=False).head(20))

### Word: dope - Black Man

In [None]:
# Texts of the "Black Man" group, passed to the project helper together with the target word.
black_man_texts = df.loc[df['group'] == "Black Man", 'text_clean']
before, after = get_word_relations('dope', black_man_texts)

In [None]:
display(before,after) # show both values returned by get_word_relations for 'dope'

### Word: damn - Black Woman

In [None]:
# Texts of the "Black Woman" group, passed to the project helper together with the target word.
black_woman_texts = df.loc[df['group'] == "Black Woman", 'text_clean']
a, b = get_word_relations('damn', black_woman_texts)

In [None]:
display(a,b) # show both values returned by the preceding get_word_relations call

### Word: gay - Black Woman

In [None]:
# Texts of the "Black Woman" group, passed to the project helper together with the target word.
black_woman_texts = df.loc[df['group'] == "Black Woman", 'text_clean']
a, b = get_word_relations('gay', black_woman_texts)

In [None]:
display(a,b) # show both values returned by the preceding get_word_relations call

### Word: extra

In [None]:
# Same helper, this time over the texts of every group.
all_texts = df['text_clean']
a, b = get_word_relations('hella', all_texts)

In [None]:
display(a,b) # show both values returned by the preceding get_word_relations call

---

# Words relation - informal - LIWC

In [None]:
# Empty frame with one column per group; rows (words) are added by the next cell.
informal = pd.DataFrame()
informal['White Man'] = np.nan
informal['Black Man'] = np.nan
informal['White Woman'] = np.nan
informal['Black Woman'] = np.nan

In [None]:
# Accumulate, per word and per group, the LIWC "informal" value of every occurrence.
# Totals are gathered in plain dicts: the earlier `informal.loc[word, group] += value`
# left a group at NaN whenever another group had created the word's row first
# (NaN + value is NaN), so only the first group to use a word was ever counted.
group_columns = list(informal.columns)
informal_value = {}   # word -> LIWC "informal" value, so each distinct word is parsed once
informal_counts = {}  # word -> {group: accumulated value}
for _, row in df.iterrows():
    for word in row['text_clean'].split(' '):
        if word not in informal_value:
            informal_value[word] = liwc.parse([word])['informal']
        liwc_val = informal_value[word]
        if liwc_val > 0:
            per_group = informal_counts.setdefault(word, {})
            per_group[row['group']] = per_group.get(row['group'], 0) + liwc_val

informal = pd.DataFrame.from_dict(informal_counts, orient='index')
# Keep the predefined group columns first, followed by any other group found in the data.
extra_columns = [col for col in informal.columns if col not in group_columns]
informal = informal.reindex(columns=group_columns + extra_columns).fillna(0)

In [None]:
informal.idxmax() # for each group column, the row label (word) holding the largest value

In [None]:
# We need to compute each informal word's toxicity value to compare how much toxicity is added by these words

---

# Looking for correlations in the data

### Top k = 20

In [None]:
# The 20 highest-scoring rows of each group. `.copy()` makes `topk` an independent
# frame, so the columns added to it in later cells do not write into a slice of `df`
# (which triggers SettingWithCopyWarning).
topk = df.sort_values('score',ascending = False).groupby('group').head(20).copy()

In [None]:
# LIWC categories to extract from `liwc_count` into their own `topk` columns.
_vars = ['swear', 'social', 'informal', 'netspeak', 'anx', 'adj', 'quant', 'filler']

for var in _vars:
    topk[var] = [get_key_counts(counts, var) for counts in topk['liwc_count']]

In [None]:
sns.pairplot(topk[_vars+['score','group']], hue='group') # pairwise plots of the selected LIWC columns and `score`, coloured by group

### Spearman correlation

In [None]:
# Spearman correlation between bad-word count and score within the top-k rows.
print('For all data:', scipy.stats.spearmanr(topk['bad_words_count'], topk['score']))

for group in df['group'].unique():
    topk_group = topk[topk['group'] == group]
    print('For {}:'.format(group), scipy.stats.spearmanr(topk_group['bad_words_count'], topk_group['score']))

In [None]:
# Bad words (score > 0.2) indexed by word, with one occurrence-count column per group.
bad_words = word_scores[word_scores.score>0.2]

bad_words = bad_words.set_index('words')
bad_words['White Man'] = np.nan
bad_words['Black Man'] = np.nan
bad_words['White Woman'] = np.nan
bad_words['Black Woman'] = np.nan

# Count every space-separated token once per group and look the bad words up in that
# table. This gives the same exact-token counts as calling get_word_count for every
# (word, group, sentence) triple, without the triple Python loop.
for group in df.group.unique():
    token_counts = (
        df.loc[df.group == group, 'text_clean']
        .str.split(' ')
        .explode()
        .value_counts()
    )
    bad_words[group] = (
        token_counts.reindex(bad_words.index, fill_value=0).astype(float).to_numpy()
    )
            

In [None]:
# Reload the word -> score mapping; the context manager closes the file handle,
# which the bare `open(...)` call left open.
with open('../data/bad_words_scored.json') as scores_file:
    data = json.load(scores_file)
word_scores = pd.DataFrame({'words': list(data.keys()), 'score': list(data.values())})
# bad_words = word_scores[word_scores.score>0.2]

In [None]:
# Keep the `word_scores` frame loaded in the previous cell: resetting it to an empty
# list makes the `word_scores.score` filter in the next cell raise AttributeError.
word_scores.head()

In [None]:
# Flag every text containing at least one word scored above 0.8.
# NOTE: this is a substring test, so a word also matches inside longer words.
df['has_bad_word'] = False
for word in tqdm(word_scores[word_scores.score>0.8].words):
    # regex=False: the words are literal text, not patterns. The mask is computed
    # once and reused for both the flag and the printed count.
    contains_word = df.text_clean.str.contains(word, regex=False)
    df['has_bad_word'] |= contains_word
    print(word, contains_word.sum())
    

In [None]:
# Number of texts per group flagged by `has_bad_word`. Summing only that column avoids
# adding up text and Counter columns, which is meaningless and fails on recent pandas.
df.groupby('group')['has_bad_word'].sum()

In [None]:
df.groupby('group').count() # non-null value count of every column, per group

### Word: gay - Black Woman

In [None]:
# Texts of the "Black Woman" group, passed to the project helper together with the target word.
black_woman_texts = df.loc[df['group'] == "Black Woman", 'text_clean']
a, b = get_word_relations('gay', black_woman_texts)

In [None]:
display(a,b) # show both values returned by the preceding get_word_relations call

In [None]:
# Relations of 'ass' in the top-k texts of two groups, separated by a blank line.
black_woman_topk_texts = topk.loc[topk['group'] == 'Black Woman', 'text_clean']
display(get_word_relations('ass', black_woman_topk_texts))
print()
white_man_topk_texts = topk.loc[topk['group'] == 'White Man', 'text_clean']
display(get_word_relations('ass', white_man_topk_texts))

In [None]:
# Relations of 'black' in the top-k texts of two groups, separated by a blank line.
black_woman_topk_texts = topk.loc[topk['group'] == 'Black Woman', 'text_clean']
display(get_word_relations('black', black_woman_topk_texts))
print()
white_man_topk_texts = topk.loc[topk['group'] == 'White Man', 'text_clean']
display(get_word_relations('black', white_man_topk_texts))

---

In [None]:
var = 'informal'
# Per-row count of the LIWC category `var` within the top-k rows.
topk[var] = topk.liwc_count.apply(get_key_counts,args=(var,))
# Plot `topk`, the frame that received the column; `df` has no 'informal' column,
# so plotting it fails.
display(sns.lmplot(col="group", hue="group", data=topk,x=var,y='score'))

---

In [None]:
var = 'swear'
# Top-k rows: per-row count of the LIWC category `var`, plotted against the score.
topk[var] = topk.liwc_count.apply(get_key_counts,args=(var,))
# Plot `topk` here: `df` only gets its 'swear' column two statements below.
display(sns.lmplot(col="group", hue="group", data=topk,x=var,y='score'))

# Same plot over the whole dataset.
df[var] = df.liwc_count.apply(get_key_counts,args=(var,))
display(sns.lmplot(col="group", hue="group", data=df,x=var,y='score'))

---

In [None]:
# Categories whose Kruskal-Wallis p-value is below 0.02, most significant first.
experiments.loc[experiments['pvalue'] < 0.02].sort_values('pvalue')