In [1]:
#%pylab inline
#pylab.rcParams['figure.figsize'] = (10, 6)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import random
import nltk
from random import randint
from collections import OrderedDict,Counter

In [2]:
# Load the dataset into `df`. The path is relative, so GenderEven207.csv is
# resolved against the kernel's current working directory.
df = pd.read_csv('GenderEven207.csv')

In [69]:
def preprocess(text,n_tags=None,filt=True,lower=False):
    """
    Tokenize and POS-tag a text string, then drop tokens with unwanted tags.

    Parameters
    ----------
    text : the raw string to process.
    n_tags : optional extra tags to drop, added to the built-in exclusion
        list. Only used when ``filt`` is true.
    filt : when true, tokens tagged as punctuation, determiners (DT),
        coordinating conjunctions (CC), prepositions (IN) or TO are removed.
        When false nothing is removed and ``n_tags`` is ignored.
    lower : when true, the text is lower-cased before tokenizing and tagging.

    Returns
    -------
    list of (token, tag) tuples in their original order.
    """
    excluded = []
    if filt:
        excluded = ['$',"''",'(',')','--','.',',',':','DT','CC','IN','TO']
        if n_tags:
            excluded.extend(n_tags)

    source = text.lower() if lower else text

    # Tag the full token sequence first; filtering happens on the tagged pairs.
    tagged = nltk.pos_tag(nltk.word_tokenize(source))

    return [(word, tag) for word, tag in tagged if tag not in excluded]
    
    
def freqdist_by_cat(df,topN=10,focus=None,gender=None,ppraw='PP_RAW'):
    """
    Count the most frequent items in a token column, optionally per subgroup.

    Parameters
    ----------
    df : DataFrame holding the column named by ``ppraw``. A 'FOCUS' column is
        required when ``focus`` is given and a 'GENDER' column when ``gender``
        is given.
    topN : number of (item, count) pairs to return.
    focus, gender : when truthy, keep only rows whose 'FOCUS' / 'GENDER' value
        equals the argument; falsy values (None, '') mean "do not filter".
        When both are given, a row must match both.
    ppraw : name of the column whose cells are iterables of hashable items
        (in this notebook, the (token, tag) lists produced by preprocess).

    Returns
    -------
    list of (item, count) pairs, most frequent first, exactly as returned by
    ``Counter.most_common(topN)``.
    """
    if focus and gender:
        # Each comparison must be parenthesised: `&` binds tighter than `==`,
        # so the bare form never builds the intended boolean row mask.
        rows = (df['FOCUS'] == focus) & (df['GENDER'] == gender)
        selected = df.loc[rows, ppraw]
    elif gender:
        selected = df.loc[df['GENDER'] == gender, ppraw]
    elif focus:
        selected = df.loc[df['FOCUS'] == focus, ppraw]
    else:
        selected = df[ppraw]

    counts = Counter()
    for tokens in selected:
        counts.update(tokens)

    return counts.most_common(topN)
    
    
def catwelgo_topNs(df,cw,topN=20,ppraw='PP_RAW',pos_filter=False,n_tags=None):
    """
    Build a table of the most common tokens for every label in column ``cw``.

    Parameters
    ----------
    df : DataFrame containing the columns named by ``cw`` and ``ppraw``.
    cw : name of a column whose cells are comma-separated label strings
        (called with 'CATS' and 'WELLGOS' in this notebook). Labels are
        compared verbatim after splitting on ','; no whitespace is stripped.
    topN : number of most-common entries kept per label.
    ppraw : name of the column whose cells are lists of (token, tag) tuples.
    pos_filter : when true, tokens whose tag is in ``n_tags`` are dropped
        before counting. When false, ``n_tags`` is ignored.
    n_tags : iterable of tags to drop; None (the default) means no tags.

    Returns
    -------
    DataFrame indexed by the sorted labels with columns 1..topN. Each cell is
    an ((token, tag), count) pair, most frequent first. Labels with fewer than
    ``topN`` distinct tokens are padded with None.
    """
    # Accept the default None instead of failing on it.
    excluded_tags = frozenset(n_tags) if n_tags else frozenset()

    # Single pass over the rows: every label listed on a row receives that
    # row's tokens exactly once, even if the label is repeated in the cell.
    tokens_by_label = {}
    for labels, tagged in zip(df[cw], df[ppraw]):
        if pos_filter:
            tagged = [(w, p) for w, p in tagged if p not in excluded_tags]
        for label in set(labels.split(',')):
            tokens_by_label.setdefault(label, []).extend(tagged)

    cws = sorted(tokens_by_label)

    counted = []
    for label in cws:
        top = Counter(tokens_by_label[label]).most_common(topN)
        # Pad short rows so the table always has exactly topN columns.
        counted.append(top + [None] * (topN - len(top)))

    _df = pd.DataFrame(index=cws,columns=range(1,topN+1),data=counted)

    return _df
        

In [6]:
# Tokenize and POS-tag every RAW text once and store the resulting
# (token, tag) list in PP_RAW.
# The column is assigned as a whole on purpose: rows yielded by
# DataFrame.iterrows() may be copies, so writing into them is not guaranteed
# to update df.
df['PP_RAW'] = df['RAW'].apply(preprocess)

In [7]:
# Display the frame to check the newly added PP_RAW column.
df

Unnamed: 0,RAW,CATS,WELLGOS,FOCUS,N,GENDER,PP_RAW
0,"Throughout my drinking career, instant gratifi...","PROGRESS / start small,INNER MASTERY / take a ...","Baby Step Embracer,Finish Line Forgetter,The M...",DRUGS/ALC,Paul,male,"[(my, PRP$), (drinking, NN), (career, NN), (in..."
1,I tried to control every aspect of my life. Wh...,"CONTROL,SERENITY,FREEDOM / past & let go,TRUST...","Controller Coaster,Should Shover,Serenity Simu...",DRUGS/ALC,Paul,male,"[(I, PRP), (tried, VBD), (control, VB), (aspec..."
2,Thinking about drinking – the taste and feel –...,"CRAVINGS / will power & tools,INNER POWER / Se...","Pattern Interrupter,New Pattern Painter,Craver...",DRUGS/ALC,Paul,male,"[(Thinking, VBG), (drinking, NN), (–, VBP), (t..."
3,I always thought today/this weekend/this week ...,"DELUSION,CONFESSION / denial,TRUTH / self honesty","Delusion Detector,Sabotage SOS’r,Insanity SOS’...",DRUGS/ALC,Paul,male,"[(I, PRP), (always, RB), (thought, VBD), (toda..."
4,The people around me only saw what I wanted th...,"CONFIDENCE / acceptance & worth,DELUSION,VULNE...","Self Esteem Elixir,Confidence Caffeinator,Inne...",DRUGS/ALC,Paul,male,"[(people, NNS), (me, PRP), (only, RB), (saw, V..."
5,Resigned to the fact that being poor or homele...,"CONFIDENCE / hope & future,FREEDOM / past & l...","Excuse Exterminator,Rationalization Rinser,Gra...",DRUGS/ALC,Paul,male,"[(Resigned, VBN), (fact, NN), (being, VBG), (p..."
6,"Alcohol was great for two things, suppressing ...","WORTHINESS / connections & sharing,VULNERABILI...","Emotions Engineer,Feelings Fantasizer,Compassi...",DRUGS/ALC,Paul,male,"[(Alcohol, NNP), (was, VBD), (great, JJ), (two..."
7,I’ve started more things than I can count and ...,"COMMITMENTS,MOTIVATION,RESPONSIBILITY,INNER PO...","Finish Line Forgetter,The Procrastina-shunner,...",DRUGS/ALC,Paul,male,"[(I’ve, NNP), (started, VBD), (more, JJR), (th..."
8,How great it was to have a friend that I could...,"EGO,PRIDE ,INFLUENCES,COMMUNITY / healthy rela...","Foul Weather Friender,Arrogance Arrestor,Pride...",DRUGS/ALC,Paul,male,"[(How, WRB), (great, JJ), (it, PRP), (was, VBD..."
9,Diving headlong into whatever had caught my in...,"COMMITMENTS,PROGRESS / start small,INNER POWER...","Symbolic Stepper,New Pattern Painter / grit & ...",DRUGS/ALC,Paul,male,"[(Diving, VBG), (headlong, NN), (whatever, WDT..."


In [42]:
# Top-30 most frequent (token, tag) pairs: whole corpus and per gender.
afag = freqdist_by_cat(df,30) #all focus, all gender
afom = freqdist_by_cat(df,30,gender='male') #all focus / male
afof = freqdist_by_cat(df,30,gender='female') #all focus / female

# One column per group; wrapping in Series lets the columns align by rank
# even if a group has fewer than 30 distinct entries.
df_g_w = pd.DataFrame({
    'ALL': pd.Series(afag),
    'MALE': pd.Series(afom),
    'FEMALE': pd.Series(afof),
})

# Number of filtered tokens per document, computed once per gender.
male_lengths = [len(s) for s in df.loc[df['GENDER'] == 'male', 'PP_RAW']]
female_lengths = [len(s) for s in df.loc[df['GENDER'] == 'female', 'PP_RAW']]

# Average over the actual number of documents in each group rather than a
# hard-coded count; max(..., 1) avoids dividing by zero for an empty group.
print('Male average words: %4.2f' %(sum(male_lengths)/max(len(male_lengths), 1))) # Male average words
print('Female average words: %4.2f' %(sum(female_lengths)/max(len(female_lengths), 1))) #Female average words
print('Total words Male: ', sum(male_lengths)) # Male total words
print('Total words Female: ', sum(female_lengths)) # Female total words

Male average words: 114.51
Female average words: 85.41
Total words Male:  23703
Total words Female:  17679


In [29]:
# Save the ALL/MALE/FEMALE top-30 table as CSV without the index column.
# Assumes the Raw/ directory already exists relative to the working directory.
df_g_w.to_csv('Raw/W-fdist_Top30-gender.csv',index=False)

In [28]:
# Inspect the combined frequency table (one column per group, one row per rank).
df_g_w

Unnamed: 0,ALL,MALE,FEMALE
0,"((I, PRP), 3829)","((I, PRP), 2281)","((I, PRP), 1548)"
1,"((my, PRP$), 1529)","((my, PRP$), 789)","((my, PRP$), 740)"
2,"((me, PRP), 772)","((was, VBD), 607)","((is, VBZ), 373)"
3,"((was, VBD), 735)","((it, PRP), 427)","((me, PRP), 360)"
4,"((it, PRP), 617)","((me, PRP), 412)","((am, VBP), 309)"
5,"((is, VBZ), 596)","((n't, RB), 246)","((not, RB), 221)"
6,"((am, VBP), 356)","((had, VBD), 227)","((have, VBP), 211)"
7,"((not, RB), 317)","((is, VBZ), 223)","((it, PRP), 190)"
8,"((have, VBP), 312)","(('d, MD), 187)","((recovery, NN), 175)"
9,"((myself, PRP), 311)","((myself, PRP), 186)","((can, MD), 150)"


In [65]:
%%time
# Most common unfiltered (token, tag) pairs per label, using the function's
# default topN of 20: once for the WELLGOS column, once for the CATS column.
# %%time reports how long the whole cell takes.
countwelgo = catwelgo_topNs(df,'WELLGOS')
countcats = catwelgo_topNs(df,'CATS')

Wall time: 39.1 s


In [70]:
# Save each table under the file name that matches its content: the WELLGOS
# table goes to the Welgs_ file and the CATS table to the Cats_ file.
# Assumes the Raw/ directory already exists relative to the working directory.
countwelgo.to_csv('Raw/Welgs_W-most_common-Top20.csv')
countcats.to_csv('Raw/Cats_W-most_common-Top20.csv')

In [79]:
# Tags removed before counting so that content words surface: personal and
# possessive pronouns (PRP, PRP$), adverbs (RB), modals (MD) and the verb
# forms VBD, VBZ, VBP, VB. Defined once so both calls use the same list.
FILTERED_TAGS = ['PRP','PRP$','VBD','RB','VBZ','VBP','VB','MD']

countwelgo_flt = catwelgo_topNs(df,'WELLGOS',pos_filter=True,n_tags=FILTERED_TAGS)
countcats_flt = catwelgo_topNs(df,'CATS',pos_filter=True,n_tags=FILTERED_TAGS)

In [81]:
# Save each filtered table under the file name that matches its content: the
# WELLGOS table goes to the Welgs_ file and the CATS table to the Cats_ file.
# Assumes the Raw/ directory already exists relative to the working directory.
countwelgo_flt.to_csv('Raw/Welgs_W-most_common-Top20_filted.csv')
countcats_flt.to_csv('Raw/Cats_W-most_common-Top20_filted.csv')

In [80]:
# Inspect the POS-filtered table for the CATS column (one row per category label).
countcats_flt

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
"""WILL POWER\n""","((I’m, NNP), 3)","((better, JJR), 2)","((matter, NN), 1)","((way, NN), 1)","((how, WRB), 1)","((deaf, NN), 1)","((listening, NN), 1)","((tough, JJ), 1)","((listener, NN), 1)","((what, WP), 1)","((gut, NN), 1)","((part, NN), 1)","((willpower, VBN), 1)","((mastered, JJ), 1)","((ready, NN), 1)","((more, JJR), 1)","((ego, NN), 1)","((more, RBR), 1)","((little, JJ), 1)","((guy, NN), 1)"
ABSTINENCE,"((things, NNS), 8)","((abstinence, NN), 7)","((doing, VBG), 6)","((life, NN), 6)","((friends, NNS), 6)","((drugs, NNS), 6)","((way, NN), 6)","((that, WDT), 5)","((addiction, NN), 5)","((who, WP), 5)","((up, RP), 4)","((reward, NN), 4)","((what, WP), 4)","((sobriety, NN), 4)","((family, NN), 4)","((able, JJ), 4)","((ride, NN), 4)","((time, NN), 4)","((charge, NN), 4)","((one, CD), 4)"
ACCOUNTABILITY / Amends,"((amends, NNS), 8)","((going, VBG), 8)","((people, NNS), 6)","((detox, NN), 4)","((who, WP), 4)","((what, WP), 4)","((damage, NN), 4)","((behavior, NN), 4)","((how, WRB), 4)","((others, NNS), 4)","((friends, NNS), 4)","((addiction, NN), 4)","((done, VBN), 4)","((life, NN), 3)","((list, NN), 3)","((recovery, NN), 3)","((much, JJ), 3)","((that, WDT), 3)","((way, NN), 3)","((Making, VBG), 2)"
ACCOUNTABILITY / De-blame,"((using, VBG), 3)","((blame, NN), 3)","((life, NN), 2)","((recovery, NN), 2)","((responsibility, NN), 2)","((part, NN), 2)","((game, NN), 2)","((Ditching, VBG), 2)","((something, NN), 2)","((back-and-forth, JJ), 1)","((kept, FW), 1)","((victim, NN), 1)","((changing, NN), 1)","((“It, JJ), 1)","((being, VBG), 1)","((fault, NN), 1)","((why, WRB), 1)","((better, RBR), 1)","((given, VBN), 1)","((change, NN), 1)"
ACCOUNTABILITY / Own up,"((actions, NNS), 7)","((mistakes, NNS), 7)","((things, NNS), 6)","((someone, NN), 6)","((When, WRB), 6)","((responsibility, NN), 6)","((own, JJ), 6)","((time, NN), 6)","((accountable, JJ), 5)","((how, WRB), 5)","((making, VBG), 5)","((recovery, NN), 5)","((excuses, NNS), 5)","((people, NNS), 5)","((friends, NNS), 5)","((addiction, NN), 5)","((pain, NN), 5)","((hostility, NN), 4)","((who, WP), 4)","((when, WRB), 4)"
ACCOUNTABILITY / damage confess,"((been, VBN), 16)","((damage, NN), 15)","((life, NN), 14)","((addiction, NN), 12)","((friends, NNS), 10)","((drugs, NNS), 10)","((years, NNS), 8)","((hardships, NNS), 6)","((that, WDT), 6)","((up, RP), 5)","((how, WRB), 5)","((problem, NN), 5)","((done, VBN), 5)","((something, NN), 5)","((who, WP), 5)","(('s, POS), 5)","((lot, NN), 5)","((others, NNS), 5)","((detox, NN), 4)","((wrong, JJ), 4)"
AGONY,"((depression, NN), 6)","((misery, NN), 6)","((that, WDT), 5)","((life, NN), 4)","((way, NN), 4)","((when, WRB), 4)","((more, RBR), 4)","((cravings, NNS), 4)","((addiction, NN), 4)","((one, CD), 4)","((day, NN), 4)","((unhealthy, JJ), 3)","((who, WP), 3)","((bad, JJ), 3)","((–, NNP), 3)","((own, JJ), 3)","((time, NN), 3)","((food, NN), 3)","((beautiful, JJ), 2)","((years, NNS), 2)"
AMENDS,"((amends, NNS), 15)","((making, VBG), 6)","((going, VBG), 6)","((people, NNS), 6)","((what, WP), 5)","((that, WDT), 5)","((mistakes, NNS), 5)","((others, NNS), 5)","((past, JJ), 4)","((when, WRB), 4)","((someone, NN), 4)","((damage, NN), 4)","((behavior, NN), 4)","((recovery, NN), 4)","((more, JJR), 4)","((one, CD), 4)","((something, NN), 3)","((Making, VBG), 3)","((list, NN), 3)","((things, NNS), 3)"
ANGER,"((anger, NN), 14)","((angry, JJ), 5)","((way, NN), 5)","((positive, JJ), 4)","((more, RBR), 4)","((feeling, NN), 4)","((Anger, NNP), 4)","((life, NN), 3)","((God, NNP), 3)","((When, WRB), 3)","((emotion, NN), 3)","((one, CD), 3)","((detox, NN), 2)","((angerexic, NN), 2)","((someone, NN), 2)","((time, NN), 2)","((heart, NN), 2)","((order, NN), 2)","((person, NN), 2)","((feelings, NNS), 2)"
ANXIETY,"((anxiety, NN), 9)","((situation, NN), 6)","((problem, NN), 5)","((more, JJR), 4)","((When, WRB), 4)","((exiled, VBN), 3)","((that, WDT), 3)","((pain, NN), 3)","((being, VBG), 2)","((program, NN), 2)","((life, NN), 2)","((problems, NNS), 2)","((God, NNP), 2)","((free, JJ), 2)","((rid, JJ), 2)","((normal, JJ), 2)","((panic, JJ), 2)","((solution, NN), 2)","((exiler, NN), 2)","((one, CD), 2)"


In [8]:
# Print NLTK's reference listing of the Penn Treebank tags, i.e. the tag names
# used in preprocess's exclusion list and in the n_tags filters.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [51]:
# NOTE(review): `l` is not assigned in any cell visible in this notebook; this
# looks like a leftover scratch cell and would raise NameError on a fresh
# kernel unless `l` is defined somewhere not shown - confirm and remove.
l

[1, 2, 3, 4, 5]