# Check science reply questions
Let's look at some data related to sharing new science articles and the questions that people pose in response to the articles.

We'll see how readily we can predict author background using the questions.

In [8]:
## get Reddit data!!
from datetime import datetime
from psaw import PushshiftAPI
from tqdm import tqdm
# from data_helpers import load_reddit_api
# reddit_api, pushshift_api = load_reddit_api('../../data/auth_data/reddit_auth.csv')
pushshift_api = PushshiftAPI()
date_range = ['2020-01-01', '2021-09-01']
date_range = list(map(lambda x: int(datetime.strptime(x, '%Y-%m-%d').timestamp()), date_range))
subreddit = 'science'
filter_fields = ['url', 'title', 'author', 'score', 'text', 'created_utc', 'id', 'upvote_ratio', 'num_comments']
submissions = pushshift_api.search_submissions(q="*", after=date_range[0], before=date_range[1],
                                               subreddit=subreddit, filter=filter_fields)
submissions_results = []
for s in tqdm(submissions):
    submissions_results.append(s)

47091it [07:52, 99.66it/s] 


In [14]:
## convert to data frame
import pandas as pd
submission_data = pd.DataFrame(submissions_results)
display(submission_data.head())

Unnamed: 0,author,created_utc,id,num_comments,score,title,upvote_ratio,url,created,d_
0,talismanbrandi,1630462648,pfkdt5,3,1,Socio-economic disparities and COVID-19 in the...,1.0,https://www.reddit.com/r/science/comments/pfkd...,1630480648.0,"{'author': 'talismanbrandi', 'created_utc': 16..."
1,BeforeYourBBQ,1630462436,pfkbn2,3,1,Comparing SARS-CoV-2 natural immunity to vacci...,1.0,https://www.medrxiv.org/content/10.1101/2021.0...,1630480436.0,"{'author': 'BeforeYourBBQ', 'created_utc': 163..."
2,key__lime_pie,1630462250,pfk9q9,2,1,Scientists Figured Out How Much Exercise You N...,1.0,https://www.sciencealert.com/scientists-figure...,1630480250.0,"{'author': 'key__lime_pie', 'created_utc': 163..."
3,key__lime_pie,1630462179,pfk90j,468,1,Female octopuses throw shells at males annoyin...,1.0,https://www.independent.co.uk/climate-change/n...,1630480179.0,"{'author': 'key__lime_pie', 'created_utc': 163..."
4,Doozenburg,1630461660,pfk3ts,2,1,Who is Anti-Vax Dr. Wendy Menigoz?,1.0,https://www.slugbrain.com/post/who-is-anti-vax...,1630479660.0,"{'author': 'Doozenburg', 'created_utc': 163046..."


Let's collect all comments from the same time frame, and align them to submissions afterward.

In [None]:
date_range = ['2020-01-01', '2021-09-01']
date_range = list(map(lambda x: int(datetime.strptime(x, '%Y-%m-%d').timestamp()), date_range))
subreddit = 'science'
filter_fields = ['id', 'link_id', 'parent_id', 'body', 'author', 'created_utc', 'score']
comments = pushshift_api.search_comments(after=date_range[0], before=date_range[1],
                                         subreddit=subreddit, filter=filter_fields)
comments_results = []
for c in tqdm(comments):
    comments_results.append(c)

907905it [2:33:53, 83.42it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

1445247it [4:03:56, 98.15it/s] 

In [34]:
## combine/clean
comment_data = pd.DataFrame(comments_results)
# drop deleted data
comment_data = comment_data[(comment_data.loc[:, 'author']!='[deleted]') &
                            (comment_data.loc[:, 'body']!='[deleted]')]
# fix ID vars
comment_data = comment_data.assign(**{
    'link_id' : comment_data.loc[:, 'link_id'].apply(lambda x: x.split('_')[1]),
    'parent_id' : comment_data.loc[:, 'parent_id'].apply(lambda x: x.split('_')[1]),
})
# drop extra data
comment_data.drop('d_', axis=1, inplace=True)
display(comment_data.head())
print(comment_data.shape[0])

Unnamed: 0,author,body,created_utc,id,link_id,parent_id,score,created
0,Etna,"oh I misread, it's the males that are being an...",1630468787,hb5b6ss,pfk90j,hb5b25v,27,1630487000.0
3,lalauna,Please tell me something i didn't know before.,1630468726,hb5b2zz,pfgvrw,pfgvrw,1,1630487000.0
4,DivineBanana,I haven't followed the debate but I'm sure it'...,1630468726,hb5b2zy,pf5phr,hb58igp,3,1630487000.0
5,_MASTADONG_,I just linked you to 2 articles on fact checki...,1630468723,hb5b2sg,pfgvrw,hb5adks,4,1630487000.0
8,Etna,It's because they can't talk,1630468713,hb5b25v,pfk90j,pfk90j,37,1630487000.0


2196985


In [35]:
## restrict to comment/submission matches
submission_comment_data = pd.merge(submission_data, comment_data, left_on='id', right_on='link_id', how='inner')
# fix col names
submission_comment_data.rename(columns={
    x : x.replace('_x', '_submission') 
    for x in list(filter(lambda x: x.endswith('_x'), submission_comment_data.columns))
}, inplace=True)
submission_comment_data.rename(columns={
    x : x.replace('_y', '_comment') 
    for x in list(filter(lambda x: x.endswith('_y'), submission_comment_data.columns))
}, inplace=True)
submission_comment_data = submission_comment_data[submission_comment_data.loc[:, 'link_id']==submission_comment_data.loc[:, 'parent_id']]

In [36]:
submission_comment_data.shape[0]

465354

In [52]:
## clean text
import re
RETURN_MATCHER = re.compile('[\n\r]')
submission_comment_data = submission_comment_data.assign(**{
    'body' : submission_comment_data.loc[:, 'body'].apply(lambda x: RETURN_MATCHER.sub(' ', x))
})
## filter questions
from nltk.tokenize import sent_tokenize
submission_comment_data = submission_comment_data.assign(**{
    'reply_sents' : submission_comment_data.loc[:, 'body'].apply(lambda x: sent_tokenize(x))
})
# look for questions!
import re
question_matcher = re.compile('\?$')
submission_comment_data = submission_comment_data.assign(**{
    'reply_questions' : submission_comment_data.loc[:, 'reply_sents'].apply(lambda x: list(filter(lambda y: question_matcher.search(y) is not None, x)))
})
submission_question_data = submission_comment_data[submission_comment_data.loc[:, 'reply_questions'].apply(len)>0]
## flatten
flat_submission_question_data = []
for idx_i, data_i in submission_question_data.iterrows():
    for q_j in data_i.loc['reply_questions']:
        data_j = data_i.copy().drop('reply_questions')
        data_j.loc['reply_question'] = q_j
        flat_submission_question_data.append(data_j)
flat_submission_question_data = pd.concat(flat_submission_question_data, axis=1).transpose()
print(f'{flat_submission_question_data.shape[0]} questions total')

130878 questions total


In [56]:
## look at sample questions => clarification questions? self-contained? related to post?
## sample questions
pd.set_option('display.max_colwidth', 1000)
display(flat_submission_question_data.loc[:, 'reply_question'].iloc[:10].values)

array(['die for the economy?', 'how about we topple you instead?',
       'Are we really trying to blame covid for why my political leadership is incompetent and worthless?',
       'You mean covid unveils a psychological burden of perpetual political unrest in USA?',
       'Psychological burden of the virus itself, or the various lockdown measures that forced people to isolate?',
       'Im slow, but i believe the abstract reads that the native species are evolving to become more cannibalistic themselves eating more of the young of the invaders?',
       'So, when they collide, they destroy each other.”  [source](https://www.cam.ac.uk/research/news/astronomers-show-how-planets-form-in-binary-systems-without-getting-crushed)  Am I missing something here?',
       'How would these evictions **double** the Covid rate in an **area**?',
       'Have I misread something?',
       "Don't you care about the environment?"], dtype=object)

Let's limit the questions to have at least X words.

In [60]:
from nltk.tokenize import WordPunctTokenizer
min_question_words = 10
tokenizer = WordPunctTokenizer()
valid_submission_question_data = flat_submission_question_data[flat_submission_question_data.loc[:, 'reply_question'].apply(lambda x: len(tokenizer.tokenize(x)) >= min_question_words)]
display(valid_submission_question_data.loc[:, 'reply_question'].iloc[:10].values)

array(['Are we really trying to blame covid for why my political leadership is incompetent and worthless?',
       'You mean covid unveils a psychological burden of perpetual political unrest in USA?',
       'Psychological burden of the virus itself, or the various lockdown measures that forced people to isolate?',
       'Im slow, but i believe the abstract reads that the native species are evolving to become more cannibalistic themselves eating more of the young of the invaders?',
       'So, when they collide, they destroy each other.”  [source](https://www.cam.ac.uk/research/news/astronomers-show-how-planets-form-in-binary-systems-without-getting-crushed)  Am I missing something here?',
       'How would these evictions **double** the Covid rate in an **area**?',
       'Wait, who was saying there would be a pandemic baby boom and why?',
       "Isn't it established that stress and uncertainty eliminate the desire to be parents, or is that just my intuition?",
       'The baby boo

In [None]:
## save data for posterity!!
valid_submission_question_data.to_csv('science_submission_question_data.gz', sep='\t', compression='gzip', index=False)

In [2]:
import pandas as pd
valid_submission_question_data = pd.read_csv('science_submission_question_data.gz', sep='\t', compression='gzip')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
## TODO: mine previous history for N=10000 commenters; extract location + age + gender (?)
from tqdm import tqdm
import numpy as np
np.random.seed(123)
from psaw import PushshiftAPI
pushshift_api = PushshiftAPI()
N_commenters = 10000
N_previous_comments = 1000
sample_question_author_data = valid_submission_question_data.sort_values(['author_comment', 'created_utc_comment'], ascending=True).drop_duplicates('author_comment').loc[:, ['author_comment', 'created_utc_comment']]
sample_question_author_data = sample_question_author_data.sample(N_commenters, replace=False, random_state=123)
sample_question_author_prior_data = []
author_filter_cols = ['body', 'id', 'created_utc', 'author', 'subreddit']
for idx_i, data_i in tqdm(sample_question_author_data.iterrows()):
    author_i = data_i.loc['author_comment']
    time_i = int(data_i.loc['created_utc_comment'])
    prior_comments_i = list(pushshift_api.search_comments(author=author_i, limit=N_previous_comments, before=time_i, filter=author_filter_cols))
    prior_comments_i = pd.DataFrame(prior_comments_i)
    if('d_' in prior_comments_i.columns):
        prior_comments_i.drop('d_', axis=1, inplace=True)
    sample_question_author_prior_data.append(prior_comments_i)
sample_question_author_prior_data = pd.concat(sample_question_author_prior_data)

3099it [6:11:34,  6.67s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

10000it [20:27:42,  7.37s/it]


In [5]:
sample_question_author_prior_data.to_csv('science_submission_question_reply_author_data.gz', sep='\t', compression='gzip', index=False)

In [10]:
## reload
import pandas as pd
valid_submission_question_data = pd.read_csv('science_submission_question_data.gz', sep='\t', compression='gzip')
sample_question_author_prior_data = pd.read_csv('science_submission_question_reply_author_data.gz', sep='\t', compression='gzip')
sample_question_author_prior_data.dropna(subset=['body'], inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
pd.set_option('display.max_colwidth', 1000)
display(sample_question_author_prior_data.head())

Unnamed: 0,author,body,created_utc,id,subreddit,created
0,mahogany555,"You're not even Scottish, are you?",1582310896,fiayfoz,ScottishPeopleTwitter,1582329000.0
1,mahogany555,"'The people of Scotland' don't spell it that way, losers seeking attention on reddit do.",1582298322,fiacrvb,ScottishPeopleTwitter,1582316000.0
2,mahogany555,"Firstly, most of these people probably aren't Scots. Secondly the ones writing this are loser millennials doing it to look cool on reddit...\n\nPlease tell me normal scottish people don't do this and would thoroughly mock anyone who would.",1582291726,fia35ak,ScottishPeopleTwitter,1582310000.0
3,mahogany555,"No, I definitely don't. I just stick to the 80000000 existing words that make up the language I'm writing in. Crazy, I know...",1582258926,fi99f50,ScottishPeopleTwitter,1582277000.0
4,mahogany555,Do they that's super interesting...\n\nHere's the word 'dog' from the dictionary of that 'language'\n\nhttps://dsl.ac.uk/results/dog,1582258153,fi98eh0,ScottishPeopleTwitter,1582276000.0


In [13]:
## check for self-disclosure statements
import re
self_disclosure_matcher = re.compile(f'((I\'m|I am) a)|(I live in)|(I am from)')
print(self_disclosure_matcher.search('I live in MI'))
tmp = sample_question_author_prior_data[sample_question_author_prior_data.loc[:, 'body'].apply(lambda x: self_disclosure_matcher.search(x) is not None)]
display(tmp.loc[:, ['body']].head(10))

<re.Match object; span=(0, 9), match='I live in'>


Unnamed: 0,body
95,"This is only something I realized 2 days ago but out of nowhere I gained 18lbs.\n\nI’m really happy! I am a super underweight (29yo, 5’8) male and have never been able to gain weight. I’ve been so hungry this week and am constantly eating. \n\nI’ve been around 90-100lbs for years and weighed my self and am pushing 119lbs. LOL I know how stupid this sounds, but for me it is a fun fact.\n\nI just hope my metabolism hasn’t permanently ghosted me."
436,"is it Hardwon or Magnus from Adventure Zone? They are basically the same....buff human bearded fighters. I am also guessing Hardwon though, since his beard is so dwarvish."
540,"i have a feeling that part is a work in progress, and I am guessing it will also hinge on an actual MLB ruling.\n\nEven before this there were weird things showing up, like the choice to include Ross Barnes' 1876 as official and part of the top batting averages, but his 1873 was not included. I am assuming it was because one is for the National League and one is for the National Association, but the difference in plate appearances in those two seasons is two.\n\nThe stat nerd side of me sort of cringes at a lot of these, but another part of me wants old boomers who scream about tradition and complain about 'nerds ruining the game' to have to swallow Tetelo Vargas being the all-time batting average champ with his .471 in 30 games in 1943."
561,"The first time I closed a bank account:\n\nI needed some cash so I went to an ATM outside my bank to get $20 but the minimum was $40. Fine, it said, not realizing I had about $39.50 in my checking. Get receipt and see I am in the negative. No problem, I can just use the ATM to transfer some money from my savings. I do that, moving $20 from my savings.\n\nNext bill I am at -$4 and change because when I went negative it immediately charged me the overdraft...but didn't show it on the receipt, and the money I put on it was less than the fee.\n\nSo I went to the actual bank to argue my case and the teller tried to explain to me that they did me a favor for not charging me TWO (2) overdraft fees because I had gone negative then positive then back to negative. \n\nSo I did the only thing I could do, I closed both my savings and checking and went to a credit union."
670,yeah. I am a tiger fan and I have always hated the white sox. Then the last few years while they were building this fun young team I started to like watching them. Then they hired LaRussa. I HATE LaRussa more than I hate the White Sox.
682,The picture used here though is from the second episode when a horny disease hits the enterprise.\n\nSource: I just started watching TNG for the first time so I am an 'expert' on episodes 1 and 2.
748,"Maybe I am a bit cynical. But using your definition we can clearly see we are not at the point you think we are.\n\nThis good young core? We have 5 top prospects. Two are pitchers in their first full seasons and two are batters, one who is still only 19 and basically didn't play last season and the other just finished his first professional spring training. Where would you expect them to be?\n\nWhat we do have are a bunch of AAA of AAAA talents who are holding roster spots until we get to that next point.\n\n\nlooking at this current roster I cannot imagine thinking we should be good, or average even, at hitting."
812,"Yeah, there seems to be backlash all over when it comes to the changing of traditional gender norms and being more open with your feelings, but if the result is a generation of men who can deal with their emotions in a healthy way and not just feel alone then I am all for it. But that is sort of what seems to be lost in these types of debates."
813,I live in Boston. It did rain sort of hard for a bit...but they could have played.
820,Personally I am a steroid apologist so personally I wouldn't lose my shit. But do you think r/baseball would be this nuanced if he suddenly went .320/.400//.550?


In [14]:
COPULA_LEMMA = 'be'
EXIST_LEMMA = 'live'
GENDER_MATCHER = re.compile('^(man|woman|male|female)$')
AGE_NUM_MATCHER = re.compile('[0-9]+')
def collect_propn(token):
    loc_noun_parts = [token]
    location_children = list(filter(lambda x: x.pos_=='PROPN', token.children))
    while(len(location_children) > 0):
        loc_noun_part = location_children.pop()
        loc_noun_parts.append(loc_noun_part)
        location_children += list(filter(lambda x: x.pos_=='PROPN', loc_noun_part.children))
    # sort noun parts
    loc_noun_parts = list(sorted(loc_noun_parts, key=lambda x: x.idx))
    loc_noun = ' '.join(list(map(lambda x: x.lemma_, loc_noun_parts)))
    return loc_noun
def extract_self_statement_targets(sent, pipeline):
    parse = pipeline(sent)
    identity_attributes = []
    parse_sents = list(parse.doc.sents)
    for parse_sent in parse_sents:
        for token in parse_sent:
            # get children nouns for "I" via root
            if(token.lemma_ == 'I' and token.dep_ == 'nsubj'):
                token_root_ancestors = list(filter(lambda x: x.dep_=='ROOT', token.ancestors))
    #             print(f'parse ents = {list(map(lambda x: x.label_, parse.ents))}')
                loc_ents = list(filter(lambda x: x.label_=='GPE', parse.ents))
                if(len(token_root_ancestors) > 0):
                    token_root = token_root_ancestors[0]
                    root_children = list(token_root.children)
                    if(token_root.lemma_ == COPULA_LEMMA):
                        for child in root_children:
                            if(child.dep_ == 'attr'):
                                # gender
                                gender_match = GENDER_MATCHER.match(child.lemma_)
                                if(gender_match is not None):
                                    identity_attributes.append(['gender', gender_match.group(0)])
                                # age => NOPE false positives abound
#                                 age_match = AGE_NUM_MATCHER.match(child.lemma_)
#                                 if(age_match is not None):
#                                     identity_attributes.append(['age', age_match.group(0)])
                            # age
                            elif(child.dep_ == 'acomp'):
                                if(child.lemma_ == 'old'):
                                    # look for children ("30 years old")
                                    age_children_1 = list(child.children)
                                    if(len(age_children_1) > 0 and age_children_1[0].lemma_=='year'):
                                        age_children_2 = list(age_children_1[0].children)
                                        if(len(age_children_2) > 0):
                                            age_match = AGE_NUM_MATCHER.match(age_children_2[0].lemma_)
                                            if(age_match is not None):
                                                identity_attributes.append(['age', age_match.group(0)])
                            # location
                            elif(child.dep_ == 'prep' and child.lemma_ == 'from'):
                                location_children_1 = list(child.children)
                                if(len(location_children_1) > 0 and location_children_1[0].pos_ == 'PROPN'):
                                    ent_start = location_children_1[0].i
    #                                 child_1_idx = location_children_1[0].idx
    #                                 child_1_ent = 
    #                                 main_loc = location_children_1[0]
    #                                 loc_noun = collect_propn(main_loc)
                                    # find ENT that contains child
    #                                 print(f'ent start = {ent_start}')
    #                                 print(f'{[(x.start, x.end) for x in loc_ents]}')
                                    containing_loc_ents = list(filter(lambda x: x.start <= ent_start and x.end >= ent_start, loc_ents))
    #                                 print(f'containing loc ents {containing_loc_ents}')
                                    if(len(containing_loc_ents) > 0):
                                        loc_noun = containing_loc_ents[0].text
                                        identity_attributes.append(['location', loc_noun])
                    # "I live in the US"
                    elif(token_root.lemma_ == EXIST_LEMMA):
                        root_prep_children = list(filter(lambda x: x.lemma_=='in' and x.dep_=='prep', token_root.children))
                        if(len(root_prep_children) > 0):
                            prep_children_2 = list(filter(lambda x: x.pos_ == 'PROPN', root_prep_children[0].children))
                            if(len(prep_children_2) > 0):
                                ent_start = prep_children_2[0].i
                                containing_loc_ents = list(filter(lambda x: x.start <= ent_start and x.end >= ent_start, loc_ents))
                                if(len(containing_loc_ents)):
                                    loc_noun = containing_loc_ents[0].text
                                    identity_attributes.append(['location', loc_noun])
    return identity_attributes

In [15]:
import re
# dumb
# self_statement = '(I\'m|I am)'
# age_matcher = f'({self_statement} [0-9]+ years old)|({self_statement} a [0-9]+ yo )'
# gender_matcher = f'({self_statement} a[ a-zA-Z]? (man|woman|male|female))'
# smart => parse then extract
import spacy
nlp_pipeline = spacy.load('en_core_web_sm')
# gender_dep = attr ("I'm a man")
# age_dep = acomp ("I'm 50 years old")
# loc_dep = prep ("I live in Michigan")
test_sents = [
    'I am 30 years old',
    'I am a woman',
    'I live in London, England',
]
for sent in test_sents:
    sent_attr = extract_self_statement_targets(sent, nlp_pipeline)
    print(f'sent = {sent} has attr {sent_attr}')

sent = I am 30 years old has attr [['age', '30']]
sent = I am a woman has attr [['gender', 'woman']]
sent = I live in London, England has attr [['location', 'London']]


OK! We have an extremely brittle attribute extraction pipeline.

In [None]:
## extract all attributes!!
# from tqdm import tqdm
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=10, progress_bar=True)
# tqdm.pandas()ai 
sample_question_author_prior_data = sample_question_author_prior_data.assign(**{
    'identity_attributes' : sample_question_author_prior_data.loc[:, 'body'].parallel_apply(lambda x: extract_self_statement_targets(x, nlp_pipeline))
})

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=680400), Label(value='0 / 680400')…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [107]:
question_author_attribute_data = sample_question_author_prior_data[sample_question_author_prior_data.loc[:, 'identity_attributes'].apply(lambda x: len(x) > 0)]
# get one line per author
flat_author_attribute_data = []
for author_i, data_i in tqdm(question_author_attribute_data.groupby('author')):
    attr_data_i = []
    for idx_j, data_j in data_i.iterrows():
        # keep track of dates!!
        date_j = data_j.loc['created_utc']
        for attr_k, val_k in data_j.loc['identity_attributes']:
            attr_data_i.append({
                'author' : author_i,
                'date' : date_j,
                'attr' : attr_k,
                'val' : val_k
            })
    attr_data_i = pd.DataFrame(attr_data_i)
    attr_data_i.sort_values(['attr', 'date'], inplace=True, ascending=False)
    attr_data_i = attr_data_i.drop_duplicates(['attr'], keep='first').drop('date', axis=1)    
    attr_data_i = attr_data_i.pivot(index='author', columns=['attr'], values=['val']).reset_index()
    attr_data_i.columns = list(map(lambda x: x[0] if x[1]=='' else x[1], attr_data_i.columns))
    flat_author_attribute_data.append(attr_data_i)
flat_author_attribute_data = pd.concat(flat_author_attribute_data, axis=0)
display(flat_author_attribute_data.head())

100%|██████████| 98/98 [00:00<00:00, 149.88it/s]


Unnamed: 0,author,location,gender,age
0,0wnzl1f3,Canada,,
0,A4_Ts,California,,
0,Born2Rune,UK,male,
0,Bwanatumbo,,man,3.0
0,Check_My_Dubs_Friend,,woman,


In [108]:
all_attr = ['location', 'gender', 'age']
for attr_i in all_attr:
    print(flat_author_attribute_data.loc[:, attr_i].value_counts().head(10))

Canada           6
US               4
UK               3
California       2
Vancouver        2
Texas            2
America          2
Sweden           1
Oklahoma City    1
SF               1
Name: location, dtype: int64
man       20
woman      9
male       3
female     2
Name: gender, dtype: int64
3      1
10     1
33     1
500    1
24     1
2      1
22     1
1      1
69     1
Name: age, dtype: int64


Let's look at the label distribution.

In [None]:
## TODO: taxonomy of clarification questions