# Sample and label comment questions
Now that we've collected comments from some advice subreddits, let's label them to determine if they are valid questions.

### Load data

In [8]:
import pandas as pd
import os
data_dir = '../../data/reddit_data/'
comment_data_files = list(filter(lambda x: x.startswith('subreddit_comments'), os.listdir(data_dir)))
comment_data_files = list(map(lambda x: os.path.join(data_dir, x), comment_data_files))
## load all data
import json
import gzip
def load_json_data(data_file):
    data = []
    for l_i in gzip.open(data_file, 'rt'):
        data_i = json.loads(l_i.strip())
        data.append(data_i)
    data = pd.DataFrame(data)
    return data
comment_data = pd.concat(list(map(lambda x: load_json_data(x), comment_data_files)), axis=0)
display(comment_data.head())
print(f'{comment_data.shape[0]} comments')

Unnamed: 0,author,author_flair_text,author_fullname,body,created_utc,edited,id,parent_id,score,subreddit
0,grumpypantsoldman,,t2_27ps6lxw,NTA. I think you dodged a bullet. Who needs a ...,1541030405,False,e8tkic1,t3_9t3n27,14,AmItheAsshole
1,unknown_salmon,,t2_q57txtp,I do feel for you. Do you think you could be b...,1541030426,False,e8tkj26,t3_9t1u2e,2,Advice
2,vld-s,,t2_14p3sr,YNTA. He made you uncomfortable and you distan...,1541030467,False,e8tkkjt,t3_9t3xz3,3,AmItheAsshole
3,Prepperpoints2Ponder,,t2_11lpfa,Manufacturing here. Me and spouse will be payi...,1541030470,False,e8tkkns,t3_9t2e8e,2,personalfinance
4,juliej891,,t2_1mwezovi,I think you’re going to go out of your way to ...,1541030493,False,e8tklij,t3_9t4afp,1,Advice


1510689 comments


In [23]:
## add submission data lololol
# submission_data = load_json_data('../../data/reddit_data/subreddit_submissions_2018-01_2019-12.gz')
submission_data.rename(columns={'id' : 'parent_id'}, inplace=True)
display(submission_data.head())

Unnamed: 0,author,author_flair_text,created_utc,edited,parent_id,num_comments,score,selftext,subreddit,title,category,author_fullname
0,deepsouthsloth,,1514764840,False,7nby0l,7,1,26M/married/2 kids\n\nEmployer match is 50% up...,personalfinance,Should I continue with 401k despite terrible e...,,
1,CapableCounteroffer,,1514764890,False,7nby5t,5,0,"On November 24th, I called AT&amp;T to inquire...",legaladvice,[FL] Issue getting AT&amp;T to pay early termi...,,
2,pinkcrayon69,,1514764948,False,7nbybf,9,3,I live in south OC but I need to move out of m...,personalfinance,I need to move out in a month. What should I p...,,
3,bobshellby,Needs 64bit Windows...,1514765040,False,7nbykz,6,0,Are there keycaps for the Microsoft wireless k...,pcmasterrace,Keyboard keycap help,,
4,j0sh135742,,1514765064,1514765420.0,7nbyno,4,0,"So in MGL Part 1, Title 15, Chapter 94G, Secti...",legaladvice,Quick question about Medical Marijuana.,,


In [49]:
# cleanup, etc.
# fix parent ID
comment_data = comment_data.assign(**{'parent_id' : comment_data.loc[:, 'parent_id'].apply(lambda x: x.split('_')[-1])})
# fix edited names
submission_data.rename(columns={'edited' : 'submission_edited'}, inplace=True)
comment_data.rename(columns={'edited' : 'comment_edited'}, inplace=True)
comment_submission_data = pd.merge(submission_data.loc[:, ['title', 'selftext', 'parent_id', 'submission_edited']], 
                                   comment_data, on='parent_id')
# remove edited submissions
comment_submission_data = comment_submission_data.assign(**{'submission_edited' : comment_submission_data.loc[:, 'submission_edited'].apply(lambda x: x if type(x) is bool else x > 0)})
comment_submission_data = comment_submission_data[~comment_submission_data.loc[:, 'submission_edited']]
display(comment_submission_data.head())

Unnamed: 0,title,selftext,parent_id,submission_edited,author,author_flair_text,author_fullname,body,created_utc,comment_edited,id,score,subreddit,comment_questions,valid_question
0,Should I continue with 401k despite terrible e...,26M/married/2 kids\n\nEmployer match is 50% up...,7nby0l,False,slalomz,,,You'd get slightly lower expense ratios in an ...,1514765131,False,ds0nchz,3,personalfinance,[],False
1,Should I continue with 401k despite terrible e...,26M/married/2 kids\n\nEmployer match is 50% up...,7nby0l,False,DaveAlot,,,Tax-advantaged investing beats taxable investi...,1514765137,False,ds0ncoa,4,personalfinance,[How much do you want to invest per year and h...,False
2,Should I continue with 401k despite terrible e...,26M/married/2 kids\n\nEmployer match is 50% up...,7nby0l,False,20000to0,,,Tax-advantage accounts are KING.\n\nIt does su...,1514765436,False,ds0nlee,3,personalfinance,[],False
3,[FL] Issue getting AT&amp;T to pay early termi...,"On November 24th, I called AT&amp;T to inquire...",7nby5t,False,lucasrva,,,&gt;I asked if I needed to trade in my phones ...,1514765548,False,ds0noqx,5,legaladvice,[],False
4,[FL] Issue getting AT&amp;T to pay early termi...,"On November 24th, I called AT&amp;T to inquire...",7nby5t,False,swalsh411,Quality Contributor,,That isn't even legal mumbo jumbo buried in a ...,1514766103,False,ds0o4yq,6,legaladvice,[],False


### Extract questions

In [50]:
from data_helpers import extract_questions_all_data
comment_submission_data = comment_submission_data.assign(**{
    'comment_questions' : extract_questions_all_data(comment_submission_data.loc[:, 'body'].values)
})
# check for stand-alone questions
comment_submission_data = comment_submission_data.assign(**{
    'valid_question' : comment_submission_data.apply(lambda x: len(x.loc['comment_questions']) == 1 and x.loc['comment_questions'][0]==x.loc['body'], axis=1)
})

In [51]:
## how many comments have at least one question
print(f'{comment_submission_data[comment_submission_data.loc[:, "comment_questions"].apply(lambda x: len(x)>0)].shape[0]}/{comment_data.shape[0]} comments with at least one question')

202349/1510689 comments with at least one question


In [52]:
question_data = comment_submission_data[comment_submission_data.loc[:, 'valid_question']]
print(f'{question_data.shape[0]}/{comment_submission_data.shape[0]} valid questions')
print(f'subreddit distribution\n{question_data.loc[:, "subreddit"].value_counts()}')

14593/940911 valid questions
subreddit distribution
personalfinance    5253
legaladvice        3765
pcmasterrace       3358
Advice             1717
AmItheAsshole       500
Name: subreddit, dtype: int64


Let's see a sample of questions and determine how valid they are.

In [59]:
import numpy as np
np.random.seed(123)
sample_size = 50
sample_data = []
for subreddit_i, data_i in question_data.groupby('subreddit'):
#     print(f'sample questions for subreddit {subreddit_i}')
    sample_idx_i = np.random.choice(data_i.index, sample_size, replace=False)
    sample_data_i = data_i.loc[sample_idx_i, :]
    sample_data.append(sample_data_i.loc[:, ['parent_id', 'id', 'subreddit', 'title', 'selftext', 'body']])
sample_data = pd.concat(sample_data, axis=0)
sample_data_file = '../../data/reddit_data/sample_advice_subreddit_questions.tsv'
sample_data.to_csv(sample_data_file, sep='\t', index=False)

Now that we've sampled the data, let's take the sub-sample of valid questions and send them to AMT for crowdsourced validation.

In [74]:
label_sample_data = pd.read_csv('../../data/reddit_data/sample_advice_subreddit_question_labels.tsv', sep='\t', index_col=False)
display(label_sample_data.head())

Unnamed: 0,parent_id,id,subreddit,title,selftext,body,question_is_relevant,question_is_clarification,submission_contains_answer,submission_can_include_question_answer
0,9hh9hv,e6bv7f2,Advice,"My brother, who abused me up until i was 14, w...","So, like the title says, my brother wants to t...",Do you want to forgive and forget or are you c...,1,1,0,1
1,9ag3ri,e4w8nbk,Advice,GF is pissed because I brought my best friends...,Basically my GF is freaking out on me and i do...,Were you going to tell her and how much time d...,1,1,0,1
2,9l8d4z,e74tijj,Advice,Fresh peppers,"Hi everyone! I hate fresh peppers green, red a...",Why do you wanna force yourself to do somethin...,1,1,0,1
3,9dqali,e5jf0y2,Advice,Should I buy a 1000$ pc for my best friend so ...,My best friend and I have been together ever s...,Have you considered buying it then have him ma...,1,0,0,1
4,87mh3y,dwdzcz7,Advice,Just wondering why it is when you get older yo...,I’m a (41m) and used to have a ton of close fr...,"Good to know, but would you be honest and say ...",1,0,0,1


In [75]:
valid_label_sample_data = label_sample_data[(label_sample_data.loc[:, 'question_is_relevant']==1) & 
                                            (label_sample_data.loc[:, 'question_is_clarification']==1) & 
                                            (label_sample_data.loc[:, 'submission_contains_answer']==0) & 
                                            (label_sample_data.loc[:, 'submission_can_include_question_answer']==1)]
print(f'{valid_label_sample_data.shape[0]}/{label_sample_data.shape[0]} valid questions')
## get per-subreddit counts
per_subreddit_valid_label_sample_data_counts = valid_label_sample_data.loc[:, 'subreddit'].value_counts() / label_sample_data.loc[:, 'subreddit'].value_counts()
per_subreddit_valid_label_sample_data_counts.sort_values(inplace=True, ascending=False)
print(per_subreddit_valid_label_sample_data_counts)
## output sample to file for labeling
import numpy as np
np.random.choice(123)
crowdsource_label_data = []
samples_per_subreddit = 10
for subreddit_i, data_i in valid_label_sample_data.groupby('subreddit'):
    idx_i = np.random.choice(data_i.index, samples_per_subreddit, replace=False)
    data_i = data_i.loc[idx_i, :]
    data_i.rename(columns={'selftext' : 'text', 'body' : 'question'}, inplace=True)
    crowdsource_label_data.append(data_i.loc[:, ['parent_id', 'title', 'id', 'text', 'question']])
crowdsource_label_data = pd.concat(crowdsource_label_data, axis=0)
display(crowdsource_label_data.head())
crowdsource_label_data.to_csv('../../data/reddit_data/advice_subreddit_question_data_for_crowdsource.csv', sep=',', index=False)

154/250 valid questions
legaladvice        0.88
pcmasterrace       0.66
personalfinance    0.60
Advice             0.54
AmItheAsshole      0.40
Name: subreddit, dtype: float64


Unnamed: 0,parent_id,title,id,text,question
39,a17fm4,Help motivate me PLEASE,eanh7xc,This is a little bit pathetic but I’m just sat...,How long will it take you to do your homework?
0,9hh9hv,"My brother, who abused me up until i was 14, w...",e6bv7f2,"So, like the title says, my brother wants to t...",Do you want to forgive and forget or are you c...
17,9ul9hg,Should I move to Florida with my dad or stay h...,e9571n3,Live in NYC metro area and go to college in up...,"In the other hand, you’re an adult and will mo..."
49,94o4v8,Stepped On a Rusty Nail at Work Today,e3mi4c7,"While I was working a yard, I stepped on somet...",Why does your title say stepped on today if it...
9,8zpq1u,Chronically ill and stuck on what to do about ...,e2kkpvv,"Hello, r/advice! I wanted to take a shot and s...",mind if i ask what disability you've been diag...
