# Compute data summary statistics
Now that we've organized and cleaned the question data, let's compute some statistics.

In [7]:
# Load the question (comment) table and attach each parent post's body text.
import pandas as pd
# data = pd.read_csv('../../data/reddit_data/combined_data_question_data.gz', sep='\t', compression='gzip')
comment_data = pd.read_csv(
    '../../data/reddit_data/advice_subreddit_filter_comment_question_data.gz',
    sep='\t',
    compression='gzip',
)
# Only the post id and body are read; they are renamed so the id lines up with
# the `parent_id` join key used below.
post_data = pd.read_csv(
    '../../data/reddit_data/subreddit_submissions_2018-01_2019-12.gz',
    sep='\t',
    compression='gzip',
    index_col=False,
    usecols=['id', 'selftext'],
).rename(columns={'id' : 'parent_id', 'selftext' : 'article_text'})
# Left join: every question row is kept, whether or not its post was found.
data = comment_data.merge(post_data, on=['parent_id'], how='left')
print(f'{len(data)} total questions')
print(f'{data["parent_id"].nunique()} total posts')

734967 total questions
272467 total posts


## Text statistics

In [8]:
# Tokenize post bodies and questions with NLTK's WordPunctTokenizer.
# (Earlier alternative, kept for reference: the saved BART tokenizer.)
# import torch
# model_tokenizer = torch.load('../../data/reddit_data/BART_tokenizer.pt')
from nltk.tokenize import WordPunctTokenizer
word_tokenizer = WordPunctTokenizer()
# Each new column holds the list of tokens produced for the matching text column.
data = data.assign(
    post_tokens=data['article_text'].apply(word_tokenizer.tokenize),
    question_tokens=data['question'].apply(word_tokenizer.tokenize),
)

In [9]:
import numpy as np
# Token counts per row, as plain numpy arrays.
post_len = data['post_tokens'].map(len).values
question_len = data['question_tokens'].map(len).values
# Report mean +/- standard deviation for both text fields.
for text_kind, token_counts in (('post', post_len), ('question', question_len)):
    print(f'mean {text_kind} length = {np.mean(token_counts)} +/- {np.std(token_counts)}')

mean post length = 304.5176422887014 +/- 220.76208617123916
mean question length = 13.887307865523214 +/- 8.07912311345707


## Author data
What % of the data includes metadata about authors?

In [69]:
from datetime import datetime
author_data = pd.read_csv(
    '../../data/reddit_data/author_data/combined_author_prior_comment_data.gz',
    sep='\t',
    compression='gzip',
)
# fix date var: parse the 'YYYY-MM-DD' strings, then store the parsed values
# in an object-dtype Series before writing them back to the column
parsed_date_bins = author_data['date_day_bin'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
author_data = author_data.assign(date_day_bin=pd.Series(parsed_date_bins.values, dtype='object'))
# author_data = author_data.assign(**{'date_day_bin' : author_data.loc[:, 'date_day_bin'].apply(lambda x: x.to_pydatetime()).values})
display(author_data.head())

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,author,date_day,subreddit,expert_pct,relative_time,expert_pct_bin,relative_time_bin,age,location,location_region,date_day_bin,text_embed,subreddit_embed
0,Battleboy43,2018-06-02,pcmasterrace,0.0,78340.0,0.0,0.0,-1.0,UNK,UNK,2018-01-01 00:00:00,,
1,m4xk0,2018-07-13,pcmasterrace,0.0,69552.0,0.0,0.0,-1.0,UNK,UNK,2018-07-01 00:00:00,,
2,hAbadabadoo22,2019-02-06,AmItheAsshole,0.0,8878.0,0.0,0.0,-1.0,UNK,UNK,2019-01-01 00:00:00,,
3,haysu-christo,2018-11-04,personalfinance,0.0,86011.0,0.0,1.0,-1.0,UNK,UNK,2018-07-01 00:00:00,,
4,haysu-christo,2018-11-05,personalfinance,0.0,74117.0,0.0,0.0,-1.0,UNK,UNK,2018-07-01 00:00:00,,


In [11]:
# Authors that appear at least once in the author table.
valid_authors = set(author_data['author'].unique())
# Questions whose author is in that set.
has_author_record = data['author'].isin(valid_authors)
valid_author_data = data.loc[has_author_record]
valid_author_pct = valid_author_data.shape[0] / data.shape[0] * 100
print(f'{valid_author_pct}% questions can be connected to an author with some kind of data')

64.78739861789713% questions can be connected to an author with some kind of data


Does author coverage differ between the discrete and continuous representations?

In [12]:
continuous_vars = ['text_embed', 'subreddit_embed']
# Keep author rows where at least one embedding column is non-null
# (how='all' drops only the rows in which every listed column is missing).
author_rows_with_embed = author_data.dropna(subset=continuous_vars, how='all')
valid_continuous_var_authors = set(author_rows_with_embed['author'].unique())
has_continuous_author = data['author'].isin(valid_continuous_var_authors)
continuous_var_author_data = data.loc[has_continuous_author]
continuous_var_pct = continuous_var_author_data.shape[0] / data.shape[0] * 100
print(f'{continuous_var_pct}% questions can be connected to an author with a continuous representation')

8.82883177067814% questions can be connected to an author with a continuous representation


## Text+author data
What is the representation of authors in the text data?

We want the percentage of comments covered by each reader group and by each embedding type.

In [10]:
# ## get author+text data from cleaned generation data
import pandas as pd
import torch
# Columns carried over from the generation datasets.
author_vars = ['reader_token_str', 'author_has_subreddit_embed', 'author_has_text_embed', 'article_id']
train_data = torch.load('../../data/reddit_data/combined_data_train_data.pt')
# NOTE: `test_data` is loaded from the *val* split file.
test_data = torch.load('../../data/reddit_data/combined_data_val_data.pt')
# convert to dataframe because it's easier to combine
train_data_df = train_data.data.to_pandas()
test_data_df = test_data.data.to_pandas()
# Stack the author columns of both splits row-wise.
post_author_data = pd.concat(
    [split_df[author_vars] for split_df in (train_data_df, test_data_df)],
    axis=0,
)

In [12]:
## add subreddit data
# Read only the post id and subreddit; rename the id to the `article_id` join key.
submission_data = pd.read_csv(
    '../../data/reddit_data/subreddit_submissions_2018-01_2019-12.gz',
    sep='\t',
    compression='gzip',
    usecols=['id', 'subreddit'],
).rename(columns={'id' : 'article_id'})
# Guard so re-running the cell does not merge the subreddit column twice.
if 'subreddit' not in post_author_data.columns:
    post_author_data = post_author_data.merge(submission_data, on='article_id', how='left')

In [16]:
sample_pct = 0.25
## reader groups
def get_reader_group_counts(data):
    """Summarise reader-group and embedding coverage for a set of rows.

    Reads the `reader_token_str`, `author_has_subreddit_embed` and
    `author_has_text_embed` columns of `data`, plus the module-level
    `sample_pct`.

    Returns a DataFrame with columns `reader_group_pct` and
    `reader_group_count`:
      * one row per distinct `reader_token_str` value (sorted by label), where
        the pct is that value's share of all counted values and the count is
        the observed count divided by `sample_pct`;
      * followed by `subreddit_embed` and `text_embed` rows, where the pct is
        the flag sum divided by the number of rows in `data` and the count is
        the flag sum divided by `sample_pct`.
    """
    out_cols = ['reader_group_pct', 'reader_group_count']

    # --- reader-group rows ---
    token_freq = data['reader_token_str'].value_counts()
    group_rows = pd.concat(
        [token_freq / token_freq.sum(), token_freq / sample_pct],
        axis=1,
        keys=out_cols,
    ).sort_index()

    # --- embedding rows ---
    embed_totals = pd.Series(
        [data['author_has_subreddit_embed'].sum(), data['author_has_text_embed'].sum()],
        index=['subreddit_embed', 'text_embed'],
    )
    embed_rows = pd.concat(
        [embed_totals / data.shape[0], embed_totals / sample_pct],
        axis=1,
        keys=out_cols,
    )

    return pd.concat([group_rows, embed_rows], axis=0)

In [17]:
# Coverage table over all rows of post_author_data (no per-subreddit split).
reader_count_data = get_reader_group_counts(post_author_data)
print(reader_count_data)

                          reader_group_pct  reader_group_count
<EXPERT_PCT_0_AUTHOR>             0.252125            257188.0
<EXPERT_PCT_1_AUTHOR>             0.010391             10600.0
<NONUS_AUTHOR>                    0.012811             13068.0
<RESPONSE_TIME_0_AUTHOR>          0.089918             91724.0
<RESPONSE_TIME_1_AUTHOR>          0.172598            176064.0
<US_AUTHOR>                       0.018638             19012.0
UNK                               0.443518            452424.0
subreddit_embed                   0.094902             96808.0
text_embed                        0.097137             99088.0


In [18]:
## per-subreddit coverage
# Repeat the coverage table separately for each subreddit's rows.
for subreddit_i, data_i in post_author_data.groupby('subreddit'):
    print(f'**** subreddit={subreddit_i} ****')
    # Percentages are relative to this subreddit's rows only.
    reader_count_data_i = get_reader_group_counts(data_i)
    print(reader_count_data_i)

**** subreddit=Advice ****
                          reader_group_pct  reader_group_count
<EXPERT_PCT_0_AUTHOR>             0.321220             44480.0
<EXPERT_PCT_1_AUTHOR>             0.019441              2692.0
<NONUS_AUTHOR>                    0.022329              3092.0
<RESPONSE_TIME_0_AUTHOR>          0.094142             13036.0
<RESPONSE_TIME_1_AUTHOR>          0.246519             34136.0
<US_AUTHOR>                       0.023109              3200.0
UNK                               0.273239             37836.0
subreddit_embed                   0.159021             22020.0
text_embed                        0.163383             22624.0
**** subreddit=AmItheAsshole ****
                          reader_group_pct  reader_group_count
<EXPERT_PCT_0_AUTHOR>             0.179153             74448.0
<EXPERT_PCT_1_AUTHOR>             0.007912              3288.0
<NONUS_AUTHOR>                    0.011541              4796.0
<RESPONSE_TIME_0_AUTHOR>          0.103553             43

- `LOCATION`: skewed representation in `personalfinance`
- `EXPERT`: fewer "experts" in `AmItheAsshole`, `personalfinance`
- `RESPONSE`: more "short" responders in `AmItheAsshole` (more first-time posters?)
- `embeds`: less embed coverage in `AmItheAsshole`, `personalfinance` (more first-time posters?)

In [19]:
# buggy code to convert raw data to post+author data
# import pandas as pd
# from datetime import datetime
# import pytz
# post_author_data = pd.read_csv('../../data/reddit_data/combined_data_question_data.gz', 
#                                sep='\t', index_col=False, compression='gzip',
#                                usecols=['article_id', 'created_utc', 'id', 'author'])
# post_author_data = post_author_data.assign(**{'date' : post_author_data.loc[:, 'created_utc'].apply(lambda x: datetime.fromtimestamp(x, tz=pytz.utc).replace(tzinfo=None))})
# ## convert to date day bins
# from importlib import reload
# import data_helpers
# reload(data_helpers)
# from data_helpers import assign_date_bin
# import numpy as np
# author_date_bins = author_data.loc[:, 'date_day_bin'].unique()
# author_date_bins = np.array(list(map(lambda x: x.timestamp(), author_date_bins)))
# post_author_data = post_author_data.assign(**{
#     'date_day_bin' : post_author_data.loc[:, 'date'].apply(lambda x: assign_date_bin(x.timestamp(), author_date_bins, convert_timezone=False))
# })
# display(post_author_data.head())
# dynamic_author_vars = ['relative_time_bin', 'expert_pct_bin', 'text_embed', 'subreddit_embed']
# static_author_vars = ['location_region']
# combined_author_post_data = post_author_data.copy()
# for dynamic_author_var_i in dynamic_author_vars:
#     combined_author_post_data = pd.merge(combined_author_post_data, author_data.loc[:, [dynamic_author_var_i, 'author', 'date_day_bin']], on=['author', 'date_day_bin'], how='left')
# for static_var_i in static_author_vars:
#     combined_author_post_data = pd.merge(combined_author_post_data, author_data.loc[:, [static_author_var_i, 'author']], on='author', how='left')

### Debug: joining with embedding data

Why are we losing so many of the embeddings?

In [11]:
## where are all the embeddings going?
# Debug cell: rebuild the author <-> embedding join from scratch so the number
# of authors that keep an embedding can be inspected afterwards.
from datetime import datetime
# Start from the author table without its existing embedding columns.
tmp_author_data = author_data.drop(['subreddit_embed', 'text_embed'], axis=1)
# Parse the 'YYYY-MM-DD' day strings into (naive) datetime values.
tmp_author_data = tmp_author_data.assign(**{
    'date_day' : tmp_author_data.loc[:, 'date_day'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
})
from ast import literal_eval
from importlib import reload
import data_helpers
# Reload so edits to data_helpers.py are picked up without restarting the kernel.
reload(data_helpers)
from data_helpers import assign_date_bin
author_embeddings_data_file = '../../data/reddit_data/author_data/author_date_embeddings_type=subreddit.gz'
author_embeddings_data = pd.read_csv(author_embeddings_data_file, sep='\t', compression='gzip', index_col=False)
# First column whose name ends in '_embed' is taken as the embedding column.
embed_var = list(filter(lambda x: x.endswith('_embed'), author_embeddings_data.columns))[0]
# Embeddings are stored as Python-literal strings; parse them back into objects.
author_embeddings_data = author_embeddings_data.assign(**{embed_var : author_embeddings_data.loc[:, embed_var].apply(lambda x: literal_eval(x))})
# Parse the embedding table's bin strings into (naive) datetime values.
author_embeddings_data = author_embeddings_data.assign(**{'date_day_bin' : author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))})
## join w/ author data via date
# Distinct embedding bins as POSIX timestamps.
embedding_date_bins = author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: x.timestamp()).unique()
# Assign every author row to one of the embedding bins via the project helper.
# NOTE(review): the type-inspection output further down shows these assigned bins
# printing with a +00:00 offset while the embedding table's bins print without
# one -- confirm whether that mismatch is why the merge below drops embeddings.
tmp_author_data = tmp_author_data.assign(**{
    'date_day_bin' : tmp_author_data.loc[:, 'date_day'].apply(lambda x: assign_date_bin(x.timestamp(), embedding_date_bins))
})
# Left join keeps every author row; rows with no (author, bin) match get a null embedding.
tmp_author_data = pd.merge(tmp_author_data, author_embeddings_data.loc[:, ['author', 'date_day_bin', embed_var]], on=['author', 'date_day_bin'], how='left')

In [24]:
# Unique authors that still have a non-null subreddit embedding after the left merge,
# versus unique authors present in the embedding file.
print(f'{tmp_author_data.dropna(subset=["subreddit_embed"], axis=0).loc[:, "author"].nunique()}/{author_embeddings_data.loc[:, "author"].nunique()} authors retained')

22148/55609 authors retained


In [72]:
# Compare the first join-key value (and its type) on each side of the merge.
# display(tmp_author_data.loc[:, 'date_day_bin'].head())
# display(author_embeddings_data.loc[:, 'date_day_bin'].head())
# x: bin assigned to the first author row; y: first bin in the embedding table.
x = tmp_author_data.loc[:, 'date_day_bin'].iloc[0]
y = author_embeddings_data.loc[:, 'date_day_bin'].iloc[0]
print(x)
print(type(x))
print(y)
print(type(y))

2018-01-01 00:00:00+00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2018-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [68]:
# Sanity-check assign_date_bin on a single known date.
# `datetime` is imported here so the cell does not depend on an earlier cell's import.
from datetime import datetime
from importlib import reload
import data_helpers
# Reload so edits to data_helpers.py are picked up without restarting the kernel.
reload(data_helpers)
from data_helpers import assign_date_bin
# Distinct embedding bins as POSIX timestamps.
embedding_date_bin = author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: x.timestamp()).unique()
test_date = datetime.strptime('2018-03-01', '%Y-%m-%d')
# Naive datetime -> POSIX timestamp.
test_date = test_date.timestamp()
# print(dir(test_date))
test_bin_date = assign_date_bin(test_date, embedding_date_bin)
# Print the array that was actually passed to assign_date_bin. The original printed
# `embedding_date_bins`, a variable defined in a different cell.
print(embedding_date_bin)
print(test_bin_date)

[1.5147648e+09 1.5304032e+09 1.5463008e+09 1.5619392e+09]
2018-01-01 00:00:00+00:00


In [78]:
# Round-trip the assigned bin through a 'YYYY-MM-DD' string; the recorded result
# is a datetime without tzinfo.
datetime.strptime(test_bin_date.strftime('%Y-%m-%d'), '%Y-%m-%d')

datetime.datetime(2018, 1, 1, 0, 0)

In [80]:
from pytz import utc
# First bin value from the embedding table.
test_date_bin = author_embeddings_data.loc[:, 'date_day_bin'].iloc[0]
print(test_date_bin)
# Convert to a POSIX timestamp, read it back as UTC, then drop tzinfo;
# the recorded output shows the value comes back unchanged.
print(datetime.fromtimestamp(test_date_bin.timestamp(), tz=utc).replace(tzinfo=None))

2018-01-01 00:00:00
2018-01-01 00:00:00


In [58]:
# Scratch cell: read the bin's POSIX timestamp back with no tz argument.
# from datetime import timezone
# NOTE: tzinfo and timedelta are imported but not used anywhere in this cell.
from datetime import tzinfo
from datetime import timedelta
# timezone(datetime.timedelta(seconds=0))
# NOTE(review): the recorded TypeError ("timestamp() takes no keyword arguments")
# does not match this line, which passes no keyword argument -- it presumably
# came from an earlier revision of the cell; confirm by re-running.
datetime.fromtimestamp(test_date_bin.timestamp())

TypeError: timestamp() takes no keyword arguments

In [60]:
# Leftover interactive lookup of the fromtimestamp signature (timestamp[, tz]).
help(datetime.fromtimestamp)

Help on built-in function fromtimestamp:

fromtimestamp(...) method of builtins.type instance
    timestamp[, tz] -> tz's local time from POSIX timestamp.

