# Compute data summary statistics
Now that we've organized and cleaned the question data, let's compute some statistics.

In [1]:
# load data
import pandas as pd
# data = pd.read_csv('../../data/reddit_data/combined_data_question_data.gz', sep='\t', compression='gzip')
comment_data = pd.read_csv('../../data/reddit_data/advice_subreddit_filter_comment_question_data.gz', sep='\t', compression='gzip')
post_data = pd.read_csv('../../data/reddit_data/subreddit_submissions_2018-01_2019-12.gz', sep='\t', compression='gzip', index_col=False, usecols=['id', 'selftext'])
post_data.rename(columns={'selftext' : 'article_text', 'id' : 'parent_id'}, inplace=True)
data = pd.merge(comment_data, post_data, on=['parent_id'], how='left')
print(f'{data.shape[0]} total questions')
print(f'{data.loc[:, "parent_id"].nunique()} total posts')

734967 total questions
272467 total posts


## Text statistics

In [2]:
# load tokenizer
# import torch
# model_tokenizer = torch.load('../../data/reddit_data/BART_tokenizer.pt')
from nltk.tokenize import WordPunctTokenizer
word_tokenizer = WordPunctTokenizer()
data = data.assign(**{
    'post_tokens' : data.loc[:, 'article_text'].apply(lambda x: word_tokenizer.tokenize(x)),
    'question_tokens' : data.loc[:, 'question'].apply(lambda x: word_tokenizer.tokenize(x)),
})

In [3]:
import numpy as np
post_len = data.loc[:, "post_tokens"].apply(lambda x: len(x)).values
question_len = data.loc[:, "question_tokens"].apply(lambda x: len(x)).values
print(f'mean post length = {np.mean(post_len)} +/- {np.std(post_len)}')
print(f'mean question length = {np.mean(question_len)} +/- {np.std(question_len)}')

mean post length = 304.5176422887014 +/- 220.76208617123916
mean question length = 13.887307865523214 +/- 8.07912311345707


## Author data
What % of the data includes metadata about authors?

In [4]:
author_data = pd.read_csv('../../data/reddit_data/author_data/combined_author_prior_comment_data.gz', sep='\t', compression='gzip')
display(author_data.head())

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,author,date_day,subreddit,expert_pct,relative_time,expert_pct_bin,relative_time_bin,age,location,location_region,date_day_bin,text_embed,subreddit_embed
0,Battleboy43,2018-06-02,pcmasterrace,0.0,78340.0,0.0,0.0,-1.0,UNK,UNK,2018-01-01,,
1,m4xk0,2018-07-13,pcmasterrace,0.0,69552.0,0.0,0.0,-1.0,UNK,UNK,2018-07-01,,
2,hAbadabadoo22,2019-02-06,AmItheAsshole,0.0,8878.0,0.0,0.0,-1.0,UNK,UNK,2019-01-01,,
3,haysu-christo,2018-11-04,personalfinance,0.0,86011.0,0.0,1.0,-1.0,UNK,UNK,2018-07-01,,
4,haysu-christo,2018-11-05,personalfinance,0.0,74117.0,0.0,0.0,-1.0,UNK,UNK,2018-07-01,,


In [5]:
valid_authors = set(author_data.loc[:, 'author'].unique())
valid_author_data = data[data.loc[:, 'author'].isin(valid_authors)]
print(f'{valid_author_data.shape[0]/data.shape[0]*100}% questions can be connected to an author with some kind of data')

64.78739861789713% questions can be connected to an author with some kind of data


Any difference for discrete and continuous representation?

In [6]:
continuous_vars = ['text_embed', 'subreddit_embed']
valid_continuous_var_authors = set(author_data.dropna(subset=continuous_vars, how='all').loc[:, 'author'].unique())
continuous_var_author_data = data[data.loc[:, 'author'].isin(valid_continuous_var_authors)]
print(f'{continuous_var_author_data.shape[0]/data.shape[0]*100}% questions can be connected to an author with a continuous representation')

8.82883177067814% questions can be connected to an author with a continuous representation


In [19]:
subreddit_embed_data = pd.read_csv('../../data/reddit_data/author_data/author_date_embeddings_type=subreddit.gz', sep='\t', compression='gzip', index_col=False)
subreddit_embed_data.dropna(subset=['subreddit_embed'], axis=0, inplace=True)
subreddit_embed_authors = subreddit_embed_data.loc[:, 'author'].unique()
print(f'{data[data.loc[:, "author"].isin(subreddit_embed_authors)].shape[0]/data.shape[0]*100}% questions can be connected to an author with a subreddit embedding')

8.757672113169706% questions can be connected to an author with a subreddit embedding


In [17]:
embed_author_data.head()

Unnamed: 0,author,date_day,subreddit,expert_pct,relative_time,expert_pct_bin,relative_time_bin,age,location,location_region,date_day_bin,text_embed,subreddit_embed
196,hek14,2019-07-12,AmItheAsshole,0.666667,85815,1,1,-1,UNK,UNK,2019-07-01,"[0.2900071144104004, 0.05066739022731781, -0.0...","[6.103153527299697, -0.2204921501912914, -0.25..."
197,hek14,2019-07-13,legaladvice,0.0,82345,0,1,-1,UNK,UNK,2019-07-01,"[0.2900071144104004, 0.05066739022731781, -0.0...","[6.103153527299697, -0.2204921501912914, -0.25..."
198,hek14,2019-07-25,AmItheAsshole,0.666667,86263,1,1,-1,UNK,UNK,2019-07-01,"[0.2900071144104004, 0.05066739022731781, -0.0...","[6.103153527299697, -0.2204921501912914, -0.25..."
221,IronHorse1776,2018-07-12,legaladvice,0.339623,86137,1,1,-1,UNK,UNK,2018-07-01,"[0.16993024945259094, 0.0948055163025856, -0.1...","[12.64313784030471, 10.702728258281661, 1.8678..."
236,olenavy,2018-11-05,personalfinance,0.62963,69755,1,0,-1,UNK,UNK,2018-07-01,"[0.32217082381248474, 0.43829774856567383, 0.0...","[16.89428571284433, 51.67949200775377, 45.1624..."


In [20]:
embed_author_data = pd.merge(author_data.drop('subreddit_embed', axis=1), subreddit_embed_data, on=['author', 'date_day_bin'], how='outer').dropna(subset=['subreddit_embed'], axis=0)
print(f'{data[data.loc[:, "author"].isin(embed_author_data.loc[:, "author"].unique())].shape[0]/data.shape[0]*100}% questions can be connected to an author with a subreddit embedding')

8.757672113169706% questions can be connected to an author with a subreddit embedding


In [13]:
len(set(subreddit_embed_authors) - set(author_data.loc[:, 'author'].unique()))

8494

### Debug: joining with embedding data

Why are we losing so many of the embeddings?

In [11]:
## where are all the embeddings going?
from datetime import datetime
tmp_author_data = author_data.drop(['subreddit_embed', 'text_embed'], axis=1)
tmp_author_data = tmp_author_data.assign(**{
    'date_day' : tmp_author_data.loc[:, 'date_day'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
})
from ast import literal_eval
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import assign_date_bin
author_embeddings_data_file = '../../data/reddit_data/author_data/author_date_embeddings_type=subreddit.gz'
author_embeddings_data = pd.read_csv(author_embeddings_data_file, sep='\t', compression='gzip', index_col=False)
embed_var = list(filter(lambda x: x.endswith('_embed'), author_embeddings_data.columns))[0]
author_embeddings_data = author_embeddings_data.assign(**{embed_var : author_embeddings_data.loc[:, embed_var].apply(lambda x: literal_eval(x))})
author_embeddings_data = author_embeddings_data.assign(**{'date_day_bin' : author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))})
## join w/ author data via date
embedding_date_bins = author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: x.timestamp()).unique()
tmp_author_data = tmp_author_data.assign(**{
    'date_day_bin' : tmp_author_data.loc[:, 'date_day'].apply(lambda x: assign_date_bin(x.timestamp(), embedding_date_bins))
})
tmp_author_data = pd.merge(tmp_author_data, author_embeddings_data.loc[:, ['author', 'date_day_bin', embed_var]], on=['author', 'date_day_bin'], how='left')

In [24]:
print(f'{tmp_author_data.dropna(subset=["subreddit_embed"], axis=0).loc[:, "author"].nunique()}/{author_embeddings_data.loc[:, "author"].nunique()} authors retained')

22148/55609 authors retained


In [72]:
# display(tmp_author_data.loc[:, 'date_day_bin'].head())
# display(author_embeddings_data.loc[:, 'date_day_bin'].head())
x = tmp_author_data.loc[:, 'date_day_bin'].iloc[0]
y = author_embeddings_data.loc[:, 'date_day_bin'].iloc[0]
print(x)
print(type(x))
print(y)
print(type(y))

2018-01-01 00:00:00+00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2018-01-01 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [68]:
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import assign_date_bin
embedding_date_bin = author_embeddings_data.loc[:, 'date_day_bin'].apply(lambda x: x.timestamp()).unique()
test_date = datetime.strptime('2018-03-01', '%Y-%m-%d')
test_date = test_date.timestamp()
# print(dir(test_date))
test_bin_date = assign_date_bin(test_date, embedding_date_bin)
print(embedding_date_bins)
print(test_bin_date)

[1.5147648e+09 1.5304032e+09 1.5463008e+09 1.5619392e+09]
2018-01-01 00:00:00+00:00


In [78]:
datetime.strptime(test_bin_date.strftime('%Y-%m-%d'), '%Y-%m-%d')

datetime.datetime(2018, 1, 1, 0, 0)

In [80]:
from pytz import utc
test_date_bin = author_embeddings_data.loc[:, 'date_day_bin'].iloc[0]
print(test_date_bin)
print(datetime.fromtimestamp(test_date_bin.timestamp(), tz=utc).replace(tzinfo=None))

2018-01-01 00:00:00
2018-01-01 00:00:00


In [58]:
# from datetime import timezone
from datetime import tzinfo
from datetime import timedelta
# timezone(datetime.timedelta(seceonds=0))
datetime.fromtimestamp(test_date_bin.timestamp())

TypeError: timestamp() takes no keyword arguments

In [60]:
help(datetime.fromtimestamp)

Help on built-in function fromtimestamp:

fromtimestamp(...) method of builtins.type instance
    timestamp[, tz] -> tz's local time from POSIX timestamp.

