# Compute data summary statistics
Now that we've organized and cleaned the question data, let's compute some statistics.

In [9]:
# Load the combined question table (gzip-compressed, tab-separated) and
# report its size: one row per question, posts identified by `article_id`.
import pandas as pd
data = pd.read_csv(
    '../../data/reddit_data/combined_data_question_data.gz',
    sep='\t',
    compression='gzip',
)
print(f'{len(data)} total questions')
print(f'{data["article_id"].nunique()} total posts')

170418 total questions
92934 total posts


## Text statistics

In [2]:
# Show which columns are available in the question table before computing
# any text statistics on them.
data.columns

Index(['article_id', 'article_text', 'title', 'created_utc', 'subreddit', 'id',
       'question', 'author'],
      dtype='object')

In [6]:
# Tokenize the post text and the question text with NLTK's WordPunctTokenizer.
# NOTE: an earlier version of this cell loaded a saved BART tokenizer via
# torch.load('../../data/reddit_data/BART_tokenizer.pt'); that path is
# disabled here in favour of the NLTK tokenizer.
from nltk.tokenize import WordPunctTokenizer
word_tokenizer = WordPunctTokenizer()
# Add one token-list column per text column; the bound `tokenize` method is
# applied element-wise to each string.
data = data.assign(
    post_tokens=data['article_text'].apply(word_tokenizer.tokenize),
    question_tokens=data['question'].apply(word_tokenizer.tokenize),
)

In [7]:
import numpy as np
# Number of tokens in every post and in every question, as NumPy arrays.
post_len = data['post_tokens'].apply(len).values
question_len = data['question_tokens'].apply(len).values
# Summarize each length distribution as mean +/- standard deviation
# (np.std with its default arguments).
print(f'mean post length = {np.mean(post_len)} +/- {np.std(post_len)}')
print(f'mean question length = {np.mean(question_len)} +/- {np.std(question_len)}')

mean post length = 342.83490593716624 +/- 227.4783819969454
mean question length = 14.729934631318288 +/- 8.995959519572068


## Author data
What percentage of the questions were written by an author for whom we have author metadata?

In [10]:
# Load the per-author prior-comment table (gzip-compressed, tab-separated)
# and preview its first rows.
author_data = pd.read_csv(
    '../../data/reddit_data/author_data/combined_author_prior_comment_data.gz',
    sep='\t',
    compression='gzip',
)
display(author_data.head(n=5))

Unnamed: 0,author,date_day,subreddit,expert_pct,relative_time,expert_pct_bin,relative_time_bin,age,location,location_region,date_day_bin,text_embed,subreddit_embed
0,Battleboy43,2018-06-02,pcmasterrace,0.0,78340,0,0,-1,UNK,UNK,2018-01-01,,
1,m4xk0,2018-07-13,pcmasterrace,0.0,69552,0,0,-1,UNK,UNK,2018-07-01,,
2,hAbadabadoo22,2019-02-06,AmItheAsshole,0.0,8878,0,0,-1,UNK,UNK,2019-01-01,,
3,haysu-christo,2018-11-04,personalfinance,0.0,86011,0,1,-1,UNK,UNK,2018-07-01,,
4,haysu-christo,2018-11-05,personalfinance,0.0,74117,0,0,-1,UNK,UNK,2018-07-01,,


In [12]:
# Every author name that appears at least once in the author table.
valid_authors = set(author_data['author'].unique())
# Keep only the questions whose author is one of those names.
valid_author_data = data[data['author'].isin(valid_authors)]
# Share of all questions that survive the filter, as a percentage.
print(f'{len(valid_author_data)/len(data)*100}% questions can be connected to an author with some kind of data')

43.24015068830757% questions can be connected to an author with some kind of data
