In [1]:
# !pip install -q tfds-nightly tensorflow matplotlib

In [23]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd



In [3]:
# !pip install tensorflow-datasets

In [4]:
import tensorflow_datasets as tfds

In [7]:
# Load the 'long' configuration of reddit_tifu (the TL;DR is used as the
# summary) together with its DatasetInfo metadata.
ds, info = tfds.load(
    name='reddit_tifu/long',
    split='train',
    with_info=True,
)

In [8]:
# Print the dataset metadata returned by tfds.load(..., with_info=True):
# description, homepage, sizes and the feature spec.
print(info)

tfds.core.DatasetInfo(
    name='reddit_tifu',
    full_name='reddit_tifu/long/1.1.0',
    description="""
    Reddit dataset, where TIFU denotes the name of subbreddit /r/tifu.
    As defined in the publication, styel "short" uses title as summary and
    "long" uses tldr as summary.
    
    Features includes:
      - document: post text without tldr.
      - tldr: tldr line.
      - title: trimmed title without tldr.
      - ups: upvotes.
      - score: score.
      - num_comments: number of comments.
      - upvote_ratio: upvote ratio.
    """,
    config_description="""
    Using TLDR as summary.
    """,
    homepage='https://github.com/ctr4si/MMN',
    data_path='C:\\Users\\lieuw\\tensorflow_datasets\\reddit_tifu\\long\\1.1.0',
    download_size=639.54 MiB,
    dataset_size=92.38 MiB,
    features=FeaturesDict({
        'documents': Text(shape=(), dtype=tf.string),
        'num_comments': tf.float32,
        'score': tf.float32,
        'title': Text(shape=(), dtype=tf.string),


In [9]:
# Display only the feature spec (field names and dtypes) from the metadata.
info.features

FeaturesDict({
    'documents': Text(shape=(), dtype=tf.string),
    'num_comments': tf.float32,
    'score': tf.float32,
    'title': Text(shape=(), dtype=tf.string),
    'tldr': Text(shape=(), dtype=tf.string),
    'ups': tf.float32,
    'upvote_ratio': tf.float32,
})

In [10]:
# List the names of the splits this dataset configuration provides.
print([split_name for split_name in info.splits.keys()])

['train']


In [11]:
# Convert the loaded split into a pandas DataFrame; passing the metadata lets
# TFDS use the feature spec when building the frame.
dataframe = tfds.as_dataframe(ds, ds_info=info)

In [33]:
# Preview the first row only; note that the text columns hold raw bytes (b'...').
dataframe.head(1)

Unnamed: 0,documents,num_comments,score,title,tldr,ups,upvote_ratio
0,"b""me and a friend decided to go to the beach l...",1.0,8.0,b'liking seafood',b'had delicious seafood. almost flooded a toil...,8.0,0.76


In [34]:
# The text columns of `dataframe` hold raw `bytes` (b'...'). Written unchanged,
# the CSV would contain their Python repr ("b'liking seafood'") rather than the
# text itself, so decode them first. The decoded copy is a separate frame:
# `dataframe` is left untouched for the other cells that use it.
text_columns = [
    column
    for column in dataframe.select_dtypes(include='object').columns
    if dataframe[column].map(lambda value: isinstance(value, bytes)).all()
]
dataframe_text = dataframe.assign(
    **{column: dataframe[column].str.decode('utf-8') for column in text_columns}
)

# Write a single zip archive 'out.zip' containing one member, 'out.csv'.
compression_opts = dict(method='zip', archive_name='out.csv')
dataframe_text.to_csv('out.zip', index=True, compression=compression_opts)

In [16]:
# Second row only (positional slice 1:2), restricted to the 'documents' column.
dataframe.iloc[1:2]['documents']

1    b'obligatory this happened last thursday. \n\n...
Name: documents, dtype: object


In [18]:
import json

# Read entire file: every non-empty line is parsed as one JSON document.
posts = []
# Pin the encoding. Without it open() falls back to the platform locale
# (e.g. cp1252 on Windows), which can mis-decode or reject UTF-8 text.
with open('tifu_all_tokenized_and_filtered.json', 'r', encoding='utf-8') as fp:
    for line in fp:
        # A blank line (such as a trailing newline at end of file) is not
        # valid JSON; skip it instead of letting json.loads raise.
        if not line.strip():
            continue
        posts.append(json.loads(line))

# Json entries (index 50000 is presumably just an arbitrary sample post)
print(posts[50000].keys())
# [u'title_tokenized',
#  u'permalink',
#  u'title',
#  u'url',
#  u'num_comments',
#  u'tldr',  # (optional)
#  u'created_utc',
#  u'trimmed_title_tokenized',
#  u'ups',
#  u'selftext_html',
#  u'score',
#  u'upvote_ratio',
#  u'tldr_tokenized',  # (optional)
#  u'selftext',
#  u'trimmed_title',
#  u'selftext_without_tldr_tokenized',
#  u'id',
#  u'selftext_without_tldr']

dict_keys(['title_tokenized', 'permalink', 'title', 'url', 'num_comments', 'tldr', 'created_utc', 'trimmed_title_tokenized', 'id', 'selftext_html', 'score', 'upvote_ratio', 'tldr_tokenized', 'selftext', 'trimmed_title', 'selftext_without_tldr_tokenized', 'ups', 'selftext_without_tldr'])


In [21]:
# Sanity check: each parsed line should have produced a dict.
type(posts[1])

dict

In [24]:
# Build one DataFrame row per parsed post; the dict keys become the columns.
df = pd.DataFrame(data=posts)

In [31]:
# Preview the first two posts to inspect the columns built from the JSON keys.
df.head(2)

Unnamed: 0,title_tokenized,permalink,title,url,num_comments,tldr,created_utc,trimmed_title_tokenized,id,selftext_html,score,upvote_ratio,selftext,trimmed_title,selftext_without_tldr_tokenized,ups,selftext_without_tldr,tldr_tokenized
0,"[tifu, by, forgetting, to, pull, my, underwear...",/r/tifu/comments/1ghd5r/tifu_by_forgetting_to_...,TIFU by forgetting to pull my underwear down b...,https://www.reddit.com/r/tifu/comments/1ghd5r/...,13,,1371426000.0,"[forgetting, to, pull, my, underwear, down, be...",1ghd5r,"<!-- SC_OFF --><div class=""md""><p>I was on Sky...",50,0.77,I was on Skype on my tablet as I went to the t...,forgetting to pull my underwear down before i ...,"[i, was, on, skype, on, my, tablet, as, i, wen...",50,i was on skype on my tablet as i went to the t...,
1,"[tifu, by, gender, stereotyping]",/r/tifu/comments/1ggydk/tifu_by_genderstereoty...,TIFU by gender-stereotyping,https://www.reddit.com/r/tifu/comments/1ggydk/...,23,confuse a 5th grade girl for a boy in front o...,1371412000.0,"[gender, stereotyping]",1ggydk,"<!-- SC_OFF --><div class=""md""><p>This actuall...",115,0.88,This actually happened a couple of years ago. ...,gender-stereotyping,"[this, actually, happened, a, couple, of, year...",115,this actually happened a couple of years ago. ...,"[confuse, a, 0th, grade, girl, for, a, boy, in..."


In [29]:
# Count the posts whose 'tldr' value is missing (NaN/None); the field is optional.
df['tldr'].isna().sum()

36965