# Data processing for the Pointer-Generator model

1. Generate a CSV file with headers: content, title.
2. Tokenize data.
3. Process into .bin and vocab files.
4. Lastly, split train.bin, val.bin and test.bin into chunks of 1000 examples each.
The generated .bin files (either the single files or the chunked ones) can be used as input to the Pointer-Generator model.

In [2]:
import sys
import os
import hashlib
import struct
import subprocess
import collections
import tensorflow as tf
from tensorflow.core.example import example_pb2
import nltk
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import time
from tqdm import tqdm

In [3]:
# Load the cleaned WikiHow CSV and pull the two columns used below into plain Python lists.
wikihow = pd.read_csv(filepath_or_buffer="data/clean_wikihow_pointer.csv")
summaries = wikihow.loc[:, 'summary'].tolist()
texts = wikihow.loc[:, 'text'].tolist()

In [4]:
# Keep only the (text, summary) pairs in which neither side is a float
# (missing values come back from pandas as float NaN).
# The '[:-1]' slice drops the last character of each string, which removes
# the trailing ';' of the raw text.
contents = [
    text[:-1]
    for text, summary in zip(texts, summaries)
    if not (isinstance(text, float) or isinstance(summary, float))
]
titles = [
    summary[:-1]
    for text, summary in zip(texts, summaries)
    if not (isinstance(text, float) or isinstance(summary, float))
]

# Preview the first two examples: title first, then its content.
for idx in (0, 1):
    print(titles[idx])
    print(contents[idx])
    print()

sell yourself first
before doing anything else, stop and sum up yourself as an artist. now, think about how to translate that to an online profile. be it the few words, twitter allows you or an entire page of indulgence that your own website would allow you. bring out the most salient features of your creativity, your experience, your passion, and your reasons for painting. make it clear to readers why you are an artist who loves art, produces high quality art, and is a true champion of art. if you are not great with words, find a friend who can help you with this really important aspect of selling online – the establishment of your credibility and reliability.

read the classics before 1600
reading the classics is the very first thing you have to do to be well-read. if you want to build a solid foundation for your understanding of the books you read, then you cannot avoid some of the earliest plays, poems, and oral tales ever written down. remember that the novel did not really get po

In [5]:
print(len(contents), len(titles))
# Slice of the cleaned pairs that gets exported to dataset.csv: rows [si, ei).
# train_end / val_end are offsets counted back from the END of the dataset
# that is re-read from dataset.csv (N rows); the split cell uses them as:
#   train: rows [0, N - train_end)
#   val:   rows [N - train_end, N - val_end)
#   test:  rows [N - val_end, N)
si, ei = 0, 504000
train_end = 2000
val_end = 1000

1212012 1212012


In [6]:
# Export the selected [si, ei) slice as a two-column CSV (content, title)
# and show the first rows of the exported frame.
dic = dict(content=contents[si:ei], title=titles[si:ei])
df = pd.DataFrame(data=dic)
df.to_csv(path_or_buf='dataset.csv', index=False)
df.head()

Unnamed: 0,content,title
0,"before doing anything else, stop and sum up yo...",sell yourself first
1,reading the classics is the very first thing y...,read the classics before 1600
2,depending on what scale you intend to sell you...,join online artist communities
3,get yourself out there as best as you can by a...,make yourself public
4,"given the hundreds of free blogging websites, ...",blog about your artwork


## Code used to tokenize and generate bin

In [7]:
dm_single_close_quote = u'\u2019' # unicode right single quotation mark
dm_double_close_quote = u'\u201d' # unicode right double quotation mark
# NOTE: fix_missing_period compares only the LAST CHARACTER of a line against this
# list, so the multi-character entry '...' can never match on its own ('.' covers it).
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence

# We use these to separate the summary sentences in the .bin datafiles
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

# Placeholders only: these are reassigned to ranges of example ids in the
# split cell before write_to_bin is called.
all_train_urls = ""
all_val_urls = ""
all_test_urls = ""

tokenized_stories_dir = "wikihow_tokenized" # folder that receives one <id>.tok file per example
finished_files_dir = "wikihow_finished_files" # final output: train/val/test .bin files and the vocab file
chunks_dir = os.path.join(finished_files_dir, "chunked") # folder that receives the chunked .bin files

VOCAB_SIZE = 200000 # maximum number of most-common words written to the vocab file
CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data


In [8]:
def chunk_file(set_name, in_dir=None, out_dir=None, chunk_size=None):
  """Split `<in_dir>/<set_name>.bin` into numbered chunk files.

  The input is a sequence of records, each an 8-byte native 'q' length
  followed by that many payload bytes. Records are copied unchanged into
  `<out_dir>/<set_name>_NNN.bin` files holding at most `chunk_size` records.
  A chunk file is only created when there is at least one record to put in
  it, so no empty trailing chunk is produced when the number of records is
  an exact multiple of `chunk_size` (and none at all for an empty input).

  Args:
    set_name: split name, e.g. 'train'; selects the input file and names the chunks.
    in_dir: directory containing the .bin file; defaults to finished_files_dir.
    out_dir: existing directory receiving the chunks; defaults to chunks_dir.
    chunk_size: maximum records per chunk; defaults to CHUNK_SIZE.

  Raises:
    ValueError: if chunk_size is smaller than 1.
    struct.error: if the input ends in the middle of a record.
  """
  # Resolve defaults at call time so the notebook-level settings are honoured.
  in_dir = finished_files_dir if in_dir is None else in_dir
  out_dir = chunks_dir if out_dir is None else out_dir
  chunk_size = CHUNK_SIZE if chunk_size is None else chunk_size
  if chunk_size < 1:
    raise ValueError("chunk_size must be at least 1, got %r" % (chunk_size,))

  in_file = os.path.join(in_dir, '%s.bin' % set_name)
  chunk = 0
  # 'with' guarantees the reader is closed, also when a record is malformed.
  with open(in_file, "rb") as reader:
    len_bytes = reader.read(8)  # header of the next record, b'' at EOF
    while len_bytes:
      chunk_fname = os.path.join(out_dir, '%s_%03d.bin' % (set_name, chunk)) # new chunk
      with open(chunk_fname, 'wb') as writer:
        for _ in range(chunk_size):
          str_len = struct.unpack('q', len_bytes)[0]
          # unpack doubles as a length check: a short read raises struct.error
          example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
          writer.write(len_bytes)
          writer.write(example_str)
          # Read ahead so we know whether another record (and chunk) is needed.
          len_bytes = reader.read(8)
          if not len_bytes:
            break
      chunk += 1

def chunk_all():
  """Ensure the chunk directory exists, then chunk the train, val and test .bin files."""
  # Directory that will hold the chunk files
  chunk_dir_exists = os.path.isdir(chunks_dir)
  if not chunk_dir_exists:
    os.mkdir(chunks_dir)

  # One pass per dataset split
  split_names = ('train', 'val', 'test')
  for split in split_names:
    print("Splitting %s data into chunks..." % split)
    chunk_file(split)
  print("Saved chunked data in %s" % chunks_dir)

def tokenize_stories(reviews, tokenized_stories_dir):
  """Tokenize every (content, title) row with NLTK and write one .tok file per row.

  The file for row index `i` is `<tokenized_stories_dir>/<i>.tok` and has
  three lines, tokens separated by single spaces:

      <content tokens>
      @highlight
      <title tokens>

  This is the line-per-sentence layout get_art_abs consumes. Writing one
  token per line instead makes get_art_abs treat every token as a sentence:
  fix_missing_period then appends " ." to almost every token and each title
  token is wrapped in its own <s> ... </s> pair.

  Args:
    reviews: DataFrame with string columns "content" and "title".
    tokenized_stories_dir: existing directory that receives the .tok files.
  """
  # word_tokenize renders double quotes as `` and ''; both are dropped.
  quote_tokens = ('``', "''")

  def tokenize(text):
    """Return the NLTK word tokens of `text` without quote tokens."""
    return [str(tok) for tok in nltk.word_tokenize(text) if tok not in quote_tokens]

  for row_id, row in tqdm(reviews.iterrows(), total=reviews.shape[0]):
    filename = str(row_id) + '.tok'
    story_lines = [
      " ".join(tokenize(row["content"])),
      "@highlight",
      " ".join(tokenize(row["title"])),
    ]
    with open(os.path.join(tokenized_stories_dir, filename), 'w', encoding="utf-8") as tok_file:
      tok_file.write("\n".join(story_lines))
  print ("Successfully finished tokenizing to %s .\n" % (tokenized_stories_dir))

def fix_missing_period(line):
  """Return `line` with " ." appended unless it needs no terminator.

  A line is returned untouched when it contains "@highlight", when it is
  empty, or when its last character is one of END_TOKENS.
  """
  is_marker = "@highlight" in line
  if is_marker or line == "" or line[-1] in END_TOKENS:
    return line
  return line + " ."

def read_text_file(text_file):
  """Read a UTF-8 text file and return its lines with surrounding whitespace stripped."""
  with open(text_file, "r", encoding="utf-8") as handle:
    return [raw_line.strip() for raw_line in handle]

def get_art_abs(story_file):
  """Split a tokenized story file into its article and tagged abstract.

  Every line is lowercased and passed through fix_missing_period. Empty
  lines are ignored. Lines before the first line starting with "@highlight"
  form the article; every later non-marker line is a summary sentence.

  Returns:
    (article, abstract): the article lines joined by single spaces, and the
    summary sentences each wrapped in SENTENCE_START / SENTENCE_END and
    joined by single spaces.
  """
  # Lowercase, then terminate lines that lack an end-of-sentence token
  prepared = [fix_missing_period(raw.lower()) for raw in read_text_file(story_file)]

  body_sentences = []
  summary_sentences = []
  in_summary = False  # flips to True at the first @highlight marker and stays there
  for sentence in prepared:
    if sentence == "":
      continue
    if sentence.startswith("@highlight"):
      in_summary = True
      continue
    target = summary_sentences if in_summary else body_sentences
    target.append(sentence)

  # Article: one flat string
  article = ' '.join(body_sentences)

  # Abstract: one flat string with <s> ... </s> around each summary sentence
  tagged = ["%s %s %s" % (SENTENCE_START, sent, SENTENCE_END) for sent in summary_sentences]
  abstract = ' '.join(tagged)

  return article, abstract

def write_to_bin(file_names, out_file, makevocab=False):
  """Serialize tokenized stories into a length-prefixed .bin file of tf.Example protos.

  For every id `x` in `file_names` the file `<tokenized_stories_dir>/x.tok`
  is parsed with get_art_abs, and its article and abstract are stored as the
  'article' and 'abstract' byte features of a tf.Example. Each serialized
  example is written as an 8-byte 'q' length followed by the bytes.

  Args:
    file_names: iterable of example ids (anything str() can render).
    out_file: path of the .bin file to create.
    makevocab: when True, also count article and abstract tokens (without the
      sentence tags) and write the VOCAB_SIZE most common ones, one
      "word count" pair per line, to `<finished_files_dir>/vocab`.

  Raises:
    FileNotFoundError: if a tokenized story file is missing. Without this the
      loop would either hit an unbound `story_file` or silently serialize
      the previous example a second time. `out_file` may be left partially
      written in that case.
  """
  story_fnames = [str(s)+".tok" for s in file_names]

  if makevocab:
    vocab_counter = collections.Counter()

  with open(out_file, 'wb') as writer:
    for s in tqdm(story_fnames):
      # Locate the tokenized story file for this id
      story_file = os.path.join(tokenized_stories_dir, s)
      if not os.path.isfile(story_file):
        raise FileNotFoundError("Couldn't find tokenized story file %s in tokenized story directory %s. Was there an error during tokenization?" % (s, tokenized_stories_dir))

      # Get the strings to write to .bin file
      article, abstract = get_art_abs(story_file)

      # Write to tf.Example
      tf_example = example_pb2.Example()
      tf_example.features.feature['article'].bytes_list.value.extend([article.encode('utf-8')])
      tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode('utf-8')])
      tf_example_str = tf_example.SerializeToString()
      str_len = len(tf_example_str)
      writer.write(struct.pack('q', str_len))
      writer.write(struct.pack('%ds' % str_len, tf_example_str))

      # Update the vocab counts, if applicable
      if makevocab:
        art_tokens = article.split(' ')
        abs_tokens = abstract.split(' ')
        abs_tokens = [t for t in abs_tokens if t not in [SENTENCE_START, SENTENCE_END]] # remove these tags from vocab
        tokens = art_tokens + abs_tokens
        tokens = [t.strip() for t in tokens] # strip
        tokens = [t for t in tokens if t!=""] # remove empty
        vocab_counter.update(tokens)
  print ("Finished writing file %s\n" % out_file)

  # write vocab to file
  if makevocab:
    print ("Writing vocab file...")
    with open(os.path.join(finished_files_dir, "vocab"), 'w', encoding="utf-8") as writer:
      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')
    print ("Finished writing vocab file")

def check_num_stories(stories_dir, num_expected):
  """Raise an Exception unless `stories_dir` holds exactly `num_expected` directory entries."""
  found = len(os.listdir(stories_dir))
  if found == num_expected:
    return
  raise Exception("stories directory %s contains %i files but should contain %i" % (stories_dir, found, num_expected))

# Generate tokens

In [9]:
# Folder that contains dataset.csv.
# NOTE: machine-specific absolute path; point it at the folder holding your dataset.csv.
stories_dir =  r"C:\Training\cs224n\proj\sum"

# Create the output directories (no-op when they already exist)
os.makedirs(tokenized_stories_dir, exist_ok=True)
os.makedirs(finished_files_dir, exist_ok=True)

# The input is a CSV with two columns (content, title).
# Build the path with os.path.join: the literal "\dataset.csv" contains the
# invalid escape sequence "\d" and hard-codes the Windows separator.
reviews_csv = os.path.join(stories_dir, "dataset.csv")

# Keep the two relevant columns, drop rows with a missing value, and renumber
# the rows so the .tok file names are the contiguous ids 0..N-1 that the
# train/val/test split ranges are built from.
reviews = (
    pd.read_csv(reviews_csv)
      .filter(['content', 'title'])
      .dropna()
      .reset_index(drop=True)
)

# Run the NLTK tokenizer on both text and summary, writing to the tokenized stories directory
tokenize_stories(reviews, tokenized_stories_dir)

100%|█████████████████████████████████████████████████████████████████████████| 503999/503999 [42:20<00:00, 198.38it/s]

Successfully finished tokenizing to wikihow_tokenized .






In [10]:
#to get the length of your dataset
num_expected_stories =reviews.shape[0]
all_train_urls = range(0,num_expected_stories-train_end)
all_val_urls = range(num_expected_stories-train_end, num_expected_stories-val_end)
all_test_urls = range(num_expected_stories-val_end,num_expected_stories)

# Read the tokenized stories, do a little postprocessing then write to bin files
write_to_bin(all_test_urls, os.path.join(finished_files_dir, "test.bin"))
write_to_bin(all_val_urls, os.path.join(finished_files_dir, "val.bin"))
write_to_bin(all_train_urls, os.path.join(finished_files_dir, "train.bin"), makevocab=True)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1225.76it/s]
 11%|████████▍                                                                    | 110/1000 [00:00<00:00, 1091.61it/s]

Finished writing file wikihow_finished_files\test.bin



100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1164.54it/s]
  0%|                                                                                       | 0/501999 [00:00<?, ?it/s]

Finished writing file wikihow_finished_files\val.bin



100%|████████████████████████████████████████████████████████████████████████| 501999/501999 [08:03<00:00, 1038.65it/s]


Finished writing file wikihow_finished_files\train.bin

Writing vocab file...
Finished writing vocab file


In [11]:
# Chunk the data. This splits each of train.bin, val.bin and test.bin
# into smaller chunks of up to CHUNK_SIZE (1000) examples each and saves
# them in chunks_dir (wikihow_finished_files/chunked).
chunk_all()

Splitting train data into chunks...
Splitting val data into chunks...
Splitting test data into chunks...
Saved chunked data in wikihow_finished_files\chunked
