In [2]:
import gzip
import hashlib
import base64
import csv
from tqdm import tqdm
import pandas as pd
import random

In [5]:
# Code from deepmind repo to read gzipped files and output hashes, dates, and base64 text
def get_year_hashes(year):
  """Stream the documents stored in one year's gzipped news archive.

  Every line of ``./WMTdata/zipped/news-docs.<year>.en.filtered.gz`` is decoded
  as UTF-8, stripped of surrounding whitespace and split on tabs into exactly
  three fields.

  Args:
    year: Year identifying the archive; it is converted with ``str``.

  Yields:
    ``(docid, (date, sentence_split_text, unsplit_text))`` where ``docid`` is
    the SHA-256 hex digest of the UTF-8 bytes of ``unsplit_text``. The text
    fields are yielded exactly as stored in the archive (not decoded here).

  Raises:
    ValueError: If a line does not contain exactly three tab-separated fields.
  """
  archive_path = f"./WMTdata/zipped/news-docs.{year!s}.en.filtered.gz"
  with gzip.open(archive_path, 'rb') as archive:
    for raw_line in archive:
      fields = raw_line.decode('utf-8').strip().split('\t')
      date, sentence_split_text, unsplit_text = fields
      digest = hashlib.sha256(unsplit_text.encode('utf-8'))
      yield digest.hexdigest(), (date, sentence_split_text, unsplit_text)

In [6]:
# Decode the base64-encoded sentence-split text for each year and write one
# tab-separated file per year with docid, date and decoded text columns.
# NOTE(review): later cells read decoded_splits_<year>.csv for 2014-2017 and for
# every year in 2012-2021, so this list was presumably edited between runs --
# confirm which years must be decoded for a clean Restart & Run All.
years = [2012, 2019, 2020, 2021]
for year in years:
  year_gen = get_year_hashes(year)
  splits_path = "./WMTdata/decoded_splits_" + str(year) + ".csv"
  # newline="" lets the csv module control line endings itself (required by the
  # csv documentation; without it rows end in "\r\r\n" on Windows). Explicit
  # UTF-8 keeps the file independent of the platform locale and matches the
  # default encoding pandas.read_csv uses when the file is read back.
  with open(splits_path, "w", newline="", encoding="utf-8") as splits_csv:
    splits_writer = csv.writer(splits_csv, delimiter="\t")
    splits_writer.writerow(["docid", "date", "sentence_split_text"])
    for docid, (date, sentence_split_text, _) in tqdm(year_gen):
      decoded_text = base64.b64decode(sentence_split_text).decode("utf-8")
      # Flatten line feeds so each document's text contains no "\n".
      splits_writer.writerow([docid, date, decoded_text.replace("\n", " ")])

758958it [01:18, 9674.68it/s] 
1258408it [02:43, 7685.48it/s] 
1606654it [03:30, 7636.96it/s] 
1555033it [03:15, 7955.33it/s] 


In [64]:
# Load the decoded per-year CSVs for the training date range and stack them into
# one DataFrame whose index is the first CSV column.
train_years = [2014, 2015, 2016, 2017]
splits = pd.concat([
  pd.read_csv(
    f"./WMTdata/decoded_splits_{train_year!s}.csv",
    delimiter="\t",
    header=0,
    index_col=0,
  )
  for train_year in tqdm(train_years)
])

In [None]:
# Create the training text file from the provided deepmind hashes: one document
# per output line, in the order the hashes appear in train_splits.txt.
# `splits` is indexed by docid, so rows (not columns) must be looked up via .loc.
train_texts = splits["sentence_split_text"]
# NOTE(review): docid is a hash of the document text, so repeated index entries
# are presumably the same document -- keep the first so .loc returns one string.
train_texts = train_texts[~train_texts.index.duplicated(keep="first")]

with open("./WMTdata/train_splits.txt", "r") as train_hashes, \
     open("./WMTdata/train.txt", "w", encoding="utf-8") as train_file:
  for line in train_hashes:
    # Whitespace split drops the trailing newline that split(" ") would leave
    # attached to the hash when it is the last field on the line.
    fields = line.split()
    if not fields:
      continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    # The hash is taken from the second field, as in the original cell.
    # NOTE(review): confirm this matches the layout of train_splits.txt.
    cur_hash = fields[1]
    # Raises KeyError if the hash is not among the loaded years.
    train_file.write(train_texts.loc[cur_hash] + "\n")

In [3]:
# Shared, seeded RNG: shuffle order is reproducible for a fixed sequence of
# splits_to_text calls, but depends on how many calls were made before.
rand = random.Random(123)

# Create a train data text file from a single year's csv splits
def splits_to_text(year, max_articles):
  """Write a shuffled sample of one year's articles to a text file.

  Reads ``./WMTdata/decoded_splits_<year>.csv``, shuffles its
  ``sentence_split_text`` column with the module-level ``rand`` and writes at
  most ``max_articles`` articles to ``./WMTdata/text_<year>.txt``. Each article
  is wrapped as ``<s>...</s>`` and no separator is written between articles.

  Args:
    year: Year whose decoded CSV is read; converted with ``str``.
    max_articles: Upper bound on the number of articles written. Values below
      one write an empty file.
  """
  fp = "./WMTdata/decoded_splits_" + str(year) + ".csv"
  year_df = pd.read_csv(fp, delimiter="\t", header=0, index_col=0)
  # pandas parses empty cells as NaN (a float), which would break the string
  # concatenation below, so rows without text are dropped up front.
  texts = year_df.sentence_split_text.dropna().tolist()
  rand.shuffle(texts)
  print("Total number of texts for year " + str(year) + ": " + str(len(texts)))

  # Slice instead of counting: the previous `count > max_articles` check only
  # stopped after max_articles + 1 articles had already been written.
  selected = texts[:max(max_articles, 0)]
  # Explicit UTF-8 so the output does not depend on the platform locale.
  with open("./WMTdata/text_" + str(year) + ".txt", "w", encoding="utf-8") as train_year_file:
    for article in tqdm(selected):
      train_year_file.write("<s>" + article + "</s>")

In [None]:
# Produce the per-year text file for every year from 2012 through 2021, passing
# the same article limit to each call.
first_year, last_year = 2012, 2021
article_limit = 200000
for year in range(first_year, last_year + 1):
  splits_to_text(year, article_limit)

In [None]:
import numpy as np

# Hard-coded metrics saved by hand. Author's note: the first run failed after
# training on 2012.
# NOTE(review): presumably these values were copied from that run's logs --
# confirm the source before relying on them.
fname = "./metrics/train-year-2012"
# Training summary kept for reference only; it is commented out and not saved:
#{'train_runtime': 7833.3512, 'train_samples_per_second': 6.778, 'train_steps_per_second': 0.053, 'train_loss': 3.276659389624849, 'epoch': 1.0}
# Evaluation metrics keyed by year, 2012 through 2021.
# NOTE(review): presumably the 2012-trained model evaluated on each year's data -- confirm.
metrics = {2012: {'eval_loss': 3.1113810539245605, 'eval_runtime': 266.36, 'eval_samples_per_second': 19.827, 'eval_steps_per_second': 1.243, 'epoch': 1.0},
2013: {'eval_loss': 3.2066714763641357, 'eval_runtime': 299.3471, 'eval_samples_per_second': 19.88, 'eval_steps_per_second': 1.243, 'epoch': 1.0},
2014: {'eval_loss': 3.1979284286499023, 'eval_runtime': 320.473, 'eval_samples_per_second': 19.874, 'eval_steps_per_second': 1.245, 'epoch': 1.0},
2015: {'eval_loss': 3.1861016750335693, 'eval_runtime': 326.2901, 'eval_samples_per_second': 19.875, 'eval_steps_per_second': 1.244, 'epoch': 1.0},
2016: {'eval_loss': 3.1606438159942627, 'eval_runtime': 340.249, 'eval_samples_per_second': 19.882, 'eval_steps_per_second': 1.243, 'epoch': 1.0},
2017: {'eval_loss': 3.324777364730835, 'eval_runtime': 391.9027, 'eval_samples_per_second': 19.877, 'eval_steps_per_second': 1.243, 'epoch': 1.0},
2018: {'eval_loss': 3.3093581199645996, 'eval_runtime': 379.2926, 'eval_samples_per_second': 19.876, 'eval_steps_per_second': 1.244, 'epoch': 1.0},
2019: {'eval_loss': 3.247542142868042, 'eval_runtime': 340.0459, 'eval_samples_per_second': 19.883, 'eval_steps_per_second': 1.244, 'epoch': 1.0},
2020: {'eval_loss': 3.272118330001831, 'eval_runtime': 341.6526, 'eval_samples_per_second': 19.883, 'eval_steps_per_second': 1.244, 'epoch': 1.0},
2021: {'eval_loss': 3.272197723388672, 'eval_runtime': 334.848, 'eval_samples_per_second': 19.881, 'eval_steps_per_second': 1.245, 'epoch': 1.0}}

# `metrics` is a plain dict, so np.save stores it as a pickled object array and
# appends ".npy" to `fname`. Reading it back therefore needs
# np.load(fname + ".npy", allow_pickle=True).item().
np.save(fname, metrics)

In [None]:
import nltk

# Count the word tokens in the first line of a generated training text file.
# NOTE(review): no visible cell writes a file with this "_100000" suffix --
# confirm where it comes from.
with open("./WMTdata/text_2017_100000.txt", "r") as train_year_file:
  first_line = train_year_file.readlines()[0]

tokens = nltk.word_tokenize(first_line)
print(len(tokens))