# **Dataset Preprocessing**

## **Imports**

In [None]:
import json
import re 
import random
import csv
import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec
from tqdm.auto import tqdm 
import contractions

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

## **Dataset**

The dataset contains crawled English news data from 2019, provided by the WMT21 conference. It can be found here: https://data.statmt.org/news-crawl/en/


In [None]:
# Optional, Google Colab only: if the dataset is saved on Google Drive, mount
# the cloud storage at /content/drive here. force_remount=True asks for a
# fresh mount. Skip this cell when the dataset is saved locally.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Edit this variable when your dataset is saved locally.
# The value is prepended verbatim to every file name in this notebook
# (e.g. f"{path}news_data.deduped"), so it has to end with a path separator.
path = "YourPathHere"

Load the dataset:

In [None]:
# Read the raw corpus: one list entry per line of the file, line endings kept.
news = []

with open(f"{path}news_data.deduped", encoding="utf-8") as corpus_file:
    # total is a hard-coded expected line count; it only sizes the progress bar
    news.extend(tqdm(corpus_file, total=33600797))

# NOTE(review): removes the single entry at index 19195150. The reason is not
# recorded in this notebook -- confirm before running on a different file.
del news[19195150]

print(f"Remaining sentences: {len(news)}")

Expand all contractions and split sentences:

In [None]:
# Expand contractions line by line (slang expansion disabled), then split each
# expanded line into sentences and flatten everything into one list.
news_sent = [
    sentence
    for line in tqdm(news)
    for sentence in nltk.sent_tokenize(contractions.fix(line, slang=False))
]

print(f"Remaining sentences: {len(news_sent)}")

Preprocessing using regex:


In [None]:
# All substitutions are compiled once and applied sentence by sentence in a
# single pass over the corpus, in this order:
# strip -> separators -> numbers -> U.S. / U.S.A. spellings.

# Quote characters, hyperlinks and newlines are deleted.
_STRIP_RE = re.compile(r'[\'’"“”]|http\S+|\n')

# Two or more dots, two or more asterisks, and ANY run of hyphens or em dashes
# (a single one included) are replaced by a space.
_SEPARATOR_RE = re.compile(r'\.\.+|—+|-+|\*\*+')

# Digit groups optionally joined by one of . , : / - collapse into one <NUM>.
# The hyphen is listed last so that it is a literal; written as `,-:` it would
# form a character range instead of three separate characters.
_NUMBER_RE = re.compile(r'\d+(?:[.,:/-]?\d+)*')

# Group 1 captures the U.S.A. spellings; the remaining alternatives are the
# U.S. spellings. The U.S.A. forms come first so "U.S.A." is never consumed as
# "U.S." + "A.". The bare-letter forms carry \b on BOTH sides so that words
# merely ending in them ("VIRUS", "CAMPUS", "Medusa") are left untouched.
_COUNTRY_RE = re.compile(
    r'(\busa\b|\bUSA\b|U\.S\.A\.|u\.s\.a\.)|\bUS\b|U\.S\.|u\.s\.|U\.s\.'
)


def _clean_sentence(sent):
    """Return `sent` with noise stripped and numbers / U.S. spellings normalised.

    Numbers become ' <NUM> ', every U.S.A. spelling becomes 'U.S.A ' and every
    U.S. spelling becomes 'U.S '.
    """
    sent = _STRIP_RE.sub('', sent)
    sent = _SEPARATOR_RE.sub(' ', sent)
    sent = _NUMBER_RE.sub(' <NUM> ', sent)
    # A single pass with a callback decides per match, so no temporary marker
    # word has to be injected into (and later searched for in) the text.
    return _COUNTRY_RE.sub(
        lambda match: 'U.S.A ' if match.group(1) else 'U.S ', sent
    )


news_cleaned = [_clean_sentence(sent) for sent in tqdm(news_sent)]

print("Example sentences: ")
news_cleaned[5985:5990]

Apply nltk's word tokenizer to the lowercased sentences:

In [None]:
# Lowercase every cleaned sentence and split it into tokens with nltk's
# word tokenizer.
# NOTE(review): lowercasing also turns the <NUM> placeholder into <num>, and
# the tokenizer may split off the angle brackets -- confirm this is intended.
news_tokenized = []
for sentence in tqdm(news_cleaned):
    news_tokenized.append(word_tokenize(sentence.lower()))

print(f"Remaining sentences: {len(news_tokenized)}")

Intermediate save to free up memory:

In [None]:
# Persist the tokenized sentences as CSV: one sentence per row, one token per field.
with open(f"{path}news_data_preprocessed_cache.csv", "w", encoding='utf8', newline="") as cache_file:
    csv.writer(cache_file).writerows(news_tokenized)

Load the intermediate save and create a token frequency dict. Sentences with 5 or fewer tokens are removed:

In [None]:
# Rebuild the sentence list from the cached CSV and count how often each
# token occurs in the sentences that are kept.
news_tokenized = []
freqs = {}

with open(f"{path}news_data_preprocessed_cache.csv", encoding='utf-8', newline="") as cache_file:
    # total is a hard-coded expected row count; it only sizes the progress bar
    for tokens in tqdm(csv.reader(cache_file), total=43989540):
        # only sentences with more than 5 tokens are kept and counted
        if len(tokens) <= 5:
            continue
        news_tokenized.append(tokens)
        for token in tokens:
            if token in freqs:
                freqs[token] += 1
            else:
                freqs[token] = 1

print(f"Number of tokens: {len(freqs)}")

Save the freq dict: 

In [None]:
# Write the frequency table as two CSV rows: all tokens as the header row,
# followed by one row holding the matching counts in the same order.
with open(f'{path}freq_dict.csv', 'w', encoding='utf8', newline="") as freq_file:
    table = csv.writer(freq_file)
    table.writerow(list(freqs.keys()))
    table.writerow(list(freqs.values()))

Remove sentences that contain words which occur 10000 times or fewer:

In [None]:
# A word is treated as rare when it occurs this many times or fewer in the corpus.
RARE_WORD_MAX_FREQ = 10000

# Keep only sentences in which every word is frequent enough.
# all() stops at the first rare word instead of scanning the rest of the sentence.
cache = [
    sent
    for sent in tqdm(news_tokenized)
    if all(freqs[word] > RARE_WORD_MAX_FREQ for word in sent)
]

# Collect the vocabulary of the surviving sentences. update() grows the set in
# place; building a fresh union per sentence would copy the whole vocabulary
# once for every sentence.
vocab_set = set()
for sent in cache:
    vocab_set.update(sent)

print(f"Remaining sentences: {len(cache)}")
print(f"Remaining vocab: {len(vocab_set)}")

Intermediate save to free up memory:

In [None]:
# Persist the filtered sentences as CSV, one sentence per row.
# NOTE(review): the "6826" in the file name is hard-coded and presumably the
# vocabulary size of the original run -- confirm if the data changes.
with open(f"{path}news_data_preprocessed_voc_6826.csv", "w", encoding='utf8', newline="") as filtered_file:
    row_writer = csv.writer(filtered_file)
    for tokens in cache:
        row_writer.writerow(tokens)

Load the intermediate save:

In [None]:
# Read the filtered sentences back: every CSV row becomes one list of tokens.
with open(f"{path}news_data_preprocessed_voc_6826.csv", encoding='utf-8', newline="") as filtered_file:
    news_tokenized = [tokens for tokens in csv.reader(filtered_file)]

Set a seed to make results reproducible and comparable.
Shuffle the dataset once, to obtain random train and test partitions later, and keep only the first 750000 sentences:

In [None]:
# Seed Python's global random generator so the shuffle below is repeatable.
random.seed(69)

# Shuffle the sentence list in place.
random.shuffle(news_tokenized)

# Keep only the first 750000 sentences of the shuffled list.
news_tokenized = news_tokenized[:750000]

Final save:




In [None]:
# Write the shuffled, truncated dataset as CSV: one sentence per row, one token per field.
with open(f"{path}news_data_preprocessed.csv", "w", encoding='utf8', newline="") as final_file:
    csv.writer(final_file).writerows(news_tokenized)