In [18]:
import numpy as np
import os
import re
from collections import defaultdict

In [41]:
def gen_data_and_vocab(data_root=None, freq_threshold=10):
    """Build the raw train/test corpora and the vocabulary for 20 Newsgroups.

    The dataset folder must contain exactly one sub-directory whose name
    contains 'train' and exactly one other sub-directory (the test split);
    each split holds one folder per newsgroup with one text file per post.

    Three files are written to ``<data_root>/w2v`` (created if missing):

    * ``vocab-raw.txt``         - one word per line, sorted, containing every
      word seen strictly more than ``freq_threshold`` times in the train split.
    * ``20news-train-raws.txt`` - one line per training document.
    * ``20news-test-raws.txt``  - one line per test document.

    Every document line has the form ``<label><fff><file name><fff><words>``
    where ``label`` is the index of the newsgroup in the sorted list of
    training newsgroups and ``words`` are the lower-cased word tokens of the
    post joined by single spaces.

    Args:
        data_root: Dataset folder. Defaults to ``../datasets/20news-bydate``.
        freq_threshold: A word enters the vocabulary only when its training
            count is strictly greater than this value.

    Raises:
        ValueError: If the train/test sub-directories cannot be identified.
    """
    output_dir_name = 'w2v'

    def collect_data_from(parent_path, newsgroup_list, word_count=None):
        """Read every post under ``parent_path`` and return its document lines.

        When ``word_count`` is given, it is updated in place with the number
        of occurrences of each token.
        """
        data = []
        for group_id, newsgroup in enumerate(newsgroup_list):
            dir_path = os.path.join(parent_path, newsgroup)
            file_names = sorted(
                name for name in os.listdir(dir_path)
                if os.path.isfile(os.path.join(dir_path, name))
            )
            print('Processing: {}-{}'.format(group_id, newsgroup))

            for file_name in file_names:
                # latin-1 maps every byte to a character, so decoding never
                # fails regardless of the platform's default encoding.
                with open(os.path.join(dir_path, file_name), encoding='latin-1') as f:
                    text = f.read().lower()
                # findall (unlike split) never produces empty tokens, so ''
                # is neither counted nor written into the vocabulary.
                words = re.findall(r'\w+', text)
                if word_count is not None:
                    for word in words:
                        word_count[word] += 1
                # Tokens contain only word characters, so the joined content
                # can never span more than one line (it may be empty).
                content = ' '.join(words)
                data.append(str(group_id) + '<fff>' + file_name + '<fff>' + content)
        return data

    if data_root is None:
        data_root = os.path.join('..', 'datasets', '20news-bydate')

    # Ignore the output folder: it lives next to the splits, so a second run
    # would otherwise see three directories and could pick the wrong ones.
    split_dirs = sorted(
        name for name in os.listdir(data_root)
        if name != output_dir_name and os.path.isdir(os.path.join(data_root, name))
    )
    train_dirs = [name for name in split_dirs if 'train' in name]
    test_dirs = [name for name in split_dirs if 'train' not in name]
    if len(train_dirs) != 1 or len(test_dirs) != 1:
        raise ValueError(
            'Expected exactly one train and one test directory in {!r}, found {}'.format(
                data_root, split_dirs
            )
        )
    train_path = os.path.join(data_root, train_dirs[0])
    test_path = os.path.join(data_root, test_dirs[0])

    newsgroup_list = sorted(
        name for name in os.listdir(train_path)
        if os.path.isdir(os.path.join(train_path, name))
    )

    # Only the training split contributes to the word statistics.
    word_count = defaultdict(int)
    train_data = collect_data_from(
        parent_path=train_path,
        newsgroup_list=newsgroup_list,
        word_count=word_count
    )
    vocab = sorted(word for word, freq in word_count.items() if freq > freq_threshold)

    test_data = collect_data_from(
        parent_path=test_path,
        newsgroup_list=newsgroup_list
    )

    output_dir = os.path.join(data_root, output_dir_name)
    os.makedirs(output_dir, exist_ok=True)

    outputs = (
        ('vocab-raw.txt', vocab),
        ('20news-train-raws.txt', train_data),
        ('20news-test-raws.txt', test_data),
    )
    for file_name, lines in outputs:
        # Written with the platform default encoding, as before, so existing
        # readers of these files keep working unchanged.
        with open(os.path.join(output_dir, file_name), 'w') as f:
            f.write('\n'.join(lines))

In [42]:
# Build the vocabulary and the raw train/test files; results are written to
# the 'w2v' folder inside the dataset directory.
gen_data_and_vocab()

Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing: 12-sci.electronics
Processing: 13-sci.med
Processing: 14-sci.space
Processing: 15-soc.religion.christian
Processing: 16-talk.politics.guns
Processing: 17-talk.politics.mideast
Processing: 18-talk.politics.misc
Processing: 19-talk.religion.misc
Processing: 0-alt.atheism
Processing: 1-comp.graphics
Processing: 2-comp.os.ms-windows.misc
Processing: 3-comp.sys.ibm.pc.hardware
Processing: 4-comp.sys.mac.hardware
Processing: 5-comp.windows.x
Processing: 6-misc.forsale
Processing: 7-rec.autos
Processing: 8-rec.motorcycles
Processing: 9-rec.sport.baseball
Processing: 10-rec.sport.hockey
Processing: 11-sci.crypt
Processing

In [12]:
# Number of tokens per encoded document: encode_data truncates longer
# documents to this length and pads shorter ones up to it.
MAX_DOC_LENGTH = 500

In [23]:
def encode_data(data_path, vocab_path, max_doc_length=None):
    """Turn a raw document file into fixed-length sequences of word IDs.

    Each input line has the form ``<label><fff><doc id><fff><text>``. The
    text is split on whitespace, truncated to ``max_doc_length`` tokens,
    mapped to IDs and right-padded, then written as
    ``<label><fff><doc id><fff><length><fff><space separated IDs>`` where
    ``length`` is the number of tokens kept before padding.

    ID scheme: 0 is padding, 1 is an out-of-vocabulary word, and the word on
    line ``i`` (0-based) of the vocabulary file gets ID ``i + 2``.

    The result is written next to ``data_path``; the output file name is the
    input name with its last '-'-separated part replaced by ``encoded.txt``
    (``20news-train-raws.txt`` -> ``20news-train-encoded.txt``).

    Args:
        data_path: Path of the raw document file.
        vocab_path: Path of the vocabulary file, one word per line.
        max_doc_length: Length of every encoded document. Defaults to the
            notebook-level ``MAX_DOC_LENGTH``.
    """
    if max_doc_length is None:
        max_doc_length = MAX_DOC_LENGTH

    unknown_ID = 1
    padding_ID = 0

    with open(vocab_path) as f:
        # IDs 0 and 1 are reserved, so real words start at 2.
        vocab = {word: word_id + 2 for word_id, word in enumerate(f.read().splitlines())}

    with open(data_path) as f:
        raw_lines = f.read().splitlines()

    encoded_data = []
    for line in raw_lines:
        fields = line.split('<fff>')
        label, doc_id, text = fields[0], fields[1], fields[2]

        words = text.split()[:max_doc_length]
        sentence_length = len(words)

        encoded_text = [str(vocab.get(word, unknown_ID)) for word in words]
        # A non-positive multiplier yields an empty list, so full-length
        # documents receive no padding.
        encoded_text.extend([str(padding_ID)] * (max_doc_length - sentence_length))

        encoded_data.append(
            str(label) + '<fff>' + str(doc_id) + '<fff>' + str(sentence_length)
            + '<fff>' + ' '.join(encoded_text)
        )

    # os.path handles the platform's separators; splitting on '\\' by hand
    # only worked for Windows-style paths.
    dir_name, raw_name = os.path.split(data_path)
    file_name = '-'.join(raw_name.split('-')[:-1]) + '-encoded.txt'

    with open(os.path.join(dir_name, file_name), 'w') as f:
        f.write('\n'.join(encoded_data))

In [24]:
# Encode the raw train and test splits with the shared vocabulary.
# Paths are built with os.path.join instead of hard-coded backslashes; the
# trailing '' keeps a separator at the end of data_path, so expressions of
# the form `data_path + file_name` continue to work.
data_path = os.path.join('..', 'datasets', '20news-bydate', 'w2v', '')
vocab_path = os.path.join(data_path, 'vocab-raw.txt')
for raw_file_name in ('20news-train-raws.txt', '20news-test-raws.txt'):
    encode_data(data_path = os.path.join(data_path, raw_file_name), vocab_path = vocab_path)
