In [1]:
import os
import re

import pandas as pd
import torch

import model
import tools

# Directory holding the AG News CSV files (train.csv, test.csv) that
# read_agnews loads.
FILE_PATH = "./data/"
# NOTE(review): not referenced by any cell shown here; presumably the directory
# where trained models are saved/loaded — confirm against the later cells.
MODEL_PATH = './model/'

In [2]:
def read_agnews(file_path):
    """Load the AG News train and test splits and normalise their text.

    Both ``train.csv`` and ``test.csv`` are read from ``file_path`` as
    header-less three-column files (label, title, content) and concatenated,
    train rows first. Titles and contents are normalised by replacing every
    run of non-ASCII-letter characters with a single space, stripping the
    ends and lower-casing.

    Args:
        file_path: Directory containing ``train.csv`` and ``test.csv``. A
            trailing path separator is accepted but not required.

    Returns:
        A tuple ``(labels, titles, contents)`` of three equally long lists:
        the raw label values, the normalised titles and the normalised
        contents.

    Raises:
        FileNotFoundError: If either CSV file is missing (raised by pandas).
    """
    index_name = ['label', 'title', 'content']
    # Compile once; the same pattern is applied to every title and content.
    non_letters = re.compile('[^A-Za-z]+')

    def _normalize(text):
        # pandas yields NaN for empty fields and numbers for purely numeric
        # ones; coerce those instead of letting the regex raise TypeError.
        if not isinstance(text, str):
            text = '' if pd.isna(text) else str(text)
        return non_letters.sub(' ', text).strip().lower()

    # os.path.join works whether or not file_path ends with a separator,
    # unlike plain string concatenation.
    frames = [
        pd.read_csv(os.path.join(file_path, csv_name), names=index_name)
        for csv_name in ("train.csv", "test.csv")
    ]
    all_news = pd.concat(frames)

    labels = all_news['label'].tolist()
    titles = [_normalize(text) for text in all_news['title'].tolist()]
    contents = [_normalize(text) for text in all_news['content'].tolist()]
    return labels, titles, contents

In [16]:
# Load the AG News columns, then rebind `content` from the cleaned strings to
# the result of tools.tokenize; the preview below shows one list of word
# tokens per article.
label, title, content = read_agnews(FILE_PATH)
content = tools.tokenize(content)
# Preview only the first ten tokenized entries rather than the whole corpus.
content[:10]

[['reuters',
  'short',
  'sellers',
  'wall',
  'street',
  's',
  'dwindling',
  'band',
  'of',
  'ultra',
  'cynics',
  'are',
  'seeing',
  'green',
  'again'],
 ['reuters',
  'private',
  'investment',
  'firm',
  'carlyle',
  'group',
  'which',
  'has',
  'a',
  'reputation',
  'for',
  'making',
  'well',
  'timed',
  'and',
  'occasionally',
  'controversial',
  'plays',
  'in',
  'the',
  'defense',
  'industry',
  'has',
  'quietly',
  'placed',
  'its',
  'bets',
  'on',
  'another',
  'part',
  'of',
  'the',
  'market'],
 ['reuters',
  'soaring',
  'crude',
  'prices',
  'plus',
  'worries',
  'about',
  'the',
  'economy',
  'and',
  'the',
  'outlook',
  'for',
  'earnings',
  'are',
  'expected',
  'to',
  'hang',
  'over',
  'the',
  'stock',
  'market',
  'next',
  'week',
  'during',
  'the',
  'depth',
  'of',
  'the',
  'summer',
  'doldrums'],
 ['reuters',
  'authorities',
  'have',
  'halted',
  'oil',
  'export',
  'flows',
  'from',
  'the',
  'main',
  'pipe

In [17]:
# Build the vocabulary object from the tokenized contents and report its size.
vocab = tools.Vocab(content)
f'vocab size: {len(vocab)}'

'vocab size: 59173'

In [20]:
# tools.subsample returns the reduced sentences plus `counter`, which is later
# handed to tools.get_negatives. Sentences can come back empty (see the second
# entry of the preview), so downstream steps receive empty lists too.
# NOTE(review): presumably word2vec-style random subsampling of frequent
# tokens; if so, seed the RNG used inside `tools` so that Restart & Run All
# reproduces this output — confirm against tools.subsample.
subsampled, counter = tools.subsample(content, vocab)
subsampled[:10]

[['reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], ['reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'expected', 'to', 'hang', 'over', 'the', 'stock', 'market', 'next', 'week', 'during', 'the', 'depth', 'of', 'the', 'summer', 'doldrums'], ['reuters', 'authorities', 'have', 'halted', 'oil', 'export', 'flows', 'from', 'the', 'main', 'pipeline', 'in', 'southern', 'iraq', 'after', 'intelligence', 'showed', 'a', 'rebel', 'militia', 'could', 'strike', 'infrastructure', 'an', 'oil', 'official', 'said', 'on', 'satur

[['short', 'street', 'dwindling', 'ultra'],
 [],
 ['depth', 'doldrums'],
 ['halted', 'export', 'militia'],
 ['tearaway', 'menace'],
 ['slightly'],
 ['funds', 'by'],
 ['usatoday', 'economy', 'midsummer'],
 ['ph',
  'sociology',
  'bazil',
  'riley',
  'estate',
  'at',
  'planner',
  'brochures',
  'furthest',
  'mind'],
 ['cynics']]

In [21]:
# Index `vocab` with each subsampled token list; the preview shows this yields
# one list of integer ids per sentence (empty sentences stay empty).
corpus = [vocab[line] for line in subsampled]
corpus[:3]

[[724, 335, 9999, 5808], [], [6053, 14765]]

In [22]:
# NOTE(review): the literal 5 is presumably the maximum context-window size —
# confirm in tools.get_centers_and_contexts and consider naming it as a
# constant in the config cell.
all_centers, all_contexts = tools.get_centers_and_contexts(corpus, 5)

In [23]:
# `counter` comes from the tools.subsample call above.
# NOTE(review): the trailing 5 is presumably the number of negative samples
# drawn per context word — confirm in tools.get_negatives and consider naming
# it as a constant in the config cell.
all_negatives = tools.get_negatives(all_contexts, vocab, counter, 5)