In [7]:

from matplotlib_inline import backend_inline
from matplotlib import pyplot as plt
from IPython import get_ipython
from IPython.display import display
import torch
import random
import re
import collections
import inspect
import os
import hashlib # Added missing import
import zipfile # Added missing import
import tarfile # Added missing import
import requests # Import the requests library

In [8]:

def use_svg_display():
    """Select 'svg' as the figure format for inline matplotlib output.

    Defined in :numref:`sec_calculus`"""
    figure_format = 'svg'
    backend_inline.set_matplotlib_formats(figure_format)
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG display and set matplotlib's default figure size.

    `figsize` is stored under the 'figure.figsize' key of `plt.rcParams`.
    """
    use_svg_display()
    rc_params = plt.rcParams
    rc_params['figure.figsize'] = figsize
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Apply labels, scales, limits, an optional legend and a grid to `axes`.

    The setters are invoked in a fixed order: labels, scales, limits,
    then the legend (only when `legend` is truthy), and finally the grid.
    """
    # Axis labels
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    # Axis scales
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    # Axis limits
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()
def plot(X, Y=None, xlabel=None, ylabel=None, legend=[], xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Draw one or more curves on `axes` (or on the current axes).

    When `Y` is omitted, `X` is treated as the y-values and each curve is
    plotted against its implicit index. Curves are paired with the format
    strings in `fmts`; `zip` stops at the shortest of the three sequences.
    """
    def has_one_axis(data):
        """Return True for a 1-d array-like or a list of scalar-like items."""
        if hasattr(data, 'ndim') and data.ndim == 1:
            return True
        return isinstance(data, list) and not hasattr(data[0], "__len__")

    # Normalise both inputs to "list of curves".
    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only one argument given: it holds the y-values, x is left empty.
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        # Repeat the x-values so they can be paired with every y-curve.
        X = X * len(Y)

    set_figsize(figsize)
    axes = plt.gca() if axes is None else axes
    axes.cla()
    for xs, ys, style in zip(X, Y, fmts):
        if len(xs):
            axes.plot(xs, ys, style)
        else:
            axes.plot(ys, style)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

class HyperParameters:
    """The base class of hyperparameters."""

    def save_hyperparameters(self, ignore=[]):
        """Save the calling function's local variables as attributes of `self`.

        The caller's frame is inspected, so this is meant to be invoked
        directly from a method (typically ``__init__``) before that method
        creates any additional local variables.

        Args:
            ignore: iterable of variable names to skip. ``'self'`` and any
                name starting with an underscore are always skipped.

        Side effects:
            Sets ``self.hparams`` to the dict of saved name/value pairs and
            sets one attribute on ``self`` per saved name.

        Defined in :numref:`sec_utils`"""
        frame = inspect.currentframe().f_back
        _, _, _, local_vars = inspect.getargvalues(frame)
        # Build the exclusion set once; accepts any iterable, not only lists.
        skipped = set(ignore) | {'self'}
        self.hparams = {k: v for k, v in local_vars.items()
                        if k not in skipped and not k.startswith('_')}
        for k, v in self.hparams.items():
            setattr(self, k, v)

class ProgressBoard(HyperParameters):
    """The board that plots data points in animation.

    Defined in :numref:`sec_oo-design`"""
    def __init__(self, xlabel=None, ylabel=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 ls=['-', '--', '-.', ':'], colors=['C0', 'C1', 'C2', 'C3'],
                 fig=None, axes=None, figsize=(3.5, 2.5), display=True):
        # Every constructor argument becomes an attribute of the same name
        # (self.xlabel, self.ls, self.display, ...). No local variable may be
        # created before this call, or it would be saved as well.
        self.save_hyperparameters()

    def draw(self, x, y, label, every_n=1):
        """Record the point (x, y) for the curve `label` and redraw the board.

        Points are buffered per label; once `every_n` points have been
        collected their mean x and mean y are appended to the plotted line
        and the buffer is cleared. Nothing is drawn when ``self.display``
        is falsy.

        Args:
            x: x-coordinate of the new point.
            y: y-coordinate of the new point.
            label: name of the curve the point belongs to (legend entry).
            every_n: number of raw points averaged into one plotted point.

        Defined in :numref:`sec_utils`"""
        Point = collections.namedtuple('Point', ['x', 'y'])
        if not hasattr(self, 'raw_points'):
            # Lazily created on first use: raw buffers and averaged lines.
            self.raw_points = collections.OrderedDict()
            self.data = collections.OrderedDict()
        if label not in self.raw_points:
            self.raw_points[label] = []
            self.data[label] = []
        points = self.raw_points[label]
        line = self.data[label]
        points.append(Point(x, y))
        if len(points) != every_n:
            return
        # Collapse the buffered points into a single averaged point.
        count = len(points)
        line.append(Point(sum(p.x for p in points) / count,
                          sum(p.y for p in points) / count))
        points.clear()
        if not self.display:
            return
        # Import the IPython.display *module* locally: the notebook-level name
        # `display` is the display() function, which has no `.display` or
        # `.clear_output` attributes.
        from IPython import display as ipython_display
        use_svg_display()
        if self.fig is None:
            self.fig = plt.figure(figsize=self.figsize)
        plt_lines, labels = [], []
        # zip() limits the number of drawn curves to the number of available
        # line styles / colors.
        for (k, v), ls, color in zip(self.data.items(), self.ls, self.colors):
            plt_lines.append(plt.plot([p.x for p in v], [p.y for p in v],
                                      linestyle=ls, color=color)[0])
            labels.append(k)
        axes = self.axes if self.axes else plt.gca()
        if self.xlim: axes.set_xlim(self.xlim)
        if self.ylim: axes.set_ylim(self.ylim)
        # `self.x` is only present if a subclass/caller set it; fall back to
        # None instead of raising AttributeError when no xlabel was given.
        if not self.xlabel: self.xlabel = getattr(self, 'x', None)
        axes.set_xlabel(self.xlabel)
        axes.set_ylabel(self.ylabel)
        axes.set_xscale(self.xscale)
        axes.set_yscale(self.yscale)
        axes.legend(plt_lines, labels)
        ipython_display.display(self.fig)
        ipython_display.clear_output(wait=True)


In [9]:
def download(url, folder='../data', sha1_hash=None):
    """Download a file to folder and return the local filepath.

    Args:
        url: an http(s) URL, or (for backward compatibility) a key of
            `DATA_HUB` whose value is a ``(url, sha1_hash)`` pair.
        folder: directory the file is stored in; created if missing.
        sha1_hash: expected SHA-1 hex digest. When given and a local file
            with a matching digest already exists, the download is skipped.

    Returns:
        The local path ``folder/<last component of url>``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if not url.startswith('http'):
        # For backward compatibility: `url` is a DATA_HUB key.
        url, sha1_hash = DATA_HUB[url]
    os.makedirs(folder, exist_ok=True)
    fname = os.path.join(folder, url.split('/')[-1])
    # Check if hit cache
    if os.path.exists(fname) and sha1_hash:
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash the file in 1 MiB chunks; every chunk must be fed to the
            # digest, otherwise the comparison below can never succeed.
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname
    # Download
    print(f'Downloading {fname} from {url}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail loudly instead of saving an HTTP error page as the data file.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream to disk so the whole body is never held in memory.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
def extract(filename, folder=None):
    """Extract a zip/tar file into folder.

    Args:
        filename: path to an archive whose last extension is '.zip', '.tar'
            or '.gz'.
        folder: destination directory; defaults to the directory that
            contains `filename`.

    Raises:
        AssertionError: if the extension is not one of the supported ones.
    """
    base_dir = os.path.dirname(filename)
    _, ext = os.path.splitext(filename)
    assert ext in ('.zip', '.tar', '.gz'), 'Only support zip/tar files.'
    if folder is None:
        folder = base_dir
    opener = zipfile.ZipFile if ext == '.zip' else tarfile.open
    # The context manager closes the archive handle even if extraction fails.
    with opener(filename, 'r') as fp:
        # NOTE(review): extractall() trusts member paths inside the archive;
        # only use this on archives from a trusted source.
        fp.extractall(folder)

In [10]:
class DataModule(HyperParameters):
    """Base class that turns in-memory tensors into train/validation loaders.

    The loader methods read `self.X`, `self.y`, `self.num_train` and
    `self.batch_size`; none of these are set by this class's `__init__`.
    """

    def __init__(self, root='./data', num_workers=4):
        # Saves `root` and `num_workers` as attributes.
        self.save_hyperparameters()

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        """Wrap `tensors`, restricted to `indices`, in a DataLoader.

        Batches are shuffled when `train` is truthy.
        """
        selected = tuple(tensor[indices] for tensor in tensors)
        dataset = torch.utils.data.TensorDataset(*selected)
        return torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=train)

    def get_dataloader(self, train):
        """Return a loader over the first `num_train` rows, or over the rest."""
        if train:
            rows = slice(0, self.num_train)
        else:
            rows = slice(self.num_train, None)
        return self.get_tensorloader((self.X, self.y), train, rows)

    def train_dataloader(self):
        """Loader for the training split."""
        return self.get_dataloader(train=True)

    def val_dataloader(self):
        """Loader for the validation split."""
        return self.get_dataloader(train=False)

In [11]:
# Registry consulted by download() for names that are not http(s) URLs;
# each value is unpacked there as a (url, sha1_hash) pair.
DATA_HUB = {}
# Base URL that dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

class TimeMachine(DataModule):
    """Character-level dataset built from 'timemachine.txt' under `DATA_URL`.

    After construction, `self.X` holds windows of `num_steps` token indices
    and `self.y` holds the same windows shifted by one position.
    """

    def __init__(self, batch_size, num_steps, num_train=10000, num_val=5000):
        super().__init__()
        # Must run before any other local is created in this frame.
        self.save_hyperparameters()
        corpus, self.vocab = self.build(self._download())
        # One window of num_steps + 1 indices per starting position.
        windows = [corpus[start:start + num_steps + 1]
                   for start in range(len(corpus) - num_steps)]
        array = torch.tensor(windows)
        self.X = array[:, :-1]
        self.y = array[:, 1:]

    def _download(self):
        """Fetch the text file into `self.root` (if needed) and return its contents."""
        path = download(DATA_URL + 'timemachine.txt', self.root,
                        '090b5e7e70c295757f55df93cb0a180b9691891a')
        with open(path) as handle:
            return handle.read()

    def _preprocess(self, text):
        """Replace every run of non-letters with one space, then lowercase."""
        collapsed = re.sub('[^A-Za-z]+', ' ', text)
        return collapsed.lower()

    def _tokenize(self, text):
        """Split `text` into a list of single characters."""
        return list(text)

    def build(self, raw_text, vocab=None):
        """Return `(corpus, vocab)` where corpus is the index of every token.

        A new `Vocab` is built from the tokens when `vocab` is None.
        """
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None:
            vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab

    def get_dataloader(self, train):
        """Loader over the first `num_train` rows, or the `num_val` rows after them."""
        if train:
            rows = slice(0, self.num_train)
        else:
            rows = slice(self.num_train, self.num_train + self.num_val)
        return self.get_tensorloader((self.X, self.y), train, rows)


In [12]:
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Indices follow the sorted order of the kept tokens, the reserved tokens
    and the special '<unk>' token.
    """

    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        """Build the vocabulary.

        Args:
            tokens: a list of tokens, or a list of lists of tokens.
            min_freq: tokens occurring fewer times than this are dropped.
            reserved_tokens: extra tokens that are always included.
        """
        # Flatten a list of token lists into one flat list.
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        counter = collections.Counter(tokens)
        # (token, count) pairs, most frequent first.
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        kept = [token for token, freq in self.token_freqs if freq >= min_freq]
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + kept)))
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Number of distinct entries, including '<unk>'."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Return the index of a token, or a list of indices for a list/tuple.

        Unknown tokens map to the index of '<unk>'.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for an index, or a list of tokens for a sequence.

        Lists and tuples of any length (including 0 and 1) are mapped
        element-wise. Other objects exposing ``__len__`` with more than one
        element are mapped element-wise too; anything else is used directly
        as a single index.
        """
        if isinstance(indices, (list, tuple)):
            # Previously a 0- or 1-element list fell through to
            # `self.idx_to_token[indices]` and raised TypeError.
            return [self.idx_to_token[int(index)] for index in indices]
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index of the unknown token '<unk>'."""
        return self.token_to_idx['<unk>']

In [15]:
# Build the dataset with windows of 10 characters and batches of 2 windows.
data = TimeMachine(batch_size=2, num_steps=10)
# Show only the first training batch.
for X, Y in data.train_dataloader():
    print('X:', X, '\nY:', Y)
    break

Downloading ./data/timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...
X: tensor([[13, 13, 26,  0,  7, 16, 22, 19,  0,  5],
        [ 6, 15, 20, 10, 16, 15, 20,  0, 17,  2]]) 
Y: tensor([[13, 26,  0,  7, 16, 22, 19,  0,  5, 10],
        [15, 20, 10, 16, 15, 20,  0, 17,  2, 19]])
