# Data and resources

Run this notebook to download, unzip, and store all the data and resources necessary to run the experiments.

In [None]:
import wget
import pathlib
import zipfile

### _Stories_ dataset

Obtain the dataset released with the COLING 2018 paper:
> Jahan, Labiba, Geeticka Chauhan, and Mark Finlayson. "A new approach to animacy detection." In _Proceedings of the 27th International Conference on Computational Linguistics_, pp. 1-12. 2018.

Download data from https://dspace.mit.edu/handle/1721.1/116172, unzip it, and store it in `../resources/`.

In [None]:
# Archive of the Stories dataset hosted on MIT DSpace.
url = "https://dspace.mit.edu/bitstream/handle/1721.1/116172/jahan_animacy_v1.0.0.zip"

# The value returned by wget.download is kept in `jahan`; later cells use it
# as the name of the downloaded file.
jahan = wget.download(url=url)

In [None]:
# Target folder for the datasets; created (with any missing parents) if absent.
save_to = "../resources"
resources_dir = pathlib.Path(save_to)
resources_dir.mkdir(parents=True, exist_ok=True)

In [None]:
!mv {jahan} {save_to}

In [None]:
!unzip -q {save_to}/{jahan} -d {save_to}
!ls {save_to}

### _Machines19thC_ dataset

Download data from https://bl.iro.bl.uk/work/ns/323177af-6081-4e93-8aaf-7932ca4a390a, unzip it, and store it in `../resources/`.

In [None]:
# File set of the Machines19thC dataset in the British Library repository.
url = "https://bl.iro.bl.uk/concern/parent/323177af-6081-4e93-8aaf-7932ca4a390a/file_sets/59a8c52f-e0a5-4432-9897-0db8c067627c"

# The value returned by wget.download is kept in `machines19thC`; later cells
# use it as the name of the downloaded file.
machines19thC = wget.download(url=url)

In [None]:
# Same destination as the Stories dataset; the directory is created only when
# it does not exist yet.
save_to = "../resources"
resources_dir = pathlib.Path(save_to)
resources_dir.mkdir(exist_ok=True, parents=True)

In [None]:
!mv {machines19thC} {save_to}

In [None]:
!unzip -q {save_to}/{machines19thC} -d {save_to}
!ls {save_to}

### _BERT_ models

#### 1. 19thC BERT models

In [None]:
url = "https://zenodo.org/record/4782245/files/bert.zip?download=1"
bert19th_model = wget.download(url)
save_to = "../models/language_models/bert_models/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
!mv {bert19th_model} {save_to}
with zipfile.ZipFile(save_to + bert19th_model, 'r') as zip_ref:
    zip_ref.extractall(save_to)
!ls {save_to}

#### 2. Sentence Transformers BERT

In [None]:
url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip"
bert_senttrans = wget.download(url)
save_to = "../models/language_models/bert_models/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
pathlib.Path(save_to + "bert-base-nli-mean-tokens").mkdir(parents=True, exist_ok=True)
!mv {bert_senttrans} {save_to}
!unzip -q {save_to}/{bert_senttrans} -d {save_to}/bert-base-nli-mean-tokens
!rm {save_to}/{bert_senttrans}
!ls {save_to}

#### 3. scikit-learn wrapper for BERT

Source: https://github.com/charles9n/bert-sklearn

In [None]:
save_to = "../models/language_models/bert_models/bert-sklearn/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
!git clone -b master https://github.com/charles9n/bert-sklearn {save_to}
!pip install {save_to}

### _fastText_ model (stored under `fastai/`)

In [None]:
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"
fastai_model = wget.download(url)
save_to = "../models/language_models/fastai/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
!mv {fastai_model} {save_to}
!gunzip -q {save_to}/{fastai_model} -d {save_to}
!ls {save_to}

### _spaCy_ models

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

### _NLTK_ features

In [None]:
import nltk

In [None]:
# Fetch the 'punkt' resource through NLTK's downloader.
punkt_resource = 'punkt'
nltk.download(punkt_resource)