# Data and resources

Run this notebook to download, unzip, and store all the data and resources necessary to run the experiments.

In [1]:
import pathlib
import sys

import wget

### _Stories_ dataset

Obtain the dataset from the COLING 2018 paper:
> Jahan, Labiba, Geeticka Chauhan, and Mark Finlayson. "A new approach to animacy detection." In _Proceedings of the 27th International Conference on Computational Linguistics_, pp. 1-12. 2018.

Download data from https://dspace.mit.edu/handle/1721.1/116172, unzip it, and store it in `../resources/`.

In [None]:
# Direct link to the zipped dataset hosted on MIT DSpace.
url = "https://dspace.mit.edu/bitstream/handle/1721.1/116172/jahan_animacy_v1.0.0.zip"
# Keep the value returned by wget.download in `jahan`; the following cells
# interpolate it into their shell commands as the archive's filename.
jahan = wget.download(url)

In [None]:
# Destination for the dataset. `save_to` stays a plain string because the
# following cells interpolate it into shell commands.
save_to = "../resources"
resources_dir = pathlib.Path(save_to)
# Create the directory together with any missing parents; a pre-existing
# directory is not an error.
resources_dir.mkdir(exist_ok=True, parents=True)

In [None]:
!mv {jahan} {save_to}

In [None]:
!unzip -q {save_to}/{jahan} -d {save_to}
!ls {save_to}

### _Machines19thC_ dataset

In [None]:
# TODO: add the download and extraction steps for the Machines19thC dataset.

### _BERT_ models

#### 1. 19thC BERT models

_[coming soon]_

#### 2. Sentence Transformers BERT

In [None]:
url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip"
bert_senttrans = wget.download(url)
save_to = "../models/language_models/bert_models/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
pathlib.Path(save_to + "bert-base-nli-mean-tokens").mkdir(parents=True, exist_ok=True)
!mv {bert_senttrans} {save_to}
!unzip -q {save_to}/{bert_senttrans} -d {save_to}/bert-base-nli-mean-tokens
!rm {save_to}/{bert_senttrans}
!ls {save_to}

#### 3. scikit-learn wrapper for BERT

Source: https://github.com/charles9n/bert-sklearn

In [2]:
save_to = "../models/language_models/bert_models/bert-sklearn/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
!git clone -b master https://github.com/charles9n/bert-sklearn {save_to}
!pip install {save_to}

Cloning into '../models/language_models/bert_models/bert-sklearn'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 259 (delta 3), reused 3 (delta 0), pack-reused 247[K
Receiving objects: 100% (259/259), 519.36 KiB | 1.42 MiB/s, done.
Resolving deltas: 100% (125/125), done.
Processing /Users/mcollardanuy/Documents/githubCode/AtypicalAnimacy/models/language_models/bert_models/bert-sklearn
Building wheels for collected packages: bert-sklearn
  Building wheel for bert-sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for bert-sklearn: filename=bert_sklearn-0.3.1-py3-none-any.whl size=54234 sha256=7ca670f2d3cf0b28ac2d3f01c0f345ee31064420264a1b5267fc67a9ea999c63
  Stored in directory: /Users/mcollardanuy/Library/Caches/pip/wheels/40/0b/29/e8f285d937984523cbfb153de2b95d7c5f2cd5bf37583aab6a
Successfully built bert-sklearn
Installing collected packages: bert-sklearn
Successfully

### _fastText_ model

In [None]:
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"
fastai_model = wget.download(url)
save_to = "../models/language_models/fastai/"
pathlib.Path(save_to).mkdir(parents=True, exist_ok=True)
!mv {fastai_model} {save_to}
!gunzip -q {save_to}/{fastai_model} -d {save_to}
!ls {save_to}

### _spaCy_ models

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg