Skip to content

Commit

Permalink
Update WSDModel, add MFS baseline, add unit testing
Browse files Browse the repository at this point in the history
  • Loading branch information
LeonardoEmili committed Nov 21, 2021
1 parent cf3dd21 commit a159f6f
Show file tree
Hide file tree
Showing 35 changed files with 1,999 additions and 291 deletions.
65 changes: 53 additions & 12 deletions conf/data/default_data.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,53 @@
train_path: 'data/train.tsv'
validation_path: 'data/validation.tsv'
test_path: 'data/test.tsv'

train_ds: 'semcor'
semcor_data_path: 'data/WSD_Training_Corpora/SemCor/semcor.data.xml'
semcor_key_path: 'data/WSD_Training_Corpora/SemCor/semcor.gold.key.txt'
semcor_omsti_data_path: 'data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml'
semcor_omsti_key_path: 'data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.gold.key.txt'

batch_size: 16
num_workers: 1
# Legacy flat TSV splits (superseded by the `corpora` section below).
train_path: "data/train.tsv"
validation_path: "data/validation.tsv"
test_path: "data/test.tsv"

# Corpus selection per split; each value must be a key of `corpora` below.
train_ds: "semcor"
val_ds: "semeval2007"
test_ds: "semeval2015"

preprocessed_dir: "data/preprocessed/"
force_preprocessing: false  # re-run preprocessing even if a dump already exists
dump_preprocessed: true     # persist preprocessed data under `preprocessed_dir`
use_synset_vocab: true

# WordNet lookup tables (JSON dumps).
wordnet:
  glosses: "data/wordnet/means/glosses.json"
  lemma_means: "data/wordnet/means/lemma_means.json"
  lexeme_means: "data/wordnet/means/lexeme_means.json"
  sense_means: "data/wordnet/means/sense_means.json"

# WSD corpora: each entry pairs a data XML file with its gold key file.
corpora:
  semcor:
    data_path: "data/WSD_Training_Corpora/SemCor/semcor.data.xml"
    key_path: "data/WSD_Training_Corpora/SemCor/semcor.gold.key.txt"
  semcor+omsti:
    data_path: "data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml"
    key_path: "data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.gold.key.txt"
  # NOTE(review): `omsti` points at the same combined SemCor+OMSTI dump as
  # `semcor+omsti` — presumably the loader extracts the OMSTI subset; confirm
  # against the dataset reader.
  omsti:
    data_path: "data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.data.xml"
    key_path: "data/WSD_Training_Corpora/SemCor+OMSTI/semcor+omsti.gold.key.txt"
  semeval_all:
    data_path: "data/WSD_Unified_Evaluation_Datasets/ALL/ALL.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/ALL/ALL.gold.key.txt"
  semeval2007:
    data_path: "data/WSD_Unified_Evaluation_Datasets/semeval2007/semeval2007.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/semeval2007/semeval2007.gold.key.txt"
  semeval2013:
    data_path: "data/WSD_Unified_Evaluation_Datasets/semeval2013/semeval2013.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/semeval2013/semeval2013.gold.key.txt"
  semeval2015:
    data_path: "data/WSD_Unified_Evaluation_Datasets/semeval2015/semeval2015.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/semeval2015/semeval2015.gold.key.txt"
  senseval2:
    data_path: "data/WSD_Unified_Evaluation_Datasets/senseval2/senseval2.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/senseval2/senseval2.gold.key.txt"
  senseval3:
    data_path: "data/WSD_Unified_Evaluation_Datasets/senseval3/senseval3.data.xml"
    key_path: "data/WSD_Unified_Evaluation_Datasets/senseval3/senseval3.gold.key.txt"

# DataLoader settings.
batch_size: 32
num_workers: 0

# Sense vocabulary construction.
min_freq_senses: 1            # keep senses occurring at least this many times
allow_multiple_senses: false  # whether an instance may keep several gold senses
10 changes: 10 additions & 0 deletions conf/logging/wandb_logging.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Master switch: set to true to enable Weights & Biases logging.
log: false

# Instantiated via Hydra (`_target_`).
wandb_logger:
  _target_: pytorch_lightning.loggers.WandbLogger
  entity: LeonardoEmili
  project: neural-wsd

# Arguments forwarded to `WandbLogger.watch` (gradient/parameter logging).
watch:
  log: "all"
  log_freq: 100
24 changes: 24 additions & 0 deletions conf/model/default_model.yaml
Original file line number Diff line number Diff line change
@@ -1 +1,25 @@
# HuggingFace model identifiers.
tokenizer: "bert-base-cased"
model_name: "bert-base-cased"

# Optimiser settings; the language model uses separate, smaller rates.
learning_rate: 1e-3
min_learning_rate: 1e-4
language_model_learning_rate: 1e-5
language_model_min_learning_rate: 1e-6
language_model_weight_decay: 1e-4

# Candidate-restriction masks — confirm exact semantics in the model code.
use_lemma_mask: false
use_lexeme_mask: false

# Instantiated via Hydra (`_target_`).
word_encoder:
  _target_: src.layers.word_encoder.WordEncoder
  fine_tune: false  # freeze the underlying language model when false
  word_dropout: 0.2
  model_name: ${model.model_name}

# Sequence encoder stacked on top of the word encoder.
sequence_encoder: lstm
lstm_encoder:
  _target_: torch.nn.LSTM
  # NOTE(review): bert-base hidden size is 768; confirm word-encoder outputs
  # are projected to 512 before reaching this LSTM.
  input_size: 512
  hidden_size: 256
  bidirectional: true
  batch_first: true
  num_layers: 2
  dropout: 0.40
8 changes: 7 additions & 1 deletion conf/root.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# Required to make the "experiments" dir the default one for the output of the models
hydra:
run:
dir: ./experiments/${train.model_name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
dir: ./experiments/${model.model_name}/${now:%Y-%m-%d}/${now:%H-%M-%S}

# Debug mode
debug: False
max_samples: 1000

defaults:
- train: default_train
- model: default_model
- data: default_data
- logging: wandb_logging
- test: default_test
3 changes: 3 additions & 0 deletions conf/test/default_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Checkpoint to evaluate; replace the placeholder before running the test entry point.
checkpoint_path: <MODEL_CHECKPOINT_PATH>
# Concrete checkpoint used as a fallback.
latest_checkpoint_path: experiments/bert-base-cased/2021-11-16/23-06-26/default_name/epoch=2-step=3485.ckpt
# Presumably selects `latest_checkpoint_path` over `checkpoint_path` when true — confirm in the test script.
use_latest: false
22 changes: 10 additions & 12 deletions conf/train/default_train.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,33 @@
# reproducibility
seed: 42

# model name
model_name: default_name # used to name the directory in which model's checkpoints will be stored (experiments/model_name/...)
# experiment name
experiment_name: default_name

# pl_trainer
pl_trainer:
_target_: pytorch_lightning.Trainer
gpus: 1
accumulate_grad_batches: 4
gradient_clip_val: 10.0
val_check_interval: 1.0 # you can specify an int "n" here => validation every "n" steps
max_steps: 100_000
# uncomment the lines below for training with mixed precision
max_epochs: 20
fast_dev_run: False
# precision: 16
# amp_level: O2


# early stopping callback
# "early_stopping_callback: null" will disable early stopping
early_stopping_callback:
_target_: pytorch_lightning.callbacks.EarlyStopping
monitor: val_loss
mode: min
monitor: val_f1_micro
mode: max
patience: 50

# model_checkpoint_callback
# "model_checkpoint_callback: null" will disable model checkpointing
model_checkpoint_callback:
_target_: pytorch_lightning.callbacks.ModelCheckpoint
monitor: val_loss
mode: min
monitor: val_f1_micro
mode: max
verbose: True
save_top_k: 5
dirpath: experiments/${train.model_name}
dirpath: ${train.experiment_name}/
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Black code-formatter configuration.
[tool.black]
line-length = 120
target-version = ['py36', 'py37', 'py38']
# Format both .py sources and .pyi stub files.
include = '\.pyi?$'
7 changes: 5 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@ pytorch-lightning==1.2.5
torch==1.8.1
nltk==3.4.5
hydra-core==1.1.0.dev5
wandb==0.10.31
transformers==4.9.1
wandb==0.12.6
transformers==4.12.3
torchtext==0.9.1
black==21.9b0
python-dotenv==0.19.1
6 changes: 3 additions & 3 deletions setup.sh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ source ~/miniconda3/etc/profile.d/conda.sh

# create conda env
read -rp "Enter environment name: " env_name
read -rp "Enter python version (e.g. 3.7) " python_version
read -rp "Enter python version (e.g. 3.9.7) " python_version
conda create -yn "$env_name" python="$python_version"
conda activate "$env_name"

# install torch
read -rp "Enter cuda version (e.g. '10.1' or 'none' to avoid installing cuda support): " cuda_version
if [ "$cuda_version" == "none" ]; then
conda install -y pytorch torchvision cpuonly -c pytorch
conda install -y pytorch cpuonly -c pytorch
else
conda install -y pytorch torchvision cudatoolkit=$cuda_version -c pytorch
conda install -y pytorch cudatoolkit=$cuda_version -c pytorch
fi

# install python requirements
Expand Down
14 changes: 14 additions & 0 deletions src/colab/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"python.defaultInterpreterPath": "/root/miniconda3/envs/neural-wsd/bin/python",
"python.formatting.provider": "black",
"python.formatting.blackArgs": [
"--line-length",
"120"
],
"files.exclude": {
"**/.classpath": true,
"**/.project": true,
"**/.settings": true,
"**/.factorypath": true
}
}
18 changes: 18 additions & 0 deletions src/colab/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# Provisions a Colab machine: installs miniconda, creates the project conda
# environment, and configures VS Code (remote) for the neural-wsd repo.
# Assumes the repository is checked out at /content/neural-wsd.

# Downloads miniconda (-b: batch/non-interactive install to the default prefix)
wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh -b
export PATH="/root/miniconda3/bin:${PATH}"
conda init

# Creates the environment and installs the project requirements into it
echo "Creating the environment"
source ~/miniconda3/etc/profile.d/conda.sh
conda create -qyn neural-wsd python=3.9.7
conda activate neural-wsd
pip install -r /content/neural-wsd/requirements.txt

# Configure vscode and overwrite default settings
code --install-extension ms-python.python
cp /content/neural-wsd/src/colab/settings.json /root/.vscode-server/data/Machine/settings.json
102 changes: 102 additions & 0 deletions src/colab/setup_colab.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Google Colab + VSCode"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yxChFURdSJfQ",
"outputId": "09dde9a4-528c-4557-e1b2-2f4a7d0a0578"
},
"outputs": [],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 381
},
"id": "aqXACjLhFX1C",
"outputId": "ffce36c3-93e9-41b1-90f7-4211ffbfd122"
},
"outputs": [],
"source": [
"!pip install -q colab_ssh python-dotenv --upgrade\n",
"\n",
"copy_env_from_gdrive = False\n",
"if copy_env_from_gdrive:\n",
" from google.colab import drive\n",
"\n",
" drive.mount(\"/content/drive\")\n",
"\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"from colab_ssh import launch_ssh_cloudflared, init_git_cloudflared\n",
"\n",
"launch_ssh_cloudflared(password=os.getenv(\"CLOUDFLARED_PASSWORD\"))\n",
"\n",
"init_git_cloudflared(\n",
" repository_url=os.getenv(\"GITHUB_REPO_URL\"),\n",
" personal_token=os.getenv(\"GITHUB_PERSONAL_ACCESS_TOKEN\"),\n",
" branch=os.getenv(\"GITHUB_BRANCH\"),\n",
" email=os.getenv(\"GITHUB_EMAIL\"),\n",
" username=os.getenv(\"GITHUB_USERNAME\"),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-tTM1EMGJH0z",
"outputId": "8839b443-f0a6-4cc8-fea9-180b0b34002c"
},
"outputs": [],
"source": [
"# Install dependencies and configure bash\n",
"%%bash\n",
"source neural-wsd/src/colab/setup.sh\n",
"echo \"cd /content/neural-wsd/\" >> ~/.bashrc\n",
"echo \"source ~/miniconda3/etc/profile.d/conda.sh\" >> ~/.bashrc\n",
"echo \"conda activate neural-wsd\" >> ~/.bashrc"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "setup_colab.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

0 comments on commit a159f6f

Please sign in to comment.