From 4cb9321aed6dcd1073923e449da1f6040d23cfc3 Mon Sep 17 00:00:00 2001 From: Juan Pablo Tosca Villanueva Date: Tue, 27 Feb 2024 14:14:14 -0500 Subject: [PATCH 1/5] Add custom sized dataset --- .gitignore | 3 + README.md | 10 +- create_sample_custom_dataset.py | 19 ++++ .../dataverse-performance-demo.json | 12 ++ .../performance-test.json | 103 ++++++++++++++++++ dv_logo_hd.svg | 30 +++++ requirements.txt | 1 + 7 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 create_sample_custom_dataset.py create mode 100644 data/dataverses/dataverse-performance-demo/datasets/performance-test/dataverse-performance-demo.json create mode 100644 data/dataverses/dataverse-performance-demo/performance-test.json create mode 100644 dv_logo_hd.svg diff --git a/.gitignore b/.gitignore index 258d77ac..99db0217 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ __pycache__ dvconfig.py ec2-create-instance.sh venv +dv_logo_*.png +*.DS_Store +sample.sh diff --git a/README.md b/README.md index 2c755113..2a46a23b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Activate the virtual environment you just created. Install dependencies into the virtual environment, especially [pyDataverse][]. - pip install -r requirements.txt + pip3 install -r requirements.txt Copy `dvconfig.py.sample` to `dvconfig.py` (see the `cp` command below) and add your API token (using your favorite text editor, which may not be `vi` as shown below). Note that the config file specifies which sample data will be created. @@ -35,12 +35,20 @@ Copy `dvconfig.py.sample` to `dvconfig.py` (see the `cp` command below) and add Note that the environment variable `$API_TOKEN` will override `api_token` in `dvconfig.py`. 
"""Generate PNG copies of the Dataverse logo for the performance-test dataset.

Prompts for a file count, then renders that many PNG files from
``dv_logo_hd.svg``, each with the logo's base color replaced by a random one.
Both the SVG and the output directory are relative paths, so run the script
from the repository root.
"""
import os
import random

# Source artwork and the color inside it that gets swapped out.
SOURCE_SVG = 'dv_logo_hd.svg'
BASE_COLOR = '#c65b28'

# Directory the generated PNG files are written to.
TARGET_PATH = './data/dataverses/dataverse-performance-demo/datasets/performance-test/files'


def parse_file_count(raw):
    """Convert the user's answer into a non-negative file count.

    Args:
        raw: Text typed at the prompt.

    Returns:
        The requested number of files as an ``int`` (``0`` is allowed).

    Raises:
        ValueError: If ``raw`` is not a whole number or is negative.
    """
    try:
        count = int(raw)
    except ValueError:
        raise ValueError(
            f"Expected a whole number of files, got {raw!r}"
        ) from None
    if count < 0:
        raise ValueError(f"Number of files cannot be negative, got {count}")
    return count


def random_hex_color():
    """Return a random color as ``#`` followed by six uppercase hex digits."""
    return '#' + ''.join(random.choices('0123456789ABCDEF', k=6))


def recolor(svg_code, color):
    """Return ``svg_code`` with every occurrence of the base color replaced."""
    # The pattern is a fixed string, so plain replace is enough (no regex).
    return svg_code.replace(BASE_COLOR, color)


def output_path(target_path, index):
    """Return the PNG path for file number ``index`` (zero-padded to 5 digits)."""
    return f"{target_path}/dv_logo_{index:05d}.png"


def generate_logos(count, svg_code, target_path, render):
    """Render ``count`` randomly colored copies of the logo.

    Args:
        count: Number of files to create.
        svg_code: SVG markup containing the base color.
        target_path: Directory to write into; created if it does not exist.
        render: Callable invoked as ``render(bytestring=..., write_to=...)``,
            e.g. ``cairosvg.svg2png``.

    Returns:
        The list of destination paths, in creation order.
    """
    # The generated PNGs are git-ignored, so the directory may not exist on a
    # fresh checkout; without this the first write fails.
    os.makedirs(target_path, exist_ok=True)

    written = []
    for index in range(count):
        destination = output_path(target_path, index)
        render(
            bytestring=recolor(svg_code, random_hex_color()),
            write_to=destination,
        )
        written.append(destination)
    return written


def main():
    """Prompt for a file count and write that many logo PNGs."""
    # Imported here so a missing native cairo library fails when the script is
    # run, not when the helpers above are merely imported.
    import cairosvg

    try:
        count = parse_file_count(input('Number of files to generate: '))
    except ValueError as error:
        # Exit with a short message instead of a traceback on bad input.
        raise SystemExit(str(error))

    with open(SOURCE_SVG, 'r', encoding='utf-8') as file:
        svg_code = file.read()

    generate_logos(count, svg_code, TARGET_PATH, cairosvg.svg2png)


if __name__ == '__main__':
    main()
b/data/dataverses/dataverse-performance-demo/datasets/performance-test/dataverse-performance-demo.json new file mode 100644 index 00000000..94c1862e --- /dev/null +++ b/data/dataverses/dataverse-performance-demo/datasets/performance-test/dataverse-performance-demo.json @@ -0,0 +1,12 @@ +{ + "name": "Dataverse performance demo", + "alias": "dataverse-performance-demo", + "dataverseContacts": [ + { + "contactEmail": "juan_tosca@iq.harvard.edu" + } + ], + "affiliation": "Harvard University", + "description": "Demo created for performance testing", + "dataverseType": "RESEARCH_PROJECTS" +} diff --git a/data/dataverses/dataverse-performance-demo/performance-test.json b/data/dataverses/dataverse-performance-demo/performance-test.json new file mode 100644 index 00000000..4faa8a05 --- /dev/null +++ b/data/dataverses/dataverse-performance-demo/performance-test.json @@ -0,0 +1,103 @@ +{ + "datasetVersion": { + "id": 4, + "datasetId": 12, + "datasetPersistentId": "doi:10.5072/FK2/JPT050", + "storageIdentifier": "file://10.5072/FK2/JPT050", + "versionNumber": 1, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "UNF": "UNF:6:VDyWtJrNd0VRwAumtzYA1Q==", + "lastUpdateTime": "2021-09-20T18:38:32Z", + "releaseTime": "2021-09-20T18:38:32Z", + "createTime": "2021-09-20T18:16:38Z", + "license": "CC0 1.0", + "termsOfUse": "CC0 Waiver", + "fileAccessRequest": false, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Dataverse performance test dataset" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Juan Pablo Tosca Villanueva" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": 
"datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Juan Pablo Tosca Villanueva" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This is a test dataset to measure the performance of the Dataverse software." + } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Social Sciences" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-05-05" + } + ] + } + }, + "citation": "IQSS, 2024, \"Dataverse performance test\", https://doi.org/10.5072/FK2/QWVFK0, Root, V1, UNF:6:VDyWtJrNd0VRwAumtzYA1Q== [fileUNF]" + } + } \ No newline at end of file diff --git a/dv_logo_hd.svg b/dv_logo_hd.svg new file mode 100644 index 00000000..2644512e --- /dev/null +++ b/dv_logo_hd.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4a1177ee..3c036a0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ pyDataverse==0.2.1 +CairoSVG==2.7.1 \ No newline at end of file From 8d0cb0f8031bcb2ff3cd1ebf6be311d6fb7643be Mon Sep 17 00:00:00 2001 From: Juan Pablo Tosca Villanueva Date: Tue, 27 Feb 2024 16:08:33 -0500 Subject: [PATCH 2/5] Files location correction and update to config sample --- .../{ => datasets/performance-test}/performance-test.json | 2 +- .../performance-test => }/dataverse-performance-demo.json | 0 dvconfig.py.sample | 2 ++ 3 files changed, 3 insertions(+), 
1 deletion(-) rename data/dataverses/dataverse-performance-demo/{ => datasets/performance-test}/performance-test.json (97%) rename data/dataverses/dataverse-performance-demo/{datasets/performance-test => }/dataverse-performance-demo.json (100%) diff --git a/data/dataverses/dataverse-performance-demo/performance-test.json b/data/dataverses/dataverse-performance-demo/datasets/performance-test/performance-test.json similarity index 97% rename from data/dataverses/dataverse-performance-demo/performance-test.json rename to data/dataverses/dataverse-performance-demo/datasets/performance-test/performance-test.json index 4faa8a05..30320440 100644 --- a/data/dataverses/dataverse-performance-demo/performance-test.json +++ b/data/dataverses/dataverse-performance-demo/datasets/performance-test/performance-test.json @@ -98,6 +98,6 @@ ] } }, - "citation": "IQSS, 2024, \"Dataverse performance test\", https://doi.org/10.5072/FK2/QWVFK0, Root, V1, UNF:6:VDyWtJrNd0VRwAumtzYA1Q== [fileUNF]" + "citation": "IQSS, 2024, \"Dataverse performance test\", https://doi.org/10.5072/FK2/JPT050, Root, V1" } } \ No newline at end of file diff --git a/data/dataverses/dataverse-performance-demo/datasets/performance-test/dataverse-performance-demo.json b/data/dataverses/dataverse-performance-demo/dataverse-performance-demo.json similarity index 100% rename from data/dataverses/dataverse-performance-demo/datasets/performance-test/dataverse-performance-demo.json rename to data/dataverses/dataverse-performance-demo/dataverse-performance-demo.json diff --git a/dvconfig.py.sample b/dvconfig.py.sample index 8f97f284..fb7583f4 100644 --- a/dvconfig.py.sample +++ b/dvconfig.py.sample @@ -25,6 +25,8 @@ sample_data = [ 'data/dataverses/open-source-at-harvard/datasets/open-source-at-harvard/open-source-at-harvard.json', 'data/dataverses/king/king.json', 'data/dataverses/king/datasets/cause-of-death/cause-of-death.json', +'data/dataverses/dataverse-performance-demo/dataverse-performance-demo.json', 
+'data/dataverses/dataverse-performance-demo/datasets/performance-test/performance-test.json', ] # put this back at line 6 once https://github.com/IQSS/dataverse/pull/6924 is merged From 8b7f2ffd4da0ce283bc4cdf679aa25e96958e32e Mon Sep 17 00:00:00 2001 From: Juan Pablo Tosca Villanueva Date: Tue, 27 Feb 2024 16:24:34 -0500 Subject: [PATCH 3/5] Instructions --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2a46a23b..a29f0fc0 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,16 @@ Note that the environment variable `$API_TOKEN` will override `api_token` in `dv ## Adding a custom dataset with specific number of files -If you want to create a dataset that include a specific number of files you can use this step +If you want to create a dataset that include a specific number of Dataverse logos with randomized color you can use: + + python create_sample_custom_dataset.py + +You will require to input how many files do you want to create, this step has to be done before you add the sample data or the dataset Dataverse performance test dataset will be empty. 
+ +If you experience any cairo errors please declare the following env variable as documented [here](https://github.com/Kozea/CairoSVG/issues/392#issuecomment-1927435606 +): + + export DYLD_LIBRARY_PATH="/opt/homebrew/opt/cairo/lib:$DYLD_LIBRARY_PATH" ## Adding sample data From 2f2163b8f169506dc6f6e55937e94aadad580118 Mon Sep 17 00:00:00 2001 From: Juan Pablo Tosca Villanueva Date: Wed, 28 Feb 2024 14:43:44 -0500 Subject: [PATCH 4/5] Docs --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a29f0fc0..d5033e51 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,13 @@ Note that the environment variable `$API_TOKEN` will override `api_token` in `dv ## Adding a custom dataset with specific number of files -If you want to create a dataset that include a specific number of Dataverse logos with randomized color you can use: +You can add a specific number of files to the dataset "Dataverse performance test dataset" with: python create_sample_custom_dataset.py -You will require to input how many files do you want to create, this step has to be done before you add the sample data or the dataset Dataverse performance test dataset will be empty. +You will be asked how many files do you want to create and will generate the specified number of files with the dataverse logo with a random color with the PNG extension. This step has to be done before you add the data or the dataset will be empty. 
-If you experience any cairo errors please declare the following env variable as documented [here](https://github.com/Kozea/CairoSVG/issues/392#issuecomment-1927435606 +If you experience the `OSError: no library called "cairo-2" was found` error please declare the following env variable as documented [here](https://github.com/Kozea/CairoSVG/issues/392#issuecomment-1927435606 ): export DYLD_LIBRARY_PATH="/opt/homebrew/opt/cairo/lib:$DYLD_LIBRARY_PATH" From 28ee998f085a5ebab2baf6f0716f93f1a1617640 Mon Sep 17 00:00:00 2001 From: Juan Pablo Tosca Villanueva Date: Wed, 28 Feb 2024 14:47:36 -0500 Subject: [PATCH 5/5] Docs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d5033e51..82b1ddf0 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ You can add a specific number of files to the dataset "Dataverse performance tes python create_sample_custom_dataset.py -You will be asked how many files do you want to create and will generate the specified number of files with the dataverse logo with a random color with the PNG extension. This step has to be done before you add the data or the dataset will be empty. +You'll be prompted to specify the number of files you wish to create. The application will then generate the requested number of files, each one with the Dataverse logo in a randomly chosen color. These files will be in PNG format. It's important to complete this step before adding any data, as the dataset will otherwise be empty. If you experience the `OSError: no library called "cairo-2" was found` error please declare the following env variable as documented [here](https://github.com/Kozea/CairoSVG/issues/392#issuecomment-1927435606 ):