In [1]:
from ethoscopy.metadata_db import db_organiser, metadata_handler, db_crawler

In [2]:

# Crawl through the folder containing all the db files from the lab.
# If refresh is set to True, all the files are scanned and a new csv list of the files available in the system is generated.
# If refresh is set to False, a pre-compiled csv list is simply loaded (as is the case in this tutorial).
# We provide a copy of `ethoscope_db.csv`, which lists all the 16.8k db files of the Gilestro lab as of Dec 2022.

datapath = "/mnt/ethoscope_results"
db = db_organiser(datapath, refresh=False)

In [3]:
# Load the metadata file and do the first processing pass
# to work out how many db files we should be looking for.
# This is a real dataset from Jones et al. 2023.

metadata_filename = 'jones_et_al_metadata.csv'
meta = metadata_handler(metadata_filename, project='coccinella_2022', authors=['Jones'] )
meta.db_files

Unnamed: 0,date,machine_name,count
0,2016-04-04,ETHOSCOPE_001,20
1,2016-04-04,ETHOSCOPE_003,20
2,2016-04-04,ETHOSCOPE_004,20
3,2016-04-04,ETHOSCOPE_005,20
4,2016-04-04,ETHOSCOPE_020,20
...,...,...,...
376,2022-02-24,ETHOSCOPE_125,24
377,2022-02-24,ETHOSCOPE_143,24
378,2022-02-24,ETHOSCOPE_151,24
379,2022-02-24,ETHOSCOPE_218,24


In [4]:
# Go through the metadata and find the relevant db files among those listed in `db`,
# then display the summary of the metadata.
meta.associate_to_db(db)
meta.summary

{'filename': 'all_ethoscope_metadata.csv',
 'entries': 8720,
 'columns': ['date', 'machine_name', 'region_id', 'db_filename', 'filesize'],
 'db_files': 381,
 'info_mtime': '2022-12-01 16:58',
 'entries_not_found': 72,
 'db_files_na': 3,
 'db_files_size': '168.73 GB',
 'tags': [],
 'project': 'coccinella_2022',
 'description': '',
 'authors': ['Jones'],
 'paper_doi': ''}

In [5]:
# The metadata list has 8720 entries, corresponding to 381 db files.
# However, 3 of these 381 files could not be found; they account for 72 animals.
# To see which experiments are missing (probably because they were mislabeled), use the following:
meta.list_dbs(notfound = True)

Unnamed: 0,date,machine_name,db_filename,filesize,count
166,2020-08-17,ETHOSCOPE_220,,,24
183,2020-09-20,ETHOSCOPE_141,,,24
184,2020-09-20,ETHOSCOPE_218,,,24


In [6]:
# The list of db files that were not found can be saved to disk in the following way:
meta.list_dbs(notfound = True).to_csv('jones_et_al_dbs_not_found.txt')

In [7]:
# Conversely, to write all the other, successfully found db files to a csv we can use the following:
meta.list_dbs().to_csv('jones_et_al_all_info.txt')

In [8]:
# The options above are rather verbose: they also include info such as the db filesize.
# If we want only the filename and location of each db, we can use the following.
# The header and the index are switched off with explicit booleans so that nothing but the filenames is written.
meta.list_dbs().db_filename.to_csv('jones_et_al_only_dbs.txt', header=False, index=False)

### Upload all the db files to Zenodo

Now we have a handy list of all the db files associated with our experiments, ready to be shared with the world.
Notice that these 381 files take up 168.73 GB, which is above the 50 GB limit that we would have on a single Zenodo repository.
We will have to zip them and use three repositories. **The following is not Python but bash!**

First we need to copy all these files to a temporary destination.
The following will create a copy of all the files we want to upload to the repository:

```
# -p: also create missing parent folders, and do not fail if the folder already exists
mkdir -p /mnt/data/temporary/jones_et_al
rsync -aruRP --files-from=jones_et_al_only_dbs.txt /mnt/data/results /mnt/data/temporary/jones_et_al/
```

Now we zip all these files into a multi-part archive, each part 2 GB in size:
```
# -s 2000m: split the archive into parts of at most 2000 MB each
zip -r -s 2000m jones_et_al_db_files.zip jones_et_al
```

Now we create some files describing what is in the archive:
```
# work from the parent folder: the zip command takes jones_et_al as a relative path, so the archive
# parts are written next to it, and the commands below also refer to jones_et_al/ by relative path
cd /mnt/data/temporary
```
Create a txt file providing the md5sum of each zip archive:
```
# "*.z*" matches the split parts (.z01, .z02, ...) as well as the final .zip
find . -type f -name "*.z*" -exec md5sum {} + > ../all_zips_with_checksum.txt
```

Do the same for the original, uncompressed db files:
```
# checksum every .db file below the current folder; the list is written one level up
find . -type f -name "*.db" -exec md5sum {} + > ../all_dbs_with_checksum.txt
```

Save a view of the structure of the folders / files to a text file:
```
# the listing of the current folder is redirected into a text file inside jones_et_al/
tree > jones_et_al/db_files_content.txt
```