# GBIF Exploration Notebook

Because `polars` behaves strangely when reading the `.csv` files that contain `"`, we are going to find the point where `polars` starts to fail and generate a new file where this behavior does not happen.

In [1]:
import os
import time
import shutil
import warnings
from tqdm.notebook import tqdm
from dwca import DarwinCoreArchive
from dwca.classes import DataFile
from typing import List, Tuple

In [2]:
# Directory with the archive's data files (e.g. occurrence.txt) that later cells
# read line by line; presumably the extracted contents of data/chile-dwca.zip.
folder = "data/chile-dwca"

In [3]:
# Open the Darwin Core Archive in lazy mode; the bare name on the last line
# makes the notebook display the archive's repr.
gbif_chile = DarwinCoreArchive.from_file("data/chile-dwca.zip", lazy=True)
gbif_chile

  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespace for "
  warn(f"{element.get('term')} not in expected namespac

<Darwin Core Archive (10.15468/dl.p8492v [Core: http://rs.tdwg.org/dwc/terms/Occurrence, Entries: 5584600])>

In [4]:
# Number of fields declared for the core data file.
given_cols = len(gbif_chile.core.fields)
print(f"Columns according to XML file in Core DataFile: {given_cols}")

Columns according to XML file in Core DataFile: 223


In [5]:
# Show, for the core file followed by every extension, the row count given by len().
for data_file in [gbif_chile.core, *gbif_chile.extensions]:
    print(f"{data_file.filename}")
    print(f"Rows according to polars: {len(data_file)}")

occurrence.txt
Rows according to polars: 5584600
multimedia.txt
Rows according to polars: 610175
verbatim.txt
Rows according to polars: 4917150


In [6]:
def count_lines(source_path: str) -> int:
    """Return the number of lines in a text file, not counting the first one.

    The count starts at -1, so the first line (the header row in the files
    used in this notebook) is excluded. An empty file therefore yields -1.

    The file is decoded explicitly as UTF-8, like the other readers in this
    notebook, so the result does not depend on the platform's default encoding.

    Args:
        source_path: Path of the text file to inspect.

    Returns:
        Number of lines in the file minus one.
    """
    with open(source_path, "r", encoding="utf-8") as src:
        return sum(1 for _ in src) - 1

In [7]:
# Count the rows of the raw occurrence file straight from its lines, to compare
# against the row count reported above.
source_occ = os.path.join(folder, "occurrence.txt")
total = count_lines(source_occ)
print(f"Rows according to the lines in the file {total}")

Rows according to the lines in the file 10665951


We have fewer rows in the polars LazyFrame than lines in the file. Let's find out where that happens:

In [8]:
def copy_until_line(source_path: str, destination_path: str, stop_line: int) -> None:
    """Copy the leading lines of a text file into a new file.

    Lines are numbered from 0 and every line whose index is ``<= stop_line``
    is copied, i.e. ``stop_line + 1`` lines at most (a header plus
    ``stop_line`` rows). A negative ``stop_line`` produces an empty file.

    Both files are handled explicitly as UTF-8, like the other readers and
    writers in this notebook, instead of relying on the platform default.

    Args:
        source_path: Path of the file to read.
        destination_path: Path of the file to create or overwrite.
        stop_line: Zero-based index of the last line to copy.
    """
    with open(source_path, "r", encoding="utf-8") as src, \
            open(destination_path, "w", encoding="utf-8") as dst:
        for current_line, content in enumerate(src):
            if current_line > stop_line:
                break
            dst.write(content)

In [9]:
def count_valid_rows(file_path: str, data_file: DataFile, expected: int) -> bool:
    """Reload ``data_file`` from ``file_path`` and check its row count.

    The data file is closed first, re-read lazily from the given path, its
    length is compared with ``expected``, and it is closed again. The final
    ``close()`` runs in a ``finally`` clause so it also happens when reading
    or counting fails.

    Args:
        file_path: Path of the text file to load into ``data_file``.
        data_file: Data file object that is re-read; its previous content is
            closed and replaced.
        expected: Row count that ``len(data_file)`` should report.

    Returns:
        ``True`` when ``len(data_file)`` equals ``expected``, else ``False``.
    """
    data_file.close()
    try:
        with open(file_path, "rb") as file:
            data_file.read_file("", source_file=file, lazy=True)
        return len(data_file) == expected
    finally:
        data_file.close()

In [10]:
# Scratch file that receives the truncated copies of occurrence.txt during the
# search below; it is removed once the search ends.
dest_occ = os.path.join(folder, "occurrence_var.txt")

In [11]:
# Initial step (in lines) of the search below; it is divided by 10 after each pass.
window = 1000000

In [12]:
# Narrow down the line count at which the rows read back stop matching the
# lines copied. Each pass walks from `prev_count` towards `current_count` in
# steps of `window`; the step is then divided by 10, and the search ends when
# it reaches 0.
# Lower bound of the search: advanced to every count that still matched.
prev_count = 1
# Upper bound of the search: lowered to the first count that did not match.
current_count = total
# Silence all warnings during the search (the `action` keyword needs Python 3.11+).
with warnings.catch_warnings(action="ignore"):
    while window != 0:
        first = True
        for i in range(prev_count, current_count, window):
            # end="\r" rewrites the same output line on every attempt.
            print(f"Trying with {i} and windows of {window} from {prev_count} to {current_count}", end="\r")
            if first:
                # Pause once per pass; the purpose is not evident from the code
                # (presumably to leave the progress line visible) - TODO confirm.
                time.sleep(5)
                first = False
            # Copy the header plus `i` rows and check that exactly `i` rows are read back.
            copy_until_line(source_occ, dest_occ, i)
            valid = count_valid_rows(dest_occ, gbif_chile.core, i)
            if not valid:
                print("")
                current_count = i
                print(f"Error on {i} lines, changing window")
                break
            else:
                prev_count = i
        window = window // 10
# Delete the scratch copy once the search has finished.
os.remove(dest_occ)

Trying with 1000001 and windows of 1000000 from 1 to 10665951
Error on 1000001 lines, changing window
Trying with 100001 and windows of 100000 from 1 to 1000001
Error on 100001 lines, changing window
Trying with 20001 and windows of 10000 from 10001 to 100001
Error on 20001 lines, changing window
Trying with 16001 and windows of 1000 from 15001 to 20001
Error on 16001 lines, changing window
Trying with 15601 and windows of 100 from 15501 to 16001
Error on 15601 lines, changing window
Trying with 15598 and windows of 1 from 15597 to 156011
Error on 15598 lines, changing window


Exploring the resulting file, the disparity starts at the first unpaired `"`:

```
4009932728				CC_BY_NC_4_0	2023-09-16T01:57:52Z	iNaturalist.org	https://www.inaturalist.org/observations/144794560	Javiera San Martin Gatica					iNaturalist	Observations	iNaturalist research-grade observations		HUMAN_OBSERVATION				https://www.inaturalist.org/observations/144794560	144794560		Javiera San Martin Gatica															PRESENT																							2022-11-09T09:52	09:52:00-03:00	313	313	2022	11	9	2022/11/09 9:52 AM											SOUTH_AMERICA				CL	Araucanía				38°39'34. 72°36'20.2"W, Trabunco, Temuco, Araucanía, Chile								-38.659496	-72.605606	184.0																															321859002				Javiera San Martin Gatica		2022-12-21T15:35:01				518443		6957761						Apterodorcus bacchus (Hope, 1845)								Animalia	Arthropoda	Insecta	Coleoptera		Lucanidae				Apterodorcus	Apterodorcus			bacchus			SPECIES				ACCEPTED			50c9509d-22c7-4a22-a47d-8c48425ef4a7	US	2024-10-19T13:45:35.540Z						CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH_TAXON_ID_IGNORED	StillImage	true	false	6957761	6957761	1	54	216	1470	3263244	4736441		6957761	Apterodorcus bacchus	Apterodorcus bacchus (Hope, 1845)	Apterodorcus bacchus		DWC_ARCHIVE	2024-10-19T13:45:35.540Z	2024-10-18T20:49:13.203Z	true		null	false	LATIN_AMERICA	NORTH_AMERICA	CHL	Chile	CHL.3_1	Araucanía	CHL.3.1_1	Cautín	CHL.3.1.17_1	Temuco	NE
```

Specifically:

```
38°39'34. 72°36'20.2"W
```

So, we are going to replace every `"` with a single `'`.

In [13]:
# Folder for the per-file reports of rejected rows: a .log with the reasons and
# a .txt with the raw lines.
output_folder = "data/chile-dwca-report"
os.makedirs(output_folder, exist_ok=True)

In [14]:
# Folder that receives the rewritten copies of the data files.
new_folder = "data/chile-dwca-var"
os.makedirs(new_folder, exist_ok=True)

In [15]:
def check_lines(file_path: str, new_file: str) -> List[Tuple[int, str, str]]:
    """Rewrite a tab-separated file without double quotes and report odd rows.

    The number of tab-separated columns of the first line is the reference.
    Every line with that same number of columns is written to ``new_file``
    with each ``"`` replaced by ``'``. Lines with a different number of
    columns are not written; they are collected and returned instead.

    A short summary (reference column count, lines read, lines written) is
    printed at the end.

    Args:
        file_path: Path of the UTF-8 text file to read.
        new_file: Path of the UTF-8 text file to create or overwrite.

    Returns:
        One ``(line_index, reason, raw_line)`` tuple per rejected line, with
        ``line_index`` counted from 0.
    """
    rejected: List[Tuple[int, str, str]] = []
    rows_read = 0
    rows_written = 0
    expected_columns = 0
    with open(file_path, "r", encoding="utf-8") as source, \
            open(new_file, "w", encoding="utf-8") as target:
        for row in tqdm(source):
            # Same value as len(row.split("\t")).
            n_columns = row.count("\t") + 1
            if rows_read == 0:
                expected_columns = n_columns
            if n_columns == expected_columns:
                # replace() leaves a line without double quotes unchanged.
                target.write(row.replace('"', "'"))
                rows_written += 1
            else:
                rejected.append((
                    rows_read,
                    f"Less lines than first line {expected_columns} != {n_columns}",
                    row,
                ))
            rows_read += 1
    print(f"Expected columns according to first line: {expected_columns}")
    print(f"Expected rows: {rows_read} (First row header: {rows_read - 1})")
    print(f"New files has {rows_written} rows (First row header: {rows_written - 1})")
    return rejected

In [16]:
# For each data file: write the cleaned copy into `new_folder`, then dump the
# rejected rows into `output_folder` (reasons in the .log, raw lines in the .txt).
for file_name in ("occurrence", "multimedia", "verbatim"):
    print(f"====== {file_name} ======")
    data_name = f"{file_name}.txt"
    errors = check_lines(
        os.path.join(folder, data_name),
        os.path.join(new_folder, data_name),
    )
    log_path = os.path.join(output_folder, f"{file_name}.log")
    rejected_path = os.path.join(output_folder, data_name)
    with open(log_path, "w", encoding="utf-8") as log_file, \
            open(rejected_path, "w", encoding="utf-8") as file:
        log_file.writelines(f"{row}\t{reason}\n" for row, reason, _ in errors)
        file.writelines(line for _, _, line in errors)
    print("")



0it [00:00, ?it/s]

Expected columns according to first line: 223
Expected rows: 10665952 (First row header: 10665951)
New files has 10665952 rows (First row header: 10665951)



0it [00:00, ?it/s]

Expected columns according to first line: 15
Expected rows: 610176 (First row header: 610175)
New files has 610176 rows (First row header: 610175)



0it [00:00, ?it/s]

Expected columns according to first line: 190
Expected rows: 10665952 (First row header: 10665951)
New files has 10665952 rows (First row header: 10665951)



In [17]:
# Close the core data file and then each extension.
gbif_chile.core.close()
for extension in gbif_chile.extensions:
    extension.close()

Check whether the files in `data/chile-dwca-var/` now have the same number of rows:

In [18]:
# Re-read every rewritten file and compare the row count reported by len()
# with the number of lines counted directly in the file.
for data_file in [gbif_chile.core] + gbif_chile.extensions:
    print(f"====== {data_file.filename} ======")
    # Use the folder variable from the earlier cell instead of repeating its literal path.
    file_path = os.path.join(new_folder, data_file.filename)
    with open(file_path, "rb") as file:
        data_file.read_file("", source_file=file, lazy=True)
    print(f"Rows according to polars: {len(data_file)}")
    # Kept in its own name so the `total` used by the search cell is not overwritten.
    var_total = count_lines(file_path)
    print(f"Rows according to the lines in the file {var_total}")



  warn("Reading in lazy evaluation mode generates a temporal file, make sure to call close() to "


Rows according to polars: 10665951
Rows according to the lines in the file 10665951
Rows according to polars: 610175
Rows according to the lines in the file 610175
Rows according to polars: 10665951
Rows according to the lines in the file 10665951


In [19]:
# Close the core data file and then each extension.
gbif_chile.core.close()
for extension in gbif_chile.extensions:
    extension.close()

Now, we are going to generate the Darwin Core Archive for this new version:

In [20]:
# Copy into the new folder every entry of the original folder that is not
# already there, so the rewritten data files are kept.
# Listed once up front: each copy only adds the entry being processed, so the
# membership test for the remaining entries is unaffected.
already_present = set(os.listdir(new_folder))
for file in os.listdir(folder):
    if file in already_present:
        continue
    file_path = os.path.join(folder, file)
    if os.path.isfile(file_path):
        shutil.copy2(file_path, new_folder)
    elif os.path.isdir(file_path):
        shutil.copytree(file_path, os.path.join(new_folder, file))
    else:
        # Neither a regular file nor a directory: report it instead of copying.
        print(f"Skipped (neither a file nor a directory): {file_path}")

In [21]:
!cd data/chile-dwca-var && zip -r chile-dwca-var.zip .
!cd ../..
!mv data/chile-dwca-var/chile-dwca-var.zip data

  adding: verbatim.txt (deflated 89%)
  adding: meta.xml (deflated 89%)
  adding: dataset/ (stored 0%)
  adding: dataset/2987e655-3587-4017-b9f4-b15e32cfcf2e.xml (deflated 79%)
  adding: dataset/42fc31bd-81d7-48eb-949b-d0b1e73b34b4.xml (deflated 63%)
  adding: dataset/5d358c1d-86c8-4512-9068-abccfa192e09.xml (deflated 80%)
  adding: dataset/d7935e1b-a3af-427e-afd7-88ee385dc1e7.xml (deflated 70%)
  adding: dataset/853ddf57-13fb-4bb4-b41a-3160066a9415.xml (deflated 73%)
  adding: dataset/4b7aba1e-b67c-4b7f-aa77-900ad5c7fa80.xml (deflated 71%)
  adding: dataset/0ead7539-70d6-4486-9a37-2c273413ec51.xml (deflated 78%)
  adding: dataset/80faa258-c22e-4c99-af1a-29fa6d65e2cb.xml (deflated 82%)
  adding: dataset/b55b3fb1-bbda-4263-95e5-f456de409239.xml (deflated 77%)
  adding: dataset/b861873d-0fbf-409e-8c55-56197472963d.xml (deflated 68%)
  adding: dataset/b1eb82cd-2851-49c4-9397-bbddbb965cc1.xml (deflated 69%)
  adding: dataset/394f08a6-04f6-49cb-8cf1-81f0c819ad2d.xml (deflated 78%)
  adding:

<table>
    <tr>
        <td colspan="3" style="text-align: center;"><p>BIODATA - <a href="https://ieb-chile.cl/en/" target="_blank">Institute of Ecology and Biodiversity</a> © 2024</p></td>
    </tr>
</table>