# Version for the old dataset

### Formatting the dataset labels

We import the dataset labels in the `xlsx` format given to us and load them into a pandas dataframe so that we can work with them more easily. We also do some cleanup by setting NaN for unlabelled tags. Also, somewhere around line 226 or 266, I think I had to change a Day from `26 14` to `26-14`, since I'm looking for spaces to identify the days that were wrongly interpreted as dates.


In [None]:
import pandas as pd
import numpy as np

# Read the xlsx file with the labels
df = pd.read_excel('../data/lables.xlsx')

# Replace all occurrences of 'x' with NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.replace('x', np.nan, inplace=True)

columns_to_format = ['Giorno', 'Mese', 'Anno', 'Altitudine']


def _parses_as_datetime(value):
    """Return True if pandas can turn ``value`` into a valid timestamp."""
    try:
        return not pd.isna(pd.to_datetime(value, errors='coerce'))
    except ValueError:
        return False


def _day_month_if_date(value):
    """Return ``value`` as a 'day-month' string when it looks like a date.

    A value is treated as a date when it is not NaN and its string form
    contains a space (e.g. '2023-05-26 00:00:00'). Anything else, and
    anything pandas cannot parse, is returned unchanged.
    """
    if pd.isna(value) or ' ' not in str(value):
        return value
    parsed = pd.to_datetime(value, errors='coerce')
    if pd.isna(parsed):
        return value
    # Built from the attributes instead of strftime('%-d-%-m'): the '%-d'
    # flag is a platform-specific extension and raises ValueError on Windows.
    return f"{parsed.day}-{parsed.month}"


# Drop the lines where an error occurs for now: a row is removed as soon as
# one of its non-NaN fields cannot be parsed by pandas.
# The earlier version also assigned a formatted string to the row yielded by
# iterrows(); that row is not written back to df, so the loop only ever acted
# as this validity check.
rows_to_drop = []
for index, row in df.iterrows():
    for column in columns_to_format:
        value = row[column]
        if pd.notna(value) and not _parses_as_datetime(value):
            rows_to_drop.append(index)
            break
df.drop(rows_to_drop, inplace=True)

# Change the days where they were automatically formatted to dates
for column in columns_to_format:
    df[column] = df[column].apply(_day_month_if_date)

### Creating the metadata.jsonl

We have a folder with images and we want to create the `metadata.jsonl` file, which associates text from the dataframe with the images as ground truth. This is necessary for the `imagefolder` feature of `datasets`.
    
In the end, the `metadata.jsonl` should look similar to the example below.

```json
    {"file_name": "0001.png", "ground_truth": "This is a golden retriever playing with a ball"}
    {"file_name": "0002.png", "ground_truth": "A german shepherd"}
```
In our example, the `"text"` column will contain the OCR text of the image, which will later be used for creating the Donut-specific format.

In [None]:
from pathlib import Path
import json
import os

# Folder holding the images; the metadata file is written next to them
image_path = "../data/img/"

# Ground-truth key -> dataframe column, in the order the keys are serialized
ground_truth_columns = {
    "Nome_verbatim": 'Nome_verbatim',
    "Locality": 'Località di raccolta',
    "Elevation": 'Altitudine',
    "Day": 'Giorno',
    "Month": 'Mese',
    "Year": 'Anno',
}

# One entry per usable row of the dataframe
metadata_list = []

for _, record in df.iterrows():
    # Only use rows that have no note, no 'Determinavit' and no 'Legit' value
    if pd.notnull(record['Note']) or pd.notnull(record['Determinavit']) or pd.notnull(record['Legit']):
        continue

    # Missing values are written as a single space
    record = record.fillna(' ')

    # Label fields for this row, serialized below as one JSON string
    labels = {key: record[column] for key, column in ground_truth_columns.items()}

    # Entry with the "ground_truth" and "file_name" keys; the image name is
    # the ID left-padded with zeros to five characters
    metadata_list.append({
        "ground_truth": json.dumps(labels),
        "file_name": f"{str(record['ID']).zfill(5)}.jpg",
    })

# Write the entries as JSON lines into the image folder
jsonl_file_path = os.path.join(image_path, 'metadata.jsonl')
with open(jsonl_file_path, 'w') as outfile:
    outfile.writelines(json.dumps(item) + '\n' for item in metadata_list)

#### Delete images that are not in the metadata.jsonl

---

After I have created a copy, I can delete the images that are not in the metadata from the folder that I actually use.

In [None]:
from tqdm import tqdm

image_path = "../data/img/"
metadata_file = image_path + "metadata.jsonl"

# Load the list of image files from the metadata file
with open(metadata_file, 'r') as f:
    metadata_list = [json.loads(line)['file_name'] for line in f]

# Set of the same names, so each membership test below is a hash lookup
# instead of a scan over the whole list
files_with_metadata = set(metadata_list)

# List the folder once so the progress bar total and the loop see the same entries
folder_entries = os.listdir(image_path)

# Count the number of deleted files
deleted_count = 0

# Delete the .jpg files that have no metadata entry; other files are left alone
for file_name in tqdm(folder_entries, desc="Going through files"):
    if file_name.endswith('.jpg') and file_name not in files_with_metadata:
        os.remove(os.path.join(image_path, file_name))
        deleted_count += 1

print(f"Number of files deleted: {deleted_count}")

#### Smaller image size dataset

Create a copy of the images at half the size and rotate them if necessary. Do everything with a progress bar.

In [None]:
import json
from PIL import Image, UnidentifiedImageError
import shutil
import os
from tqdm import tqdm

input_dir = "../data/img/"
output_dir = "../data/img_resized"

# Create the output folder; exist_ok makes this a no-op if it is already there
os.makedirs(output_dir, exist_ok=True)

image_files = [filename for filename in os.listdir(input_dir) if filename.endswith(".jpg")]

metadata_file = input_dir + "metadata.jsonl"

# Load every metadata entry of the source folder
with open(metadata_file, "r") as f:
    metadata = [json.loads(line) for line in f]

# Index the entries by file name so each image needs one lookup instead of a
# scan over all entries; setdefault keeps the first entry for a repeated name
metadata_by_name = {}
for entry in metadata:
    metadata_by_name.setdefault(entry["file_name"], entry)

# Entries of the images that were actually resized and saved
updated_metadata = []

for filename in tqdm(image_files, desc="Processing images"):
    try:
        with Image.open(os.path.join(input_dir, filename)) as img:
            # NOTE(review): every image is resized to a fixed 1600x1200,
            # whatever its original size or orientation, and no rotation is
            # applied here - confirm this matches the source images.
            resized_img = img.resize((1600, 1200))

            # Carry the EXIF block over to the copy when the source has one
            exif_data = img.info.get('exif')
            save_options = {} if exif_data is None else {'exif': exif_data}
            resized_img.save(os.path.join(output_dir, filename), **save_options)
    except FileNotFoundError:
        print(f"File not found: {filename}")
    except UnidentifiedImageError:
        print(f"Cannot identify image file: {filename}")
    else:
        # Keep the metadata entry only for images that were written
        if filename in metadata_by_name:
            updated_metadata.append(metadata_by_name[filename])

# Write the metadata of the resized images next to them
with open(os.path.join(output_dir, "metadata.jsonl"), "w") as f:
    for entry in updated_metadata:
        f.write(json.dumps(entry) + "\n")