### Excel file manipulation

Import the given Excel file, keep its first 10 rows, and replace the missing-value placeholder (`x`) with NaN.

In [1]:
import pandas as pd
import numpy as np

# Read the xlsx file and keep only its first 10 rows.
df = pd.read_excel('lables.xlsx').head(10)

# Summary of the frame *before* placeholder replacement: cells holding the
# literal string 'x' are still counted as non-null here.
df.info()

# Replace all occurrences of 'x' with NaN. Reassign instead of using
# inplace=True so the cell does not mutate the frame obtained from head().
df = df.replace('x', np.nan)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ID                    10 non-null     int64 
 1   Altro_ID              9 non-null      object
 2   Nome_verbatim         10 non-null     object
 3   Nome_floritaly        10 non-null     object
 4   Località di raccolta  10 non-null     object
 5   Altitudine            9 non-null      object
 6   Giorno                10 non-null     object
 7   Mese                  10 non-null     object
 8   Anno                  10 non-null     object
 9   Legit                 0 non-null      object
 10  Determinavit          0 non-null      object
 11  Note                  2 non-null      object
 12  Note sui cartellini   2 non-null      object
 13  Dati estratti da AI   9 non-null      object
dtypes: int64(1), object(13)
memory usage: 1.2+ KB


In [6]:
import json

# Columns whose values are copied into the JSON "text" payload of each record.
metadata_columns = [
    "Nome_verbatim",
    "Nome_floritaly",
    "Località di raccolta",
    "Altitudine",
    "Giorno",
    "Mese",
    "Anno",
]

# One {"file_name", "text"} record per usable row of the dataframe.
metadata_list = []

for index, row in df.iterrows():
    # Only use rows where Note, Determinavit and Legit are all empty,
    # to avoid weird cases.
    if not (
        pd.isnull(row['Note'])
        and pd.isnull(row['Determinavit'])
        and pd.isnull(row['Legit'])
    ):
        continue

    # Missing cells are NaN; json.dumps would serialize them as the bare token
    # NaN, which is not valid JSON. Map them to None so they become null.
    metadata_dict = {
        column: None if pd.isnull(row[column]) else row[column]
        for column in metadata_columns
    }

    metadata_list.append({
        # File name is the ID left-padded with zeros to 5 characters, plus ".jpg".
        "file_name": f"{str(row['ID']).zfill(5)}.jpg",
        # The metadata is stored as a JSON-encoded string under "text".
        "text": json.dumps(metadata_dict),
    })

# Write the records as JSON Lines: one JSON object per line.
with open('metadata.jsonl', 'w', encoding='utf-8') as outfile:
    for entry in metadata_list:
        json.dump(entry, outfile)
        outfile.write('\n')