In [11]:
import pandas as pd
import os

original_csv_path = "../data/original-mineralimage5K/minerals_full.csv"

In [None]:
df = pd.read_csv(original_csv_path)

In [25]:
import pandas as pd
from collections import Counter
import ast

def analyze_column_types(df, column_name, sample_size=100):
    # Get the column
    column = df[column_name]
    
    # Take a sample if the column is large
    if len(column) > sample_size:
        sample = column.sample(sample_size, random_state=42)
    else:
        sample = column
    
    # Function to safely evaluate strings that might be Python literals
    def safe_eval(val):
        try:
            return ast.literal_eval(val)
        except:
            return val

    # Analyze types
    type_counts = Counter(type(safe_eval(val)).__name__ for val in sample if pd.notna(val))
    
    print(f"Type analysis for column '{column_name}':")
    for type_name, count in type_counts.items():
        print(f"  {type_name}: {count}")
    
    # If we find strings, let's check if they might be JSON or list-like
    if 'str' in type_counts:
        str_sample = [val for val in sample if isinstance(val, str)][:5]
        print("\nExample string values:")
        for val in str_sample:
            print(f"  {val[:50]}..." if len(val) > 50 else f"  {val}")
        
        # Check if strings might be JSON or list-like
        json_like = any(val.strip().startswith(('[', '{')) for val in str_sample)
        if json_like:
            print("\nNote: Some string values appear to be JSON or list-like. "
                  "You may want to parse these.")

# Usage
# Assuming df is your DataFrame
analyze_column_types(df, 'en_variety')

Type analysis for column 'en_variety':
  str: 4

Example string values:
  viluit
  kleiophane
  kleiophane
  adular


In [23]:
df['en_variety']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44784 entries, 0 to 44783
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               44784 non-null  object
 1   path             44784 non-null  object
 2   height           44784 non-null  int64 
 3   width            44784 non-null  int64 
 4   ru_name          44784 non-null  object
 5   ru_synonim_name  44784 non-null  object
 6   ru_variety       12997 non-null  object
 7   ru_satellites    13791 non-null  object
 8   ru_description   21248 non-null  object
 9   en_name          44784 non-null  object
 10  en_variety       339 non-null    object
 11  en_satellites    5325 non-null   object
 12  en_description   21247 non-null  object
 13  text_boxes       44784 non-null  object
 14  mineral_boxes    44784 non-null  object
dtypes: int64(2), object(13)
memory usage: 5.1+ MB


In [12]:

min_full_dict = dict()

for _, row in df.iterrows():
    min_id = row['id']
    path = row['path']
    min_en_name = row['en_name']
    min_en_var = row['en_variety']
    min_en_description = row['en_description']
    text_boxes = row['text_boxes']
    min_boxes = row['mineral_boxes']

    
    min_full_dict[min_id] = {
        'path': path,
        'name': min_en_name,
        'variety': min_en_var,
        'description': min_en_description,
        'text_boxes': text_boxes,
        'mineral_boxes': min_boxes
    }
    
    
    
    

In [13]:
for min_id, min_data in list(min_full_dict.items()):
    print(min_id)
    print(min_data)
    break

10_1
{'path': 'data/10_meteor/10_1/FMM_10_1.jpg', 'name': 'meteorite kunya-urgench', 'variety': nan, 'description': 'a fragment weighing 80.4 grams of kunya-urgench stony chondrite with 20-25% metal phases. dropped on june 20, 1998.', 'text_boxes': '[[[0.74316, 0.6229], [0.81641, 0.66565]], [[0.72266, 0.67023], [0.83398, 0.70534]]]', 'mineral_boxes': '[{"label": "a rock", "confidence": 0.326, "box": [0.17871, 0.24733, 0.64941, 0.78473]}]'}


we now create a dataset using mineral_98

In [14]:
min_98_df = pd.read_csv("../data/original-mineralimage5K/minerals_98.csv")


In [18]:
ids = min_98_df['id'].values
min_full_dict[ids[0]]

{'path': 'data/1_syst/00000/1_10/1_10.jpg',
 'name': 'graphite',
 'variety': nan,
 'description': 'grained graphite. sample size 5.5 x 3.5 x 2 cm',
 'text_boxes': '[[[0.25781, 0.59238], [0.29785, 0.61584]], [[0.24805, 0.6217], [0.2666, 0.6393]]]',
 'mineral_boxes': '[{"label": "a rock", "confidence": 0.268, "box": [0.36035, 0.31818, 0.6709, 0.69062]}, {"label": "a mineral", "confidence": 0.1, "box": [0.3623, 0.31672, 0.6709, 0.68328]}]'}

In [26]:
from PIL import Image
min_98  = {
    'image': [],
    'name': [],
    'description': [],
    'mineral_boxes': [],
}

for id in ids:
    try:
        path = os.path.join('../data/original-mineralimage5K/', min_full_dict[id]['path'])
        image = Image.open(path).convert('RGB')
    except FileNotFoundError:
        print(f"File not found for id {id} at path {path}")
        continue
    min_98['image'].append(image)
    min_98['name'].append(min_full_dict[id]['name'])
    min_98['description'].append(min_full_dict[id]['description'])
    min_98['mineral_boxes'].append(min_full_dict[id]['mineral_boxes'])

File not found for id 1_12052 at path ../data/original-mineralimage5K/data/1_syst/10000/1_12052/1_12052_gypsum_w.JPG
File not found for id 1_12149 at path ../data/original-mineralimage5K/data/1_syst/10000/1_12149/1_12149.jpg
File not found for id 1_15430 at path ../data/original-mineralimage5K/data/1_syst/10000/1_15430/1_15430_gypsum_w.JPG
File not found for id 1_15876 at path ../data/original-mineralimage5K/data/1_syst/10000/1_15876/1_15876_gypsum_w.JPG
File not found for id 1_16705 at path ../data/original-mineralimage5K/data/1_syst/10000/1_16705/1_16705_1_gold_w.jpg
File not found for id 1_17760 at path ../data/original-mineralimage5K/data/1_syst/10000/1_17760/1_17760_beryl_NM_w.jpg
File not found for id 1_17763 at path ../data/original-mineralimage5K/data/1_syst/10000/1_17763/1_17763_beryl_NM_w.jpg
File not found for id 1_17906 at path ../data/original-mineralimage5K/data/1_syst/10000/1_17906/1_17906_NM_w.jpg
File not found for id 1_17907 at path ../data/original-mineralimage5K/dat