In [38]:
import pandas as pd
import os

# Number of models with 1 and 2 images

In [46]:
dirpath = 'data/'

# Read the test CSV into a DataFrame
product_data = pd.read_csv(dirpath + 'test_data.csv')

# Distinct codes found in the 'cod_modelo_color' column
unique_values = product_data['cod_modelo_color'].unique()

# Directory that holds the image files
image_folder = '../datathon-fme-mango/archive/images/images/'

# Names of every entry in that directory
image_files = os.listdir(image_folder)

# One record per code: the number of entry names that contain the code
# as a substring (plain `in` test, not a prefix or exact match)
data = [
    {
        'cod_modelo_color': code,
        'file_count': len([name for name in image_files if code in name]),
    }
    for code in unique_values
]

# Turn the records into a DataFrame
result_df = pd.DataFrame(data)

In [47]:
# Distribution of images-per-code: how many codes matched 1 file, 2 files, ...
result_df.loc[:, 'file_count'].value_counts()

file_count
1    6529
Name: count, dtype: int64

# Number of unique values per attribute

In [41]:
dirpath = 'data/'

# Load the attribute table
attribute_data = pd.read_csv(dirpath+'attribute_data.csv')

# Per 'attribute_name': number of distinct textual values ('des_value')
result = (
    attribute_data.groupby('attribute_name')['des_value']
    .nunique()
    .reset_index(name='unique_des_value_count')
)

# Per 'attribute_name': number of distinct coded values ('cod_value').
# The count column is named after 'cod_value' so that the two printed tables
# can be told apart (it previously reused the 'des_value' label).
result2 = (
    attribute_data.groupby('attribute_name')['cod_value']
    .nunique()
    .reset_index(name='unique_cod_value_count')
)

# Show both tables; comparing them reveals attributes where the number of
# distinct descriptions differs from the number of distinct codes
print(result)
print(result2)

        attribute_name  unique_des_value_count
0     cane_height_type                       6
1    closure_placement                       6
2      heel_shape_type                      11
3       knit_structure                       5
4          length_type                      12
5      neck_lapel_type                      33
6      silhouette_type                      33
7   sleeve_length_type                       6
8          toecap_type                       4
9           waist_type                       4
10     woven_structure                       4
        attribute_name  unique_des_value_count
0     cane_height_type                       3
1    closure_placement                       6
2      heel_shape_type                      11
3       knit_structure                       5
4          length_type                      12
5      neck_lapel_type                      33
6      silhouette_type                      33
7   sleeve_length_type                       6
8          to

# What those unique values are

In [42]:
dirpath = 'data/'

# Read the attribute table
attribute_data = pd.read_csv(dirpath + 'attribute_data.csv')


def _distinct_as_list(values):
    """Return the distinct entries of a Series as a plain list."""
    return list(values.unique())


# For every 'attribute_name', collect the distinct 'des_value' entries
des_values_by_attribute = attribute_data.groupby('attribute_name')['des_value']
result = des_values_by_attribute.apply(_distinct_as_list).reset_index(
    name='unique_des_values'
)

# Show the table
print(result)

        attribute_name                                  unique_des_values
0     cane_height_type  [Cuña abotinada, Alta, Bloque, Cuña, Baja, Media]
1    closure_placement  [Cierre Delantero, Sin cierre, Cuello, Lateral...
2      heel_shape_type  [Kitten, Plano, Bloque, Embudo, Rectangular, P...
3       knit_structure  [Punto fino, Punto medio, Punto grueso, UNKNOW...
4          length_type  [Crop, Medio, Largo, Standard, Corto, Midi, Mi...
5      neck_lapel_type  [Redondo, Pico, Regular, Caja, Chimenea, Perki...
6      silhouette_type  [Slim, Oversize, Recto, Regular, Evase, Slouch...
7   sleeve_length_type  [Larga, Corta, Sin Manga, Tirante Ancho, Tres ...
8          toecap_type            [Redonda, Abierta, Con punta, Cuadrada]
9           waist_type  [Ajustable/Goma, High Waist, Regular Waist, Lo...
10     woven_structure                  [Ligero, Medio, Pesado, Elástico]


In [43]:
# Optional: persist the per-attribute table to disk, leaving out the index column
result.to_csv(
    path_or_buf='unique_des_values_by_attribute.csv',
    index=False,
)

# Check attributes for each category

In [44]:
import pandas as pd
dirpath = 'data/'

# Read the product table and the attribute table
product_data = pd.read_csv(dirpath + 'product_data.csv')
attribute_data = pd.read_csv(dirpath + 'attribute_data.csv')

# Inner join on 'cod_modelo_color': only codes present in both tables survive
merged_data = product_data.merge(attribute_data, on='cod_modelo_color', how='inner')

# Column used to split the products into categories
division_method = 'des_product_type'

# Distinct attribute names seen in each category
attributes_per_category = merged_data.groupby(division_method)['attribute_name'].unique()


def _attribute_signature(names):
    """Return the sorted tuple of distinct attribute names in a Series."""
    return tuple(sorted(names.unique()))


# For each category, check whether every 'cod_modelo_color' carries exactly
# the same set of attribute names
category_consistency = {}
banner = "#########################"
for category, group in merged_data.groupby(division_method):
    # Attribute signature of each individual 'cod_modelo_color' in the category
    attributes_per_model = (
        group.groupby('cod_modelo_color')['attribute_name'].apply(_attribute_signature)
    )
    distinct_signatures = attributes_per_model.unique()

    print(banner)
    print(category)
    print(banner)
    print(distinct_signatures)

    # A single distinct signature means the category is consistent
    category_consistency[category] = len(distinct_signatures) == 1

# Summary output
print("Attributes per Product Category:")
print(attributes_per_category)

print("\nCategory Consistency:")
for category, consistent in category_consistency.items():
    if consistent:
        status = "consistent"
    else:
        status = "not consistent"
    print(f"{category}: {status}")


#########################
Ankle Boots
#########################
[('cane_height_type',) ('heel_shape_type',)
 ('cane_height_type', 'heel_shape_type', 'toecap_type')
 ('heel_shape_type', 'toecap_type')
 ('cane_height_type', 'heel_shape_type')
 ('cane_height_type', 'toecap_type')]
#########################
Beach Shoes
#########################
[('heel_shape_type', 'toecap_type')]
#########################
Bermudas
#########################
[('silhouette_type',) ('waist_type',)
 ('silhouette_type', 'waist_type', 'woven_structure')
 ('knit_structure', 'silhouette_type', 'waist_type', 'woven_structure')
 ('silhouette_type', 'waist_type')
 ('knit_structure', 'silhouette_type', 'waist_type')
 ('waist_type', 'woven_structure') ('silhouette_type', 'woven_structure')]
#########################
Blazer
#########################
[('neck_lapel_type', 'silhouette_type') ('neck_lapel_type',)
 ('neck_lapel_type', 'silhouette_type', 'woven_structure')
 ('closure_placement', 'silhouette_type')
 ('length_t

# Reshape labels csv

In [49]:
# Read the long-format attribute table
df = pd.read_csv('data/attribute_data.csv')

# Long -> wide in one chain:
#   * one row per 'cod_modelo_color', one column per 'attribute_name',
#     cell contents taken from 'des_value'
#   * cells with no value are filled with the string "INVALID"
#   * 'cod_modelo_color' is moved from the index back to a regular column
result_df = (
    df.pivot(index='cod_modelo_color', columns='attribute_name', values='des_value')
    .fillna("INVALID")
    .reset_index()
)

# Show the reshaped table
print(result_df)

# Optional: write the reshaped table to a new CSV file (index not included)
result_df.to_csv('transformed_attribute_data.csv', index=False)

attribute_name cod_modelo_color cane_height_type closure_placement  \
0                    81_1034451          INVALID           INVALID   
1                    81_1034525          INVALID           INVALID   
2                    81_1035318          INVALID           INVALID   
3                    81_1035321          INVALID           INVALID   
4                    81_1035361          INVALID           INVALID   
...                         ...              ...               ...   
33438                86_9893076          INVALID           INVALID   
33439                86_9893077          INVALID           INVALID   
33440                86_9893868          INVALID           INVALID   
33441                86_9893898          INVALID  Cierre Delantero   
33442                86_9893899          INVALID  Cierre Delantero   

attribute_name heel_shape_type knit_structure length_type   neck_lapel_type  \
0                      INVALID        INVALID     INVALID  Hawaiano/Bowling   
1

In [51]:
# Read the test set
df = pd.read_csv('data/test_data.csv')

# Keep only 'test_id' and attach a constant 'des_value' column set to "INVALID";
# `assign` returns a new DataFrame, so `df` itself is left unchanged
result_df = df[['test_id']].assign(des_value="INVALID")

# Show the resulting table
print(result_df)

# Optional: write it to a new CSV file (index not included)
result_df.to_csv('test_data_with_invalid.csv', index=False)

                            test_id des_value
0      88_49711373_cane_height_type   INVALID
1      88_49718802_cane_height_type   INVALID
2      88_49709572_cane_height_type   INVALID
3      88_49722701_cane_height_type   INVALID
4      88_49724926_cane_height_type   INVALID
...                             ...       ...
71814    88_49727540_knit_structure   INVALID
71815    88_49733648_knit_structure   INVALID
71816    88_49735572_knit_structure   INVALID
71817    88_49713624_knit_structure   INVALID
71818    88_49726160_knit_structure   INVALID

[71819 rows x 2 columns]


In [3]:
import pandas as pd

# Read the product table
file_path = "data/product_data.csv"  # Update this to the path of your file
data = pd.read_csv(file_path)

# Columns left out of the summary
excluded_columns = ['cod_modelo_color', 'des_filename']

# Remaining columns, in their original order
metadata_list = [name for name in data.columns if name not in excluded_columns]

# For each remaining column: its distinct non-NaN values as a list
value_list = [data[name].dropna().unique().tolist() for name in metadata_list]

# One row per column: its name plus all distinct values joined into one string
final_df = pd.DataFrame({
    "Metadata": metadata_list,
    "Value": [", ".join(str(item) for item in values) for values in value_list],
})

# Write the summary to a CSV file (index not included)
output_file = "unique_metadata_values_combined.csv"  # Adjust the file name and path as needed
final_df.to_csv(output_file, index=False)

print(f"Combined metadata and unique values saved to: {output_file}")

Combined metadata and unique values saved to: unique_metadata_values_combined.csv
