In [None]:
#pip install pydantic

Collecting pydantic
  Downloading pydantic-2.11.1-py3-none-any.whl (442 kB)
                                              0.0/442.6 kB ? eta -:--:--
     ---                                   41.0/442.6 kB 653.6 kB/s eta 0:00:01
     -------                               92.2/442.6 kB 871.5 kB/s eta 0:00:01
     ------------                           143.4/442.6 kB 1.1 MB/s eta 0:00:01
     ------------                           143.4/442.6 kB 1.1 MB/s eta 0:00:01
     ---------------------                  256.0/442.6 kB 1.1 MB/s eta 0:00:01
     -----------------------------          337.9/442.6 kB 1.2 MB/s eta 0:00:01
     -----------------------------------    409.6/442.6 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 442.6/442.6 kB 1.2 MB/s eta 0:00:00
Collecting annotated-types>=0.6.0 (from pydantic)
  Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)
Collecting pydantic-core==2.33.0 (from pydantic)
  Downloading pydantic_core-2.33.0-cp311-cp311-win_


[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


##### The logic behind the three classes

* Materiel Class: Represents a single material component in a garment construction

    - fabric_type: Optional technical specification (e.g "GORE-TEX", CORDURA")
    - name: Base material name (e.g. "cotton", "polyster")
    - percentage: Composition percentage in the construction element

* ConstructionElement Class: Represents a distinct part of a garment's construction

    - name: Component identifier (e.g., "Main", "Lining", "Pocket lining")

    - colors: List of color codes associated with this element

    - materials: List of Material objects making up this element

    - weight: Weight specification (e.g., "220 g/m²")

* Garment Class: Represents a complete garment product

    - code: Unique product identifier

    - category: Product type (e.g., "JACKET", "TSHIRT")

    - construction: List of all construction elements in the garment

In [None]:
import csv
import re
from typing import List, Optional
from pydantic import BaseModel, Field, ValidationError

class Material(BaseModel):
    fabric_type: Optional[str] = None
    name: str
    percentage: int

class ConstructionElement(BaseModel):
    name: str
    colors: List[str] = Field(default_factory=list)
    materials: List[Material]
    weight: str

class Garment(BaseModel):
    code: str
    category: str
    construction: List[ConstructionElement]

def clean_text(text: str) -> str:
    """Enhanced cleaning with comprehensive weight normalization"""
    replacements = {
        "Â®": "®", "Ã©": "é", "Â²": "²", 
        "gr": "g", "Gr": "g", "GR": "g",
        "CORDURAÂ®": "CORDURA®",
        "REPREVEÂ®": "REPREVE®",
        "g/m2": "g/m²"  # Add explicit normalization for g/m2
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    text = re.sub(r'\s*%\s*', '%', text)
    return re.sub(r'\s+', ' ', text).strip()

def parse_construction_details(detail_text: str) -> List[dict]:
    """Parse construction details with enhanced color and element grouping"""
    elements = []
    detail_text = clean_text(detail_text)
    
    # Split segments while preserving color contexts
    segments = re.split(r'(?<!\bCol\.)(?<!\bCol)\s*[.;](?!\s*\d)', detail_text)
    
    current_colors = []
    
    for segment in segments:
        segment = segment.strip()
        if not segment:
            continue

        # Extract colors first
        colors, cleaned_segment = parse_colors(segment)
        current_colors = colors if colors else current_colors

        # Handle element name and details
        if ':' in cleaned_segment:
            name_part, detail_part = cleaned_segment.split(':', 1)
            name = name_part.strip()
            details = detail_part.strip()
        else:
            name = "Main"
            details = cleaned_segment

        # Extract weight and materials
        materials, weight = parse_materials_and_weight(details)

        if materials:
            elements.append({
                "name": name,
                "materials": [m.model_dump() for m in materials],
                "weight": weight,
                "colors": current_colors.copy()
            })
            current_colors = []

    return elements

def parse_materials_and_weight(material_text: str) -> tuple[List[Material], str]:
    """Improved material parser that extracts weight from material descriptions"""
    materials = []
    weight = ""
    
    # First check for separate weight specification in the text
    weight_match = re.search(r'(\d+\s*g(?:/m²)?)', material_text)
    if weight_match:
        weight = weight_match.group(1)
        # Remove standalone weight from material text
        material_text = material_text.replace(weight_match.group(0), '').strip(' ,')
    
    # Detect and extract fabric type prefix
    fabric_type = None
    fabric_match = re.match(r'^([A-Za-z®.\-\s]+?)\s(?=\d+%)', material_text)
    if fabric_match:
        fabric_type = fabric_match.group(1).strip()
        material_text = material_text[len(fabric_type):].strip()

    # Split materials while preserving technical specifications
    parts = re.split(r',\s*(?=\d+%)|\s+(?=\d+%)', material_text)
    
    for part in parts:
        part = part.strip()
        if not part:
            continue
            
        # Skip if this part is just a weight specification
        if re.match(r'^\d+\s*g', part, re.IGNORECASE) and len(part.split()) <= 2:
            continue

        match = re.match(r'(\d+)%\s*(.*)', part)
        if match:
            percentage = int(match.group(1))
            name = match.group(2).strip().lower()
            
            # Extract weight from material name if present
            material_weight_match = re.search(r'(\d+\s*g(?:/m²)?)', name)
            if material_weight_match:
                if not weight:  # Only use if we don't already have a weight
                    weight = material_weight_match.group(1)
                # Remove weight from material name
                name = name.replace(material_weight_match.group(0), '').strip(' ,')
            
            # Handle specific location mentions in the weight
            body_sleeves_match = re.search(r'in\s+body\s+and\s+sleeves', name)
            if body_sleeves_match:
                location_info = body_sleeves_match.group(0)
                name = name.replace(location_info, '').strip()
                # Attach the location info to the weight instead
                if weight:
                    weight += f" {location_info}"
            
            # Handle nested materials
            if '(' in name:
                name, nested = name.split('(', 1)
                nested = nested.rstrip(')').strip()
                nested_match = re.match(r'(\d+)%\s*(.*)', nested)
                if nested_match:
                    materials.append(Material(
                        name=nested_match.group(2).strip(),
                        percentage=int(nested_match.group(1)),
                        fabric_type=name.strip()
                    ))

            materials.append(Material(
                name=name,
                percentage=percentage,
                fabric_type=fabric_type
            ))

    return materials, weight

def parse_colors(segment: str) -> tuple[List[str], str]:
    """Enhanced color extraction with plural and abbreviated forms"""
    color_pattern = r'(?i)(?:Colou?rs?|Col\.?)(?:s|)\s*([\d\s,and]+?)(?=\s*\d+%|:|;|\.)'
    match = re.search(color_pattern, segment)
    if not match:
        return [], segment
    
    color_str = match.group(1).strip()
    color_str = re.sub(r'[^\d\s,and]+$', '', color_str)
    colors = [c.strip() for c in re.split(r',|\s+and\s+', color_str) if c.strip()]
    
    cleaned_segment = re.sub(color_pattern, '', segment, flags=re.IGNORECASE)
    return colors, cleaned_segment.strip(' :;,.')

def parse_garments_csv(file_path: str) -> List[Garment]:
    garments = []
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)
        for row in reader:
            if len(row) < 3:
                continue
                
            code = row[0].strip()
            category = row[1].strip()
            details = clean_text(row[2])
            
            try:
                construction = parse_construction_details(details)
                garment = Garment.model_validate({
                    "code": code,
                    "category": category,
                    "construction": construction
                })
                garments.append(garment)
            except ValidationError as e:
                print(f"Error parsing row {row}: {e}")

    return garments



In [31]:


# Path to your CSV file
file_path = r"C:\Users\HP\Downloads\care_labels.csv"
garments = parse_garments_csv(file_path)

l = list()
# Display parsed garments
for garment in garments:
    print(garment)
    l.append(garment)


code='product_id' category='product_category' construction=[]
code='#113' category='PANTS' construction=[ConstructionElement(name='Main', colors=[], materials=[Material(fabric_type=None, name='cotton', percentage=40), Material(fabric_type=None, name='polyester', percentage=60)], weight='290 g/m²'), ConstructionElement(name='Contrast', colors=[], materials=[Material(fabric_type=None, name='cotton', percentage=53), Material(fabric_type=None, name='polyester', percentage=47)], weight='290 g/m²'), ConstructionElement(name='Reinforcement Knee', colors=[], materials=[Material(fabric_type=None, name='cordura®-polyamide', percentage=100)], weight='205 g/m²')]
code='#212' category='PANTS' construction=[ConstructionElement(name='Main', colors=[], materials=[Material(fabric_type=None, name='cotton', percentage=52), Material(fabric_type=None, name='polyamide', percentage=48)], weight='240 g/m²'), ConstructionElement(name='Reinforcement', colors=[], materials=[Material(fabric_type=None, name='cordu

In [None]:
#Doublon #1100 et #1102
#code='#7512' category='SWEATER/HOODIE' construction=[ConstructionElement(name='82%cotton, 18%polyester, 280 g/m². 2X2 Rib', colors=['0400', '2800'], materials=[Material(fabric_type=None, name='cotton', percentage=97), Material(fabric_type=None, name='elastane', percentage=3)], weight='380 g/m²')]


In [183]:
parse_construction_details('"Main: 100% GORE-TEXÂ® polyester 140 g/mÂ². Contrast: GORE-TEXÂ® 44% polyester 37% polyamide 19% elastane 215 g/mÂ². Cuff stretch: 90% polyester 10% elastane, 253 g/mÂ².  Lining: 100% solution dyed polyamide 65 g/mÂ². Insulation: 50% 37.5Â® polyester, 35% REPREVEÂ® recycled polyester, 15% polyester, 120 g/mÂ². Pocket lining: 100% polyester, 215 g/mÂ². ')

[{'name': '"Main',
  'materials': [{'fabric_type': None,
    'name': 'gore-tex® polyester',
    'percentage': 100}],
  'weight': '140 g/m²',
  'colors': []},
 {'name': 'Contrast',
  'materials': [{'fabric_type': 'GORE-TEX®',
    'name': 'polyester',
    'percentage': 44},
   {'fabric_type': 'GORE-TEX®', 'name': 'polyamide', 'percentage': 37},
   {'fabric_type': 'GORE-TEX®', 'name': 'elastane', 'percentage': 19}],
  'weight': '215 g/m²',
  'colors': []},
 {'name': 'Cuff stretch',
  'materials': [{'fabric_type': None, 'name': 'polyester', 'percentage': 90},
   {'fabric_type': None, 'name': 'elastane', 'percentage': 10}],
  'weight': '253 g/m²',
  'colors': []},
 {'name': 'Lining',
  'materials': [{'fabric_type': None,
    'name': 'solution dyed polyamide',
    'percentage': 100}],
  'weight': '65 g/m²',
  'colors': []},
 {'name': 'Insulation',
  'materials': [{'fabric_type': None,
    'name': '37.5® polyester',
    'percentage': 50},
   {'fabric_type': None,
    'name': 'repreve® recycle

In [13]:
import pandas as pd

# Let's assume 'garments' is the list of Garment objects parsed from the CSV,
# and each garment has a 'code', 'category', and a list 'construction' of ConstructionElement objects.
# Each ConstructionElement has 'name', 'materials' (list of strings), and 'weight'.

# Collect all possible construction element names across garments so that we know what columns to expect.
all_elements = set()
for garment in garments:
    for element in garment.construction:
        all_elements.add(element.name)

# Prepare rows for the DataFrame
rows = []
for garment in garments:
    # Start with code and category.
    row = {"code": garment.code, "category": garment.category}
    # For each construction element (like 'Main', 'Reinforcement', etc.), we add two columns:
    # one for the materials (joined into a single string) and one for the weight.
    # Initialize columns for all possible construction elements to empty strings.
    for element_name in all_elements:
        row[f"{element_name}_materials"] = ""
        row[f"{element_name}_weight"] = ""
    
    # Fill in the available construction element details.
    for element in garment.construction:
        row[f"{element.name}_materials"] = ", ".join(element.materials)
        row[f"{element.name}_weight"] = element.weight
    rows.append(row)

# Create the DataFrame.
df = pd.DataFrame(rows)
df

Unnamed: 0,code,category,Reinforcement 1; knee pad_materials,Reinforcement 1; knee pad_weight,Contrast main_materials,Contrast main_weight,Color_materials,Color_weight,Colour 3400_materials,Colour 3400_weight,...,Mesh_materials,Mesh_weight,Insulation_materials,Insulation_weight,Pocket lining_materials,Pocket lining_weight,"Colour 0455, 0466 / Main_materials","Colour 0455, 0466 / Main_weight",Lining_materials,Lining_weight
0,product_id,product_category,,,,,,,,,...,,,,,,,,,,
1,#113,PANTS,,,,,,,,,...,,,,,,,,,,
2,#212,PANTS,,,,,,,,,...,,,,,,,,,,
3,#213,PANTS,,,,,,,,,...,,,,,,,,,,
4,#214,PANTS,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,#9794,PANTS,,,,,,,,,...,,,,,,,,,,
570,#9795,ACCESSORY/PHONE-CASE,,,,,,,,,...,,,,,,,,,,
571,#9796,ACCESSORY/PHONE-CASE,,,,,,,,,...,,,,,,,,,,
572,#9797,PANTS,,,,,,,,,...,,,,,,,,,,


In [19]:
a = list(df.columns)
(a)

['code',
 'category',
 'Reinforcement 1; knee pad_materials',
 'Reinforcement 1; knee pad_weight',
 'Contrast main_materials',
 'Contrast main_weight',
 'Color_materials',
 'Color_weight',
 'Colour 3400_materials',
 'Colour 3400_weight',
 'Main Material_materials',
 'Main Material_weight',
 'Palm_materials',
 'Palm_weight',
 'Colors_materials',
 'Colors_weight',
 'Contrast fabric_materials',
 'Contrast fabric_weight',
 'Coating_materials',
 'Coating_weight',
 'Contrast 2 (armpit)_materials',
 'Contrast 2 (armpit)_weight',
 'Ripstop_materials',
 'Ripstop_weight',
 'Reinforcement 2; pockets_materials',
 'Reinforcement 2; pockets_weight',
 'Col 0400, 0900, 5800 and 9500_materials',
 'Col 0400, 0900, 5800 and 9500_weight',
 'Colours 0400, 0600, 4000, 9500_materials',
 'Colours 0400, 0600, 4000, 9500_weight',
 'Colour 0400, 9500_materials',
 'Colour 0400, 9500_weight',
 'Fill power 330\nLining_materials',
 'Fill power 330\nLining_weight',
 'Col 0400 and 2800_materials',
 'Col 0400 and 2800_