In [101]:
import pandas as pd
import numpy as np
import os

DATA_PATH = os.path.join(os.path.dirname(os.getcwd()), "data")

df = pd.read_excel(os.path.join(DATA_PATH, "Nordic_Textile_Anatomy_Database_DdS.xlsx"), sheet_name="RMM_DK")


In [102]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def cluster_composition_by_category(df, fiber_cols=3, min_clusters=2, max_clusters=4):
    """
    Clusters textiles by fiber composition within each Category.
    
    Parameters:
    - df: DataFrame with columns:
        'Category',
        'Fibre 1', 'Fibre 1 % Range', ..., up to 'Fibre {fiber_cols}', 'Fibre {fiber_cols} % Range'
    - fiber_cols: number of fiber columns to consider (default 3)
    - min_clusters, max_clusters: range of k to try for KMeans
    
    Returns:
    - dict mapping category to summary DataFrame with columns:
        'Cluster', 'Count', and one column per fiber name giving average percentage in that cluster.
    """
    # Helper to parse percentage range to midpoint
    def parse_pct(s):
        try:
            s = str(s).replace('–', '-').replace('%', '')
            low, high = s.split('-')
            return (float(low) + float(high)) / 2
        except:
            return np.nan

    # Collect all unique fiber names from columns Fibre 1 .. Fibre fiber_cols
    fiber_names = set()
    parsed_entries = []
    for idx, row in df.iterrows():
        comp = {}
        for i in range(1, fiber_cols + 1):
            name = row.get(f'Fibre {i}')
            pct_range = row.get(f'Fibre {i} % Range')
            if pd.notna(name) and pd.notna(pct_range):
                pct = parse_pct(pct_range)
                if pd.notna(pct):
                    comp[name] = pct
                    fiber_names.add(name)
        parsed_entries.append(comp)

    all_fibers = sorted(fiber_names)
    if not all_fibers:
        print("No fiber data found. Check column names and data.")
        return {}

    # Build feature matrix: each row is normalized composition vector over all_fibers
    feature_rows = []
    indices = []
    categories = []
    for (idx, row), comp in zip(df.iterrows(), parsed_entries):
        if comp:
            vec = [comp.get(f, 0.0) for f in all_fibers]
            total = sum(vec)
            if total > 0:
                vec = [v / total for v in vec]
                feature_rows.append(vec)
                indices.append(idx)
                cat = row['Category'].strip().lower()
                categories.append(cat)
    if not feature_rows:
        print("No valid composition entries to cluster.")
        return {}

    feat_df = pd.DataFrame(feature_rows, index=indices, columns=all_fibers)
    result = {}

    # Group by category
    cat_series = pd.Series(categories, index=indices, name='Category')
    for cat, group in cat_series.groupby(cat_series):
        idxs = group.index
        X = feat_df.loc[idxs]
        n_samples = len(X)
        if n_samples < 2:
            print(f"Category '{cat}' has fewer than 2 samples, skipping.")
            continue

        # Determine best k by silhouette
        best_k = None
        best_score = -1
        for k in range(min_clusters, min(max_clusters, n_samples - 1) + 1):
            km = KMeans(n_clusters=k, random_state=0)
            labels = km.fit_predict(X)
            # Silhouette requires at least 2 clusters and less than n_samples clusters
            score = silhouette_score(X, labels)
            if score > best_score:
                best_score = score
                best_k = k

        km = KMeans(n_clusters=best_k, random_state=0).fit(X)
        labels = km.labels_
        centroids = km.cluster_centers_

        # Build summary for this category
        summary = []
        for cluster_label in range(best_k):
            mask = labels == cluster_label
            count = int(mask.sum())
            centroid = centroids[cluster_label]
            # Convert centroid to percentages summing to 100
            pct = centroid / centroid.sum() * 100
            comp_dict = {f: pct_val for f, pct_val in zip(all_fibers, pct)}
            row_summary = {'Cluster': cluster_label, 'Count': count}
            row_summary.update(comp_dict)
            summary.append(row_summary)

        summary_df = pd.DataFrame(summary).sort_values('Cluster').reset_index(drop=True)

        # drop insignificant fibers
        summary_df = summary_df.loc[:, summary_df.max() >= 1]
        cols_to_clean = summary_df.select_dtypes(include='number').columns

        # Replace values < 1 with 0
        summary_df[cols_to_clean] = summary_df[cols_to_clean].where(summary_df[cols_to_clean] >= 1, 0)

        # check to what percentage fibres add up
        meta_cols = ['Cluster', 'Count']
        fiber_cols = [col for col in summary_df.columns if col not in meta_cols]
        summary_df['Sum'] = summary_df[fiber_cols].sum(axis=1)
        summary_df = summary_df[['Sum'] + meta_cols + fiber_cols]

        result[cat] = summary_df

    return result

# Run clustering if df exists
if 'df' not in globals():
    print("Please ensure your DataFrame is named 'df' with columns 'Category', "
          "'Fibre 1'..'Fibre 3', 'Fibre 1 % Range'..'Fibre 3 % Range'.")
else:
    clusters = cluster_composition_by_category(df)
    for cat, summary_df in clusters.items():
        print(f"\nCategory: {cat}")
        display(summary_df)



Category: dresses and skirts


Unnamed: 0,Sum,Cluster,Count,Acetate,Acrylic,Cotton,Cupro,Flax/linen,Lyocell,Modal,Polyamide/nylon,Polyester,Silk,True Hemp,Viscose,Wool
0,98.84363,0,101,0.0,0.0,1.195961,1.088129,1.568284,0.0,0.0,8.361745,3.498123,0.0,0.0,83.131388,0.0
1,97.172286,1,144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,93.560133,0.0,0.0,3.612153,0.0
2,97.517588,2,119,0.0,0.0,90.79878,0.0,0.0,0.0,0.0,1.626029,5.092779,0.0,0.0,0.0,0.0
3,99.229601,3,48,2.083333,2.550758,2.574303,0.0,8.215603,17.81778,20.694699,19.087874,12.560447,8.333333,1.144802,0.0,4.166667



Category: handkerchiefs, ties, scarves, gloves and other


Unnamed: 0,Sum,Cluster,Count,Acrylic,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Flax/linen,Other,Polyacrylate,Polyamide/nylon,Polyester,Polyurethane,Silk,Viscose,Wool
0,96.997597,0,211,2.931342,0.0,2.183223,0.0,0.0,0.0,1.983351,89.899681,0.0,0.0,0.0,0.0
1,98.680669,1,115,0.0,0.0,96.314634,0.0,0.0,0.0,0.0,2.366035,0.0,0.0,0.0,0.0
2,96.687029,2,179,19.799183,4.932344,0.0,1.303538,1.740917,1.033936,17.714091,4.6814,1.799686,10.2426,1.829519,31.609814



Category: overcoats and anoraks


Unnamed: 0,Sum,Cluster,Count,Acrylic,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Flax/linen,Lyocell,Other,Polyacrylate,Polyamide/nylon,Polyester,Polyimide,Polyurethane,Viscose,Wool
0,97.385936,0,279,0.0,0.0,2.802936,0.0,0.0,0.0,0.0,0.0,93.468695,0.0,1.114305,0.0,0.0
1,98.462229,1,100,0.0,0.0,1.455446,0.0,0.0,0.0,0.0,92.171878,1.735896,0.0,0.0,3.09901,0.0
2,98.332154,2,136,0.0,0.0,92.495112,0.0,0.0,0.0,0.0,1.359653,4.47739,0.0,0.0,0.0,0.0
3,98.731282,3,35,1.226415,1.226415,0.0,1.004243,2.857143,11.428571,1.179402,4.407417,13.901427,2.430704,2.857143,12.597374,43.615027



Category: shirts, blouses, tops


Unnamed: 0,Sum,Cluster,Count,Acrylic,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Cupro,Flax/linen,Lyocell,Other,Polyamide/nylon,Polyester,Polyurethane,Silk,Viscose,Wool
0,97.189172,0,176,0.0,0.0,2.639496,0.0,0.0,0.0,0.0,1.380967,93.168709,0.0,0.0,0.0,0.0
1,98.408673,1,196,0.0,0.0,95.470405,0.0,0.0,0.0,0.0,1.365503,1.572766,0.0,0.0,0.0,0.0
2,97.452905,2,98,15.347019,2.343379,3.828034,1.121439,10.494456,4.541321,2.669634,15.675744,3.981999,1.871688,6.122449,29.455743,0.0
3,100.0,3,35,2.543171,1.951072,2.7157,0.0,0.0,0.0,1.428571,5.532571,3.283582,0.0,1.428571,1.428571,79.68819



Category: sportswear and swimwear


Unnamed: 0,Sum,Cluster,Count,Cotton,Elastane/Spandex,Elastane/spandex,Lyocell,Other,Polyamide/nylon,Polyester,Polyethylene,Viscose,Wool
0,99.272741,0,183,0.0,2.976402,3.797684,0.0,0.0,1.091954,91.4067,0.0,0.0,0.0
1,98.57999,1,101,0.0,4.508058,4.305317,0.0,0.0,85.101488,4.665128,0.0,0.0,0.0
2,100.0,2,24,37.31118,0.0,0.0,9.476942,10.416667,9.847351,9.932923,2.999084,5.082418,14.933435



Category: suits and blazers


Unnamed: 0,Sum,Cluster,Count,Cotton,Elastane/spandex,Flax/linen,Lyocell,Polyamide/nylon,Polyester,Viscose,Wool
0,99.023586,0,52,1.046558,2.009185,0.0,0.0,0.0,75.56957,15.144423,5.25385
1,100.0,1,24,0.0,0.0,6.456271,0.0,4.876374,4.806106,0.0,83.86125
2,100.0,2,8,0.0,0.0,10.024752,0.0,0.0,10.244793,79.730455,0.0
3,100.0,3,11,58.694771,0.0,12.781278,16.786679,0.0,7.641863,0.0,4.09541



Category: sweaters and cardigans


Unnamed: 0,Sum,Cluster,Count,Acrylic,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Flax/linen,Polyamide/nylon,Polyester,Viscose,Wool
0,98.482126,0,91,0.0,1.131873,5.114585,0.0,5.998992,1.650336,0.0,84.58634
1,96.77559,1,126,4.421769,0.0,87.001285,0.0,2.180021,3.172515,0.0,0.0
2,97.947338,2,83,5.792409,0.0,2.7166,0.0,3.143057,84.241494,0.0,2.053778
3,99.552665,3,167,31.279635,11.599852,1.365301,1.029809,20.913387,13.440943,13.451749,6.471989



Category: t-shirts, singlets and vests, hoodies and crewnecks


Unnamed: 0,Sum,Cluster,Count,Acrylic,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Flax/linen,Lyocell,Modal,Polyamide/nylon,Polyester,Viscose,Wool
0,99.16119,0,417,0.0,0.0,92.746798,0.0,0.0,1.210912,0.0,5.203479,0.0,0.0
1,98.970628,1,92,1.359815,0.0,16.419017,0.0,0.0,0.0,1.307185,72.664705,7.219906,0.0
2,98.109442,2,53,0.0,0.0,1.225007,3.491974,9.433962,1.886792,30.072128,3.564388,48.43519,0.0
3,100.0,3,18,2.280026,3.895605,1.637765,0.0,8.57572,0.0,2.88698,5.075014,0.0,75.64889



Category: trousers and shorts


Unnamed: 0,Sum,Cluster,Count,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Cotton,Elastane/Spandex,Elastane/spandex,Flax/linen,Lyocell,Modal,Other,Polyamide/nylon,Polyester,Polyurethane,Silk,Viscose,Wool
0,96.951974,0,327,0.0,90.408825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.543149,0.0,0.0,0.0,0.0
1,100.0,1,45,1.441144,0.0,1.641165,1.606746,1.481481,1.064163,0.0,0.0,89.625835,3.139465,0.0,0.0,0.0,0.0
2,99.179843,2,67,0.0,2.689541,0.0,0.0,8.045487,5.593321,4.018534,1.492537,7.429902,4.5932,1.492537,1.492537,40.598995,21.733251
3,97.524719,3,155,0.0,7.144431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.720524,0.0,0.0,7.651692,2.008073



Category: underwear, socks, night clothes


Unnamed: 0,Sum,Cluster,Count,Acrylic,Cotton,Elastane/Spandex,Elastane/spandex,Flax/linen,Lyocell,Modal,Other,Polyamide/nylon,Polyester,Polypropylene,Viscose,Wool
0,96.980018,0,130,0.0,0.0,3.169775,5.734864,0.0,0.0,0.0,0.0,88.075378,0.0,0.0,0.0,0.0
1,98.989802,1,214,0.0,83.363659,0.0,0.0,0.0,0.0,0.0,1.263151,7.841908,6.521084,0.0,0.0,0.0
2,95.15702,2,84,0.0,1.731712,0.0,0.0,1.190476,2.380952,3.571429,0.0,5.973528,56.941145,0.0,23.367777,0.0
3,99.123504,3,51,9.498652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.675343,0.0,3.444478,0.0,64.505031


In [103]:
merged = []

for category, df_cat in clusters.items():
    df_cat = df_cat.copy()
    df_cat['Category'] = category  # Add category column
    merged.append(df_cat)

# Combine all into one DataFrame
merged_df = pd.concat(merged, ignore_index=True)

# Sort by Category (A-Z), then Count (descending)
merged_df = merged_df.sort_values(by=['Category', 'Count'], ascending=[True, False]).reset_index(drop=True)

cols = ['Category'] + [col for col in merged_df.columns if col != 'Category']

merged_df = merged_df[cols]
merged_df.head()


Unnamed: 0,Category,Sum,Cluster,Count,Acetate,Acrylic,Cotton,Cupro,Flax/linen,Lyocell,...,Wool,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Other,Polyacrylate,Polyurethane,Polyimide,Elastane/Spandex,Elastane/spandex,Polyethylene,Polypropylene
0,dresses and skirts,97.172286,1,144,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,,,,,,,,
1,dresses and skirts,97.517588,2,119,0.0,0.0,90.79878,0.0,0.0,0.0,...,0.0,,,,,,,,,
2,dresses and skirts,98.84363,0,101,0.0,0.0,1.195961,1.088129,1.568284,0.0,...,0.0,,,,,,,,,
3,dresses and skirts,99.229601,3,48,2.083333,2.550758,2.574303,0.0,8.215603,17.81778,...,4.166667,,,,,,,,,
4,"handkerchiefs, ties, scarves, gloves and other",96.997597,0,211,,2.931342,2.183223,,0.0,,...,0.0,0.0,0.0,0.0,0.0,,,,,


In [104]:
merged_df.sort_values(by="Count", ascending=False,inplace=True)
merged_df.head()

Unnamed: 0,Category,Sum,Cluster,Count,Acetate,Acrylic,Cotton,Cupro,Flax/linen,Lyocell,...,Wool,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)",Other,Polyacrylate,Polyurethane,Polyimide,Elastane/Spandex,Elastane/spandex,Polyethylene,Polypropylene
26,"t-shirts, singlets and vests, hoodies and crew...",99.16119,0,417,,0.0,92.746798,,0.0,0.0,...,0.0,0.0,,,,,,,,
30,trousers and shorts,96.951974,0,327,,,90.408825,,0.0,0.0,...,0.0,0.0,0.0,,0.0,,0.0,0.0,,
7,overcoats and anoraks,97.385936,0,279,,0.0,2.802936,,0.0,0.0,...,0.0,0.0,0.0,0.0,1.114305,0.0,,,,
34,"underwear, socks, night clothes",98.989802,1,214,,0.0,83.363659,,0.0,0.0,...,0.0,,1.263151,,,,0.0,0.0,,0.0
4,"handkerchiefs, ties, scarves, gloves and other",96.997597,0,211,,2.931342,2.183223,,0.0,,...,0.0,0.0,0.0,0.0,0.0,,,,,


In [105]:
# Select top 10 rows
top10 = merged_df.head(10).copy(deep=True)
top10 = top10.fillna(0)

# Identify fibre columns (exclude meta columns)
meta_cols = ['Category', 'Sum', 'Cluster', 'Count']
fiber_cols = [col for col in top10.columns if col not in meta_cols]

# Filter out fibre columns where the max value is less than 5%
keep_fibers = [col for col in fiber_cols if top10[col].max(skipna=True) >= 5]

# Keep only meta columns and filtered fibre columns
filtered_top10 = top10[meta_cols + keep_fibers]
#filtered_top10 = filtered_top10.drop(columns=['Cluster'])

# Add lifetime as another row based on WRAP 2022 data
# Rewrite lifetime_map as min/max tuple values
lifetime_map_minmax = {
    "dresses and skirts": (4.2, 4.9),
    "handkerchiefs, ties, scarves, gloves and other": (4.3, 4.3),
    "overcoats and anoraks": (5.4, 6.3),
    "shirts, blouses, tops": (4.1, 4.8),
    "sportswear and swimwear": (2.6, 4.4),
    "suits and blazers": (4.1, 6.1),
    "sweaters and cardigans": (4.0, 4.8),
    "t-shirts, singlets and vests, hoodies and crewnecks": (4.0, 4.0),
    "trousers and shorts": (3.8, 4.8),
    "underwear, socks, night clothes": (2.6, 4.4)
}

# Map to new columns
filtered_top10['Lifetime Min'] = filtered_top10['Category'].map(lambda x: lifetime_map_minmax.get(x, (np.nan, np.nan))[0])
filtered_top10['Lifetime Max'] = filtered_top10['Category'].map(lambda x: lifetime_map_minmax.get(x, (np.nan, np.nan))[1])
filtered_top10 = filtered_top10[['Category'] + ['Lifetime Min'] + ['Lifetime Max']+ ['Count'] + ['Sum'] + keep_fibers]
filtered_top10 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_top10['Lifetime Min'] = filtered_top10['Category'].map(lambda x: lifetime_map_minmax.get(x, (np.nan, np.nan))[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_top10['Lifetime Max'] = filtered_top10['Category'].map(lambda x: lifetime_map_minmax.get(x, (np.nan, np.nan))[1])


Unnamed: 0,Category,Lifetime Min,Lifetime Max,Count,Sum,Acrylic,Cotton,Polyamide/nylon,Polyester,Silk,Viscose,Wool,"Animal hair (alpaca, llama, camel, kashmir goat, angora goat, angora rabbit)"
26,"t-shirts, singlets and vests, hoodies and crew...",4.0,4.0,417,99.16119,0.0,92.746798,0.0,5.203479,0.0,0.0,0.0,0.0
30,trousers and shorts,3.8,4.8,327,96.951974,0.0,90.408825,0.0,6.543149,0.0,0.0,0.0,0.0
7,overcoats and anoraks,5.4,6.3,279,97.385936,0.0,2.802936,0.0,93.468695,0.0,0.0,0.0,0.0
34,"underwear, socks, night clothes",2.6,4.4,214,98.989802,0.0,83.363659,7.841908,6.521084,0.0,0.0,0.0,0.0
4,"handkerchiefs, ties, scarves, gloves and other",4.3,4.3,211,96.997597,2.931342,2.183223,1.983351,89.899681,0.0,0.0,0.0,0.0
11,"shirts, blouses, tops",4.1,4.8,196,98.408673,0.0,95.470405,1.365503,1.572766,0.0,0.0,0.0,0.0
15,sportswear and swimwear,2.6,4.4,183,99.272741,0.0,0.0,1.091954,91.4067,0.0,0.0,0.0,0.0
5,"handkerchiefs, ties, scarves, gloves and other",4.3,4.3,179,96.687029,19.799183,0.0,17.714091,4.6814,10.2426,1.829519,31.609814,4.932344
12,"shirts, blouses, tops",4.1,4.8,176,97.189172,0.0,2.639496,1.380967,93.168709,0.0,0.0,0.0,0.0
22,sweaters and cardigans,4.0,4.8,167,99.552665,31.279635,1.365301,20.913387,13.440943,0.0,13.451749,6.471989,11.599852


In [106]:
# Export merged_df as an Excel file
output_path = os.path.join(DATA_PATH, "clustered_fiber_composition.xlsx")
filtered_top10.to_excel(output_path, index=False)