In [None]:
import os.path
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from PIL import Image
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import pdist, squareform
from matplotlib import gridspec
from scipy import stats

# Set plot style and size
sns.set_theme(style="darkgrid")
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Color Clustering Analysis
# 1. Load and explore the data
def load_data(file_path='downscaled_thumbnails.csv'):
    """Read the per-image color profile table from CSV.

    Parameters
    ----------
    file_path : str or file-like
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Its shape is printed as a side effect.
    """
    color_profiles = pd.read_csv(file_path)
    table_shape = color_profiles.shape
    print(f"Dataset shape: {table_shape}")
    return color_profiles

df = load_data()

# # Display the first few rows
# print("\nFirst 5 rows:")
# display(df.head())

# # Basic statistics
# print("\nBasic statistics:")
# display(df.describe())

In [None]:
# 2. Visualize the distribution of colors across all images
def plot_color_distribution(df):
    """Plot the mean share of every color across all images as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Name' column; every other column is averaged and
        shown as one bar, sorted from largest to smallest mean.

    Each bar is painted in the color it stands for when the column name
    (case-insensitive) is one of the recognised color names, gray otherwise.
    """
    # Column names that double as valid matplotlib color names
    paintable = {'yellow', 'orange', 'red', 'violet', 'blue',
                 'green', 'brown', 'black', 'white'}

    # Drop the 'Name' column for calculations
    colors_only = df.drop('Name', axis=1)

    # Calculate the mean percentage for each color
    color_means = colors_only.mean().sort_values(ascending=False)

    # Create a bar plot
    plt.figure(figsize=(10, 6))
    bars = plt.bar(color_means.index, color_means.values)

    # bar.get_x() is the bar's numeric position, not its label, so the name
    # has to come from the index the bars were created from.
    for bar, color_name in zip(bars, color_means.index):
        key = str(color_name).lower()
        bar.set_color(key if key in paintable else 'gray')

    plt.title('Average Color Distribution Across All Images')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

plot_color_distribution(df)


In [None]:
# 3. Prepare data for clustering
def prepare_data_for_clustering(df):
    """Split the table into image names and a standardised feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Name' column; all remaining columns are used as features.

    Returns
    -------
    tuple
        ``(X_scaled, image_names, scaler)``: the features after
        ``StandardScaler.fit_transform``, the values of the 'Name' column,
        and the fitted scaler.
    """
    image_names = df['Name'].values
    raw_features = df.drop('Name', axis=1).values

    # Fit the scaler on the full feature matrix and transform it in one step
    scaler = StandardScaler()
    standardized = scaler.fit_transform(raw_features)

    return standardized, image_names, scaler

X_scaled, image_names, scaler = prepare_data_for_clustering(df)

In [None]:
# 4. Find optimal number of clusters using silhouette score
def find_optimal_clusters(X, max_k=10):
    """Pick the cluster count with the highest silhouette score.

    Fits KMeans (``random_state=42``, ``n_init=10``) for every k from 2 to
    ``max_k`` inclusive, prints and plots the silhouette score for each k,
    and returns the k whose score is largest (the first one on ties).
    """
    candidate_ks = list(range(2, max_k + 1))
    silhouette_scores = []

    for k in candidate_ks:
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        model.fit(X)
        score = silhouette_score(X, model.labels_)
        silhouette_scores.append(score)
        print(f"Silhouette score for k={k}: {score}")

    # Score-versus-k curve
    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, silhouette_scores, 'bo-')
    plt.title('Silhouette Score Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.grid(True)
    plt.show()

    # Position of the best score maps straight back onto the candidate list
    best_position = silhouette_scores.index(max(silhouette_scores))
    return candidate_ks[best_position]

# Find optimal number of clusters
optimal_k = find_optimal_clusters(X_scaled)
print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# 5. Perform clustering with optimal k
def perform_clustering(X, k):
    """Fit KMeans with ``k`` clusters on ``X``.

    Returns
    -------
    tuple
        ``(kmeans, labels)``: the fitted estimator (``random_state=42``,
        ``n_init=10``) and the cluster label assigned to each row of ``X``.
    """
    model = KMeans(n_clusters=k, random_state=42, n_init=10)
    assignments = model.fit_predict(X)
    return model, assignments

kmeans, labels = perform_clustering(X_scaled, optimal_k)

In [None]:
def visualize_clusters_pca_with_centroids(X, labels, image_names):
    """Show the clusters in 2D PCA space next to a view highlighting centroids.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per image (indexed with boolean masks here).
    labels : array-like
        Cluster label for every row of ``X``.
    image_names : array-like
        Image name for every row of ``X``; stored alongside the projection.

    Returns
    -------
    tuple
        ``(pca, X_pca, centroids_pca)``: the fitted 2-component PCA, the
        projected points, and the projected per-cluster means (one row per
        distinct label, in sorted label order).
    """
    labels = np.asarray(labels)

    # Apply PCA to reduce to 2 dimensions for visualization
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    # Create DataFrame for easier plotting
    pca_df = pd.DataFrame({'PCA1': X_pca[:, 0], 'PCA2': X_pca[:, 1],
                           'Cluster': labels, 'Image': image_names})

    # Centroids are the per-cluster means in the original feature space.
    # Iterating over the labels that actually occur avoids averaging an
    # empty selection when labels are not exactly 0..n-1.
    cluster_ids = np.unique(labels)
    centroids_orig = np.vstack([X[labels == cid].mean(axis=0) for cid in cluster_ids])

    # Project centroids onto PCA space
    centroids_pca = pca.transform(centroids_orig)

    # One explicit color per cluster, shared by the points in both panels and
    # by the centroid markers, so a star always matches its own cluster.
    palette = dict(zip(cluster_ids.tolist(),
                       sns.color_palette('viridis', len(cluster_ids))))

    # Create a figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Left plot: All data points
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df,
                    palette=palette, s=80, ax=ax1)
    ax1.set_title('All Data Points in PCA Space')
    ax1.legend(title='Cluster')

    # Right plot: faint points for context, centroids on top
    sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=pca_df,
                    palette=palette, s=30, alpha=0.2, legend=False, ax=ax2)

    for cid, (cx, cy) in zip(cluster_ids.tolist(), centroids_pca):
        ax2.scatter(cx, cy, s=200, color=palette[cid], marker='*',
                    edgecolors='black', linewidth=1.5, label=f'Centroid {cid}')
        ax2.annotate(f'Cluster {cid}', (cx, cy),
                     xytext=(10, 5), textcoords='offset points',
                     fontsize=12, fontweight='bold')

    ax2.set_title('Cluster Centroids in PCA Space')
    ax2.legend(title='Centroids', loc='upper right')

    # Add information about variance explained
    explained_var = pca.explained_variance_ratio_
    fig.suptitle(f'Cluster Visualization (PCA1: {explained_var[0]:.1%} variance, PCA2: {explained_var[1]:.1%} variance)',
                 fontsize=16)

    plt.tight_layout()
    plt.show()

    return pca, X_pca, centroids_pca

# Project the scaled features to 2D and plot the clusters with their centroids
pca, X_pca, centroids_pca = visualize_clusters_pca_with_centroids(X_scaled, labels, image_names)

In [None]:
# 7. Analyze cluster characteristics
def analyze_clusters(df, labels, k):
    """Attach cluster labels to the data and plot each cluster's mean profile.

    Parameters
    ----------
    df : pandas.DataFrame
        Original table; it is copied, not modified.
    labels : array-like
        Cluster label for every row of ``df``.
    k : int
        Expected number of clusters; used to size the subplot grid.

    Returns
    -------
    tuple
        ``(df_clustered, cluster_means)``: a copy of ``df`` with an added
        'Cluster' column, and the per-cluster mean of every numeric column
        (excluding 'Cluster').
    """
    # Column names that double as valid matplotlib color names
    paintable = {'yellow', 'orange', 'red', 'violet', 'blue',
                 'green', 'brown', 'black', 'white'}

    # Add cluster labels to a copy of the original DataFrame
    df_clustered = df.copy()
    df_clustered['Cluster'] = labels

    # Calculate mean color percentages for each cluster
    numeric_cols = [col for col in df_clustered.select_dtypes(include=['number']).columns
                    if col != 'Cluster']
    cluster_means = df_clustered.groupby('Cluster')[numeric_cols].mean()
    cluster_sizes = df_clustered['Cluster'].value_counts()

    # Two-column grid; make sure every cluster that actually occurs gets a
    # panel even if that is more than k.
    cols = 2
    rows = math.ceil(max(k, len(cluster_means)) / cols)

    # Shared across panels, so computed once: common y-limit (with headroom
    # for the value labels) and bar positions.
    y_top = max(cluster_means.values.max() * 1.15, 10)
    bar_positions = np.arange(len(cluster_means.columns))
    width = 0.7

    plt.figure(figsize=(20, 5 * rows))

    # Iterate by label so profile, title and count always refer to the same
    # cluster (positional iloc[i] would drift if a label were missing).
    for panel, (cluster_id, cluster_profile) in enumerate(cluster_means.iterrows(), start=1):
        plt.subplot(rows, cols, panel)
        bars = plt.bar(bar_positions, cluster_profile.values, width=width)

        # Color each bar appropriately
        for bar, color_name in zip(bars, cluster_profile.index):
            key = color_name.lower()
            bar.set_color(key if key in paintable else 'gray')

        # Add value labels on top of bars with significant values only
        for bar in bars:
            height = bar.get_height()
            if height > 3:
                plt.text(
                    bar.get_x() + bar.get_width()/2.,
                    height + 0.5,
                    f'{height:.1f}%',
                    ha='center',
                    va='bottom',
                    rotation=0,
                    fontsize=9
                )

        plt.title(f'Cluster {cluster_id} (n={cluster_sizes[cluster_id]})', fontsize=14)
        plt.ylim(0, y_top)
        plt.xticks(bar_positions, cluster_profile.index, rotation=45)

    # Lay out once, after all panels exist, then widen the gaps
    plt.tight_layout()
    plt.subplots_adjust(hspace=0.4, wspace=0.3)
    plt.show()

    return df_clustered, cluster_means

df_clustered, cluster_means = analyze_clusters(df, labels, optimal_k)

In [None]:
def display_cluster_examples(df_clustered, image_dir="../thumbnails", num_examples=10,
                             random_state=None):
    """
    Display randomly sampled example images from each cluster, 5 images per row.

    For every image that loads, the colors making up more than 5% of it are
    printed; for every image that fails to load, the error is printed and a
    placeholder is drawn instead.

    Parameters:
    -----------
    df_clustered : pandas DataFrame
        DataFrame with a 'Name' column, a 'Cluster' column and one column per color
    image_dir : str
        Directory path where the images are stored
    num_examples : int
        Maximum number of example images to show per cluster
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``; pass an int to get the same
        examples on every run. ``None`` keeps the sampling unseeded.
    """
    images_per_row = 5

    for cluster_id in sorted(df_clustered['Cluster'].unique()):
        cluster_samples = df_clustered[df_clustered['Cluster'] == cluster_id]
        samples = cluster_samples.sample(min(num_examples, len(cluster_samples)),
                                         random_state=random_state)

        print(f"Cluster {cluster_id} examples (showing {len(samples)} out of {len(cluster_samples)}):")

        n_samples = len(samples)
        # At least one row so plt.subplots never receives a zero-sized grid
        n_rows = max(1, math.ceil(n_samples / images_per_row))

        # squeeze=False always yields a 2D axes array, so the single-row case
        # needs no special handling.
        fig, axs = plt.subplots(n_rows, images_per_row,
                                figsize=(15, 3.5 * n_rows), squeeze=False)
        fig.suptitle(f'Cluster {cluster_id} (Contains {len(cluster_samples)} images)', fontsize=16)

        # Every cell is frameless, whether it ends up holding an image,
        # a placeholder, or nothing at all.
        for ax in axs.flat:
            ax.axis('off')

        for img_ax, (_, row) in zip(axs.flat, samples.iterrows()):
            image_path = os.path.join(image_dir, row['Name'])

            try:
                # Context manager closes the file handle once the pixels
                # have been handed to matplotlib.
                with Image.open(image_path) as img:
                    img_ax.imshow(img)
                img_ax.set_title(row['Name'], fontsize=10)

                # Color percentages for console output only
                color_data = {col: row[col] for col in row.index
                              if col not in ['Name', 'Cluster']}
                sorted_colors = sorted(color_data.items(), key=lambda x: x[1], reverse=True)

                print(f"  - {row['Name']}: ", end="")
                significant_colors = {k: f"{v:.1f}%" for k, v in sorted_colors if v > 5.0}
                print(significant_colors)

            except Exception as e:
                # Best effort: a missing or unreadable file must not abort the gallery
                img_ax.text(0.5, 0.5, f"Image not found\n{row['Name']}",
                            horizontalalignment='center', verticalalignment='center')
                print(f"  - Error loading {row['Name']}: {e}")

        plt.tight_layout()
        plt.subplots_adjust(top=0.9, hspace=0.2)
        plt.show()
        print()
        print()

# To use this function, just specify your image directory:
display_cluster_examples(df_clustered, image_dir="../thumbnails")

In [None]:
# Bar chart of how many images landed in each cluster
cluster_counts = df_clustered['Cluster'].value_counts().sort_index()

plt.figure(figsize=(12, 6))
bars = plt.bar(cluster_counts.index, cluster_counts.values)

plt.title('Number of Images per Cluster')
plt.xlabel('Cluster ID')
plt.ylabel('Number of Images')
plt.xticks(cluster_counts.index)
plt.grid(axis='y', alpha=0.3)

# One pass per bar: paint it with the cluster's dominant color (the column
# with the highest mean in cluster_means, matched by position) and write the
# image count just above it.
for i, bar in enumerate(bars):
    dominant_key = cluster_means.iloc[i].idxmax().lower()
    is_named = dominant_key in ('yellow', 'orange', 'red', 'violet', 'blue',
                                'green', 'brown', 'black', 'white')
    bar.set_color(dominant_key if is_named else 'gray')

    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 0.5,
             f'{height}', ha='center', va='bottom')

plt.show()

In [None]:
# Find the most distinctive colors for each cluster
# numeric_cols holds the color columns: every numeric column except 'Cluster'
numeric_cols = [col for col in df_clustered.select_dtypes(include=['number']).columns
                if col != 'Cluster']

all_means = df_clustered[numeric_cols].mean()

# Ratio of each cluster's mean to the overall mean, per color. div(axis=1)
# aligns all_means with the columns, giving a float frame in one step instead
# of filling an object-dtype frame cell by cell.
distinctiveness = cluster_means.div(all_means, axis=1)

# Display most distinctive colors (those with highest ratio compared to overall average)
for cluster, ratios in distinctiveness.iterrows():
    print(f"\nCluster {cluster} distinctive colors:")
    for color, ratio in ratios.sort_values(ascending=False).items():
        if ratio > 1.2:  # Only show colors that appear at least 20% more than average
            print(f"  - {color}: {ratio:.2f}x the average ({cluster_means.loc[cluster, color]:.1f}% vs overall {all_means[color]:.1f}%)")

In [None]:
# Spread of each color inside each cluster, measured as the standard deviation
cluster_stds = df_clustered.groupby('Cluster')[numeric_cols].std()

# Heatmap: one row per cluster, one column per color
plt.figure(figsize=(14, 10))
sns.heatmap(cluster_stds, annot=True, fmt='.1f', cmap='viridis')
plt.title('Color Variation Within Each Cluster (Standard Deviation)')
plt.tight_layout()
plt.show()

# Rank clusters by their average per-color standard deviation (ascending)
mean_stds = cluster_stds.mean(axis=1).sort_values()
print("\nClusters from most homogeneous to most heterogeneous:")
for cluster_id, average_std in mean_stds.items():
    print(f"Cluster {cluster_id}: Average std dev = {average_std:.2f}")

In [None]:
def find_representative_images(df_clustered, X_scaled, kmeans, n=10, images_per_row=5,
                               image_dir="../thumbnails"):
    """Find and display the images closest to each cluster centroid.

    Parameters
    ----------
    df_clustered : pandas.DataFrame
        Table with 'Name' and 'Cluster' columns, row-aligned with ``X_scaled``.
    X_scaled : numpy.ndarray
        Feature matrix the clustering was fitted on.
    kmeans : fitted estimator exposing ``cluster_centers_``
    n : int
        Maximum number of representatives per cluster.
    images_per_row : int
        Number of thumbnails per figure row.
    image_dir : str
        Directory the thumbnails are read from.

    Returns
    -------
    dict
        Maps each non-empty cluster id to the DataFrame rows of its
        representatives, ordered from closest to farthest from the centroid.
    """
    representatives = {}

    n_clusters = len(kmeans.cluster_centers_)

    # Generate a distinct border color for each cluster
    cluster_colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))
    cluster_labels = df_clustered['Cluster'].to_numpy()

    for cluster_id in range(n_clusters):
        # Row positions of this cluster's members
        member_positions = np.flatnonzero(cluster_labels == cluster_id)
        cluster_size = len(member_positions)

        if cluster_size == 0:
            continue

        # Euclidean distance of every member to the centroid
        centroid = kmeans.cluster_centers_[cluster_id].reshape(1, -1)
        distances = euclidean_distances(X_scaled[member_positions], centroid).ravel()

        # The n nearest members, translated back to DataFrame row positions
        closest_positions = member_positions[np.argsort(distances)[:n]]
        representative_images = df_clustered.iloc[closest_positions]
        representatives[cluster_id] = representative_images

        # One figure per cluster
        rows = math.ceil(len(representative_images) / images_per_row)
        plt.figure(figsize=(images_per_row * 3, rows * 3 + 1))
        plt.suptitle(f"Cluster {cluster_id}: {cluster_size} images",
                     fontsize=16, y=0.98)

        for i, (_, img) in enumerate(representative_images.iterrows()):
            ax = plt.subplot(rows, images_per_row, i + 1)

            # Hide ticks and grid but keep the spines: axis('off') would hide
            # the spines as well, and with them the colored cluster border.
            ax.set_xticks([])
            ax.set_yticks([])
            ax.grid(False)
            for spine in ax.spines.values():
                spine.set_visible(True)
                spine.set_color(cluster_colors[cluster_id])
                spine.set_linewidth(5)

            # Shorten long filenames for display
            filename = img['Name']
            short_name = filename[:15] + '...' if len(filename) > 15 else filename
            ax.set_title(short_name, fontsize=10)

            image_path = os.path.join(image_dir, filename)
            try:
                ax.imshow(plt.imread(image_path))
            except Exception as e:
                # Best effort: show a placeholder and report why loading failed
                ax.text(0.5, 0.5, "Missing image", ha='center', va='center',
                        transform=ax.transAxes)
                ax.set_facecolor('#f0f0f0')  # Light gray background for missing images
                print(f"  - Error loading {filename}: {e}")

        plt.tight_layout(rect=[0, 0, 1, 0.95])  # Leave room for the suptitle
        plt.show()

    return representatives

# Call with the new parameter for controlling images per row
representative_images = find_representative_images(df_clustered, X_scaled, kmeans, n=10, images_per_row=5)

In [None]:
# Create a cluster comparison plot (radar chart)
def plot_cluster_comparison(cluster_means):
    """Draw one radar chart per cluster from its mean color profile.

    Each cluster's profile is divided by that cluster's own maximum, so every
    chart peaks at 1. Charts are laid out four per row, one spoke per column
    of ``cluster_means``.
    """
    # Scale every row by its own largest value
    row_peaks = cluster_means.max(axis=1)
    normalized_means = cluster_means.div(row_peaks, axis=0)

    fig = plt.figure(figsize=(15, 12))

    clusters_per_row = 4
    rows = math.ceil(len(cluster_means) / clusters_per_row)

    categories = cluster_means.columns
    N = len(categories)

    # Evenly spaced spokes; the first angle is repeated to close the polygon
    spoke_angles = np.linspace(0, 2*np.pi, N, endpoint=False).tolist()
    angles = spoke_angles + spoke_angles[:1]

    for slot, cluster in enumerate(normalized_means.index, start=1):
        ax = fig.add_subplot(rows, clusters_per_row, slot, polar=True)

        # Same closing trick for the values
        profile = normalized_means.loc[cluster].values.flatten().tolist()
        values = profile + profile[:1]

        ax.plot(angles, values, linewidth=2, label=f"Cluster {cluster}")
        ax.fill(angles, values, alpha=0.25)

        # One labelled tick per category, no radial tick labels
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(categories, size=8)
        ax.set_title(f"Cluster {cluster}", size=11)
        ax.set_yticklabels([])

    plt.tight_layout()
    plt.show()

plot_cluster_comparison(cluster_means)

In [None]:
# Analyze which colors contribute most to principal components
def analyze_pca_components(X_scaled, feature_names, pca, n_components=2):
    """Plot how strongly each feature (color) loads on the leading principal components.

    Parameters
    ----------
    X_scaled : array-like
        Not used by the plot; kept so existing calls keep working.
    feature_names : sequence of str
        Feature name for each position of a row of ``pca.components_``.
    pca : fitted PCA exposing ``components_`` and ``explained_variance_ratio_``
    n_components : int
        How many leading components to plot (fewer if the PCA has fewer).
    """
    # Column names that double as valid matplotlib color names
    paintable = {'yellow', 'orange', 'red', 'violet', 'blue',
                 'green', 'brown', 'black', 'white'}

    # Loadings: contribution of each feature to each plotted component
    loadings = pca.components_[:n_components]
    n_plots = len(loadings)

    plt.figure(figsize=(12, 8))
    for i, component in enumerate(loadings):
        # Grid height follows the number of plotted components, so no panel
        # slot is left blank.
        plt.subplot(n_plots, 1, i + 1)

        # Sort contributions by absolute value, largest first
        sorted_idx = np.argsort(np.abs(component))[::-1]
        sorted_features = [feature_names[j] for j in sorted_idx]
        sorted_loadings = component[sorted_idx]

        positions = np.arange(len(sorted_features))
        bars = plt.barh(positions, sorted_loadings)

        # Color bars by feature name
        for bar, feature in zip(bars, sorted_features):
            key = feature.lower()
            bar.set_color(key if key in paintable else 'gray')

        plt.axvline(x=0, color='gray', linestyle='--')
        plt.yticks(positions, sorted_features)
        plt.title(f'Feature Contributions to PC{i+1} (Explains {pca.explained_variance_ratio_[i]:.1%} of variance)')

    # Lay out once, after all panels exist
    plt.tight_layout()
    plt.show()

# Run the analysis (requires X_scaled and pca from the cells above, and numeric_cols from the distinctive-colors cell)
feature_names = numeric_cols  # Your color column names
analyze_pca_components(X_scaled, feature_names, pca)

In [None]:
# ! EXPERIMENTAL CODE (NOT SURE IF THIS MAKES SENSE)

def calculate_color_metrics(df):
    """Calculate contrast and saturation metrics for each image from its color shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Name' column; the other columns hold color percentages.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns, in this order:
        'Name', 'color_variance' (standard deviation of the row's color
        percentages), 'luminance_contrast' (largest minus smallest
        luminance-weighted color share) and 'avg_saturation'
        (saturation-weighted sum of color shares). All metric columns are
        present even when ``df`` has no rows.
    """
    # Drop the 'Name' column for calculations
    colors_only = df.drop('Name', axis=1)

    # Approximate per-color properties (hand-picked values)
    color_properties = {
        'Yellow': {'luminance': 0.8, 'saturation': 0.8},
        'Orange': {'luminance': 0.6, 'saturation': 0.9},
        'Red': {'luminance': 0.5, 'saturation': 1.0},
        'Violet': {'luminance': 0.4, 'saturation': 0.7},
        'Blue': {'luminance': 0.3, 'saturation': 0.8},
        'Green': {'luminance': 0.5, 'saturation': 0.7},
        'Brown': {'luminance': 0.3, 'saturation': 0.4},
        'Black': {'luminance': 0.0, 'saturation': 0.0},
        'White': {'luminance': 1.0, 'saturation': 0.0}
    }

    # Only colors that are both described above and present in the data take part
    known = [color for color in color_properties if color in colors_only.columns]

    result = pd.DataFrame(index=df.index)
    result['Name'] = df['Name']

    # Contrast proxy: standard deviation of each row's color percentages
    result['color_variance'] = colors_only.std(axis=1)

    if known:
        fractions = colors_only[known] / 100
        luminance = pd.Series({c: color_properties[c]['luminance'] for c in known})
        saturation = pd.Series({c: color_properties[c]['saturation'] for c in known})

        # Column-wise weighting of every row in one step
        weighted_luminance = fractions.mul(luminance, axis=1)
        result['luminance_contrast'] = (weighted_luminance.max(axis=1)
                                        - weighted_luminance.min(axis=1))
        result['avg_saturation'] = fractions.mul(saturation, axis=1).sum(axis=1)
    else:
        # No recognised color columns: both weighted metrics are zero
        result['luminance_contrast'] = 0.0
        result['avg_saturation'] = 0.0

    return result

def visualize_color_metrics(df, metrics_df):
    """Plot the computed metrics and their correlation with the color shares.

    Draws a 2x2 overview (saturation by dominant color, contrast versus
    saturation, and one histogram per metric), then a heatmap of the
    correlations between every color column of ``df`` (all columns except
    'Name') and the three metric columns of ``metrics_df``.
    """
    colors_only = df.drop('Name', axis=1)
    # Column with the largest value in each row
    dominant_colors = colors_only.idxmax(axis=1)

    plt.figure(figsize=(15, 10))

    # Panel 1: saturation grouped by dominant color
    plt.subplot(2, 2, 1)
    sns.boxplot(x=dominant_colors, y=metrics_df['avg_saturation'])
    plt.title('Saturation by Dominant Color')
    plt.xticks(rotation=45)

    # Panel 2: contrast against saturation
    plt.subplot(2, 2, 2)
    sns.scatterplot(x='avg_saturation', y='luminance_contrast', data=metrics_df)
    plt.title('Contrast vs Saturation')

    # Panels 3 and 4: distribution of each metric
    histogram_panels = [
        ('avg_saturation', 'Distribution of Saturation Values'),
        ('luminance_contrast', 'Distribution of Contrast Values'),
    ]
    for slot, (metric, heading) in enumerate(histogram_panels, start=3):
        plt.subplot(2, 2, slot)
        sns.histplot(metrics_df[metric], kde=True)
        plt.title(heading)

    plt.tight_layout()
    plt.show()

    # Correlations between color percentages and metrics, shown as a heatmap
    print("Correlations between color percentages and metrics:")
    metric_columns = metrics_df[['avg_saturation', 'luminance_contrast', 'color_variance']]
    correlation_matrix = pd.concat([colors_only, metric_columns], axis=1).corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation between Colors and Metrics')
    plt.tight_layout()
    plt.show()

# Calculate metrics and visualize
metrics_df = calculate_color_metrics(df)
visualize_color_metrics(df, metrics_df)

In [None]:
# 9. Save the results (optional)
# df_clustered.to_csv('clustered_thumbnails.csv', index=False)