In [15]:
import numpy as np
import pandas as pd 
import PIL as pil
import cv2
from sklearn import cluster
import seaborn as sn 
import os
import re
import matplotlib.pyplot as plt

import time
import math
import colorsys

In [16]:
def import_file_names(hashtags, image_directories, image_name_files, platform):
    df_filenames = pd.DataFrame(columns=["id", "platform", "hashtag", "filepath"])
    if platform == "instagram":
        #code for extracting a unique identifier from instagram urls.
        #will get replaced once I have downloaded the updated data.
        uuid_pattern = re.compile(r"https://www.instagram.com/\w*/(.*)/")
        for hashtag in hashtags:
            df_hashtag_filenames = pd.read_csv(image_name_files[hashtag], delimiter=",")
            for idx, row in df_hashtag_filenames.iterrows():
                filepath = image_directories[hashtag] + str(row["Id"])
                uuid = row["Id"]
                #print(idx,uuid, filepath)
                df_filenames = pd.concat([pd.DataFrame([[uuid, platform, hashtag, filepath]], columns=df_filenames.columns), df_filenames])
    if platform == "tiktok":

        for hashtag in hashtags:
            df_hashtag_filenames = pd.read_csv(image_name_files[hashtag], delimiter=",")
            for idx, row in df_hashtag_filenames.iterrows():
                uuid = row["id"]
                filepath = image_directories[hashtag] + str(uuid) + ".jpeg"
                #print(idx,uuid, filepath)
                df_filenames = pd.concat([pd.DataFrame([[uuid, platform, hashtag, filepath]], columns=df_filenames.columns), df_filenames])
            

    return df_filenames


            

            


**Für Instagram-Daten**

In [17]:
# reading in file paths
hashtags = ["climatechange", "climatecrisis", "savetheplanet", "klimakrise", "klimaschutz"]
image_directories = {
    "climatechange":"DATA\INSTA_DATA_COMPLETE\climatechange_images_names\\",
    "climatecrisis":"DATA\INSTA_DATA_COMPLETE\climatecrisis_images_names\\",
    "savetheplanet":"DATA\INSTA_DATA_COMPLETE\savetheplanet_images_names\\",
    "klimakrise":"DATA\INSTA_DATA_COMPLETE\klimakrise_images_names\\",
    "klimawandel":"DATA\INSTA_DATA_COMPLETE\klimawandel_images_names\\",
    "klimaschutz":"DATA\INSTA_DATA_COMPLETE\klimaschutz_images_names\\"
}

#CSVs where the filenames of images are saved
image_name_files = {
    "climatechange":"DATA\INSTA_DATA_COMPLETE\climatechange.csv",
    "climatecrisis":"DATA\INSTA_DATA_COMPLETE\climatecrisis.csv",
    "savetheplanet":"DATA\INSTA_DATA_COMPLETE\savetheplanet.csv",
    "klimakrise":"DATA\INSTA_DATA_COMPLETE\klimakrise.csv",
    "klimawandel":"DATA\INSTA_DATA_COMPLETE\klimawandel.csv",
    "klimaschutz":"DATA\INSTA_DATA_COMPLETE\klimaschutz.csv"

}

df_image_filepaths_instagram = import_file_names(hashtags, image_directories, image_name_files, "instagram")

**Für Tiktok-Daten**

In [18]:
#reading in file paths
hashtags = ["climatechange", "climatecrisis", "savetheplanet", "klimakrise", "klimaschutz"]
image_directories = {
    "climatechange":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\climatechange\climatechange Bilder Annika und Philipp\\",
    "climatecrisis":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\climatecrisis\climatecrisis Bilder philipp und Annika\\",
    "savetheplanet":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\savetheplanet\savetheplanet Bilder Annika und Philipp\\",
    "klimakrise":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimakrise\Klimakrise Bilder philipp und annika\\",
    "klimawandel":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimawandel\klimawandel Bilder Philipp und Annika\\",
    "klimaschutz":"DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\\"
}

image_name_files = {
    "climatechange":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\climatechange\column-filter-4c19b73d2271ac697cb14e7085f182dc.csv",
    "climatecrisis":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\climatecrisis\column-filter-ac7e6b48b705b231a6d0677de468caa0(1).csv",
    "savetheplanet":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\savetheplanet\column-filter-26348f885172ba823820df8ab3e1a1de.csv",
    "klimakrise":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\klimakrise\column-filter-a9851a7839f6530d133fe62f26b09c55.csv",
    "klimawandel":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\klimawandel\column-filter-1b666f03e9a3d630c2e42d4c735f5100.csv",
    "klimaschutz":"DATA\Daten TikTok gemerged\Tiktok csv gemerged\klimaschutz\column-filter-9c0da9f64cf3bc4f77a724d098048b66.csv"

}

df_image_filepaths_tiktok = import_file_names(hashtags, image_directories, image_name_files, "tiktok")

In [19]:
df_image_filepaths = pd.concat([df_image_filepaths_instagram, df_image_filepaths_tiktok])
#only for testing purposes. Remove before processing the data for real.
#df_image_filepaths = df_image_filepaths.sample(25)
df_image_filepaths

Unnamed: 0,id,platform,hashtag,filepath
0,klimaschutz_700.jpg,instagram,klimaschutz,DATA\INSTA_DATA_COMPLETE\klimaschutz_images_na...
0,klimaschutz_699.jpg,instagram,klimaschutz,DATA\INSTA_DATA_COMPLETE\klimaschutz_images_na...
0,klimaschutz_698.jpg,instagram,klimaschutz,DATA\INSTA_DATA_COMPLETE\klimaschutz_images_na...
0,klimaschutz_697.jpg,instagram,klimaschutz,DATA\INSTA_DATA_COMPLETE\klimaschutz_images_na...
0,klimaschutz_696.jpg,instagram,klimaschutz,DATA\INSTA_DATA_COMPLETE\klimaschutz_images_na...
...,...,...,...,...
0,7252904683222895918,tiktok,climatechange,DATA\Daten TikTok gemerged\TikTok Bilder gemer...
0,7291671021827116334,tiktok,climatechange,DATA\Daten TikTok gemerged\TikTok Bilder gemer...
0,7224978138751618305,tiktok,climatechange,DATA\Daten TikTok gemerged\TikTok Bilder gemer...
0,7256810993311812907,tiktok,climatechange,DATA\Daten TikTok gemerged\TikTok Bilder gemer...


In [20]:
df_clusters = pd.DataFrame(columns = ["id", "platform", "hashtag", "r", "g", "b", "h", "s", "v", "relative_frequency"])

In [21]:
# constants
eps = 8 #for DBSCAN
max_eps = 18 #for OPTICS
num_colors = 5#for Kmeans
min_samples = 128

color_mode = "hsv"
#color_mode = "rgb"

visualize = False

do_remove_noise = False

scale_factor = 0.12
thumbnail_size = (128,128)

distance_thresh = 84 # for Agglomerative Clustering

In [22]:
def distance_hsv_colors(x,y):
    dist_hue = min(pow(x[0] - y[0], 2), pow((x[0]+128) % 256 - (y[0]+128) % 256, 2))
    dist_sat = pow(x[1] - y[1], 2)
    dist_val = pow(x[2] - y[2], 2)

    return math.sqrt(dist_hue + dist_sat + dist_val)

### Clustering

In [23]:

total_num_rows = df_image_filepaths.shape[0]
num_rows = 0
for idx, row in df_image_filepaths.iterrows():
    num_rows+=1
    filepath = row["filepath"]
    try:
        with pil.Image.open(filepath) as img:
            #img = img.resize((math.floor(img.size[0]*scale_factor), math.floor(img.size[1]*scale_factor)))
            img.thumbnail(thumbnail_size)
            if color_mode == "hsv":
                img = img.convert('HSV')
            img_pixels = np.asarray(img).reshape(img.size[0]*img.size[1], 3)

            #DIFFERENT CLUSTERING ALGORITHMS - uncomment as needed.

            # K-MEANS: Simple, but limited. Only accepts a set number of clusters. 
            #model = cluster.KMeans(n_clusters=num_colors, n_init="auto")

            # DBSCAN: Good results and quite fast. Biggest drawback is its memory use (can be combatted by reducing image resolution.)
            model = cluster.DBSCAN(eps=eps,min_samples=min_samples)#, metric=distance_hsv_colors)

            #OPTICS: Similar to DBSCAN without the memory requirements, but very slow.
            #model = cluster.OPTICS(min_samples=min_samples, max_eps=max_eps)

            #Agglomerative: Returns clusters of relatively equal size, but difficult to control and slow.
            #model = cluster.AgglomerativeClustering(n_clusters=None, distance_threshold=distance_thresh, linkage='average')


            labels = model.fit_predict(img_pixels) 
            #print(labels)
            #model = cluster.BisectingKMeans(num_colors).fit(img_pixels)
            #centroids = model.cluster_centers_
            unique_labels, counts = np.unique(labels, return_counts=True)
            centroids = np.empty((unique_labels.shape[0], 3))
            for i, label in enumerate(unique_labels):
                #centroids[i] = img_pixels[labels == label].mean(axis=0)
                centroids[i] = np.median(img_pixels[labels == label], axis=0)
            
            if -1 in unique_labels and do_remove_noise:
                centroids = centroids[1:]
            #print(f"CENTROIDS: {centroids}")

            counts = counts / labels.shape[0]
            #print(counts, unique_labels)

            #visualization stuff
            if visualize:
                pie_colors = []
                for i, clr in enumerate(centroids):
                    pie_color = tuple(clr/255)
                    if color_mode == "hsv":
                        pie_color = colorsys.hsv_to_rgb(*pie_color)
                    pie_colors.append(pie_color)

                plt.imshow(img)
                plt.show()
                plt.pie(counts, colors=pie_colors)
                plt.show()
            # end visualization stuff

            for i, color in enumerate(centroids):
                if color_mode == "hsv":
                    h, s, v = tuple(color/255)
                    r, g, b = colorsys.hsv_to_rgb(h, s, v)
                elif color_mode == "rgb":
                    r, g, b = tuple(color/255)
                    h, s, v = colorsys.hsv_to_rgb(r, g, b)
                relative_frequency = counts[i]
                new_row = pd.DataFrame([[row["id"], row["platform"], row["hashtag"], r, g, b, h, s, v, relative_frequency]], columns=df_clusters.columns)
                if visualize:
                    print(new_row)
                df_clusters = pd.concat([new_row, df_clusters])
        
        if not visualize:
            print (f"\r {num_rows}/{total_num_rows}", end="")
    except FileNotFoundError as e:
        print(f"No File at {filepath}")


 3141/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7115744451003206917.jpeg
 3269/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7285381694939565344.jpeg
 3272/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7218976919541288197.jpeg
 3293/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7268692536632380704.jpeg
 3326/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7180385660988148998.jpeg
 3348/5968No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7241781385092943130.jpeg
No File at DATA\Daten TikTok gemerged\TikTok Bilder gemerged\klimaschutz\klimaschutz Bilder Philipp und Annika\7244224754188

In [100]:
print(len(list(df_clusters["id"].unique())))
print(df_clusters)

5809
                     id   platform        hashtag         r         g  \
0   7172368921318657322     tiktok  climatechange  0.956863  0.589127   
0   7172368921318657322     tiktok  climatechange  0.513725  0.301694   
0   7172368921318657322     tiktok  climatechange  0.650980  0.419931   
0   7172368921318657322     tiktok  climatechange  0.876471  0.521636   
0   7172368921318657322     tiktok  climatechange  0.776471  0.133979   
..                  ...        ...            ...       ...       ...   
0   klimaschutz_700.jpg  instagram    klimaschutz  0.455785  0.741176   
0   klimaschutz_700.jpg  instagram    klimaschutz  0.703391  0.776471   
0   klimaschutz_700.jpg  instagram    klimaschutz  0.349865  0.509804   
0   klimaschutz_700.jpg  instagram    klimaschutz  0.000000  0.270588   
0   klimaschutz_700.jpg  instagram    klimaschutz  0.216809  0.415686   

           b         h         s         v  relative_frequency  
0   0.063791  0.098039  0.933333  0.956863           

In [85]:
df_coarse_clusters = pd.DataFrame(columns = ["id", "platform", "hashtag", "r", "g", "b", "h", "s", "v", "relative_frequency"])  

In [25]:
#parameters for mean-shift-clustering

use_hsv = True

bin_seeding = True
bandwidth = None
if use_hsv:
    bandwidth = cluster.estimate_bandwidth(df_clusters[["h", "s", "v"]])
else:
    bandwidth = cluster.estimate_bandwidth(df_clusters[["r", "g", "b"]])

bandwidth = bandwidth /2


### Meta-Clustering: Clustern der Cluster, um ähnliche Farben zusammenzubringen

In [None]:
#performs mean-shift clustering on the different colors found in the first step

for id in df_clusters["id"].unique():
    print(id)
    colors_in_image = df_clusters.loc[df_clusters["id"]==id]

    color_values = None
    if use_hsv:
        color_values = colors_in_image[["h", "s", "v"]]
    else:
        color_values = colors_in_image[["r", "g", "b"]]
    
    model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding)
    labels = model.fit_predict(color_values)
    unique_labels = np.unique(labels)
    
    print(labels)
    centroids = np.empty((unique_labels.shape[0], 3))
    relative_frequencies = np.empty((unique_labels.shape[0]))
    for i, label in enumerate(unique_labels):
        #centroids[i] = img_pixels[labels == label].mean(axis=0)
        centroids[i] = np.median(color_values[labels == label], axis=0)
        rows_with_label = colors_in_image[labels == label]
    
    #visualization stuff
    pie_colors = []
    for i, clr in enumerate(centroids):
        pie_color = tuple(clr)
        if use_hsv:
            pie_color = colorsys.hsv_to_rgb(*pie_color)
        pie_colors.append(pie_color)

    plt.pie(relative_frequencies, colors=pie_colors)
    plt.show()
    # end visualization stuff
    #print(labels)
    #print(model.cluster_centers_)
    print(f"RELATIVE FREQUENCIES:\n{relative_frequencies}")
    print(f"CENTROIDS:\n{centroids}")
    print("\n")
    


In [14]:
output_path = "fine_clusters.csv"
df_clusters.to_csv(output_path, sep=";")