# Einlesen der Testdatensätze und Anpassung der Daten

In [1]:
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import AffinityPropagation
from sklearn.metrics import adjusted_rand_score, pairwise_distances
import numpy as np
import warnings
from sklearn import metrics
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load the test data set from the current working directory. The cells below read its
# 'ID 1' column as the class label and drop 'ID 1' and 'Grade' to obtain the features.
df_test = pd.read_csv('test_data.csv')

# Writing a function to map the St 48 and St 37 variants to just St 48 and St 37

def map_variants(dataframe, column_name):
    """Collapse label variants in one column to 'St 48' / 'St 37'.

    A string value containing the substring '48' becomes 'St 48'; otherwise a
    string value containing '37' becomes 'St 37'. '48' is checked first, so a
    value containing both maps to 'St 48'. Every other value is left unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column. It is modified IN PLACE.
    column_name : str
        Name of the column to normalize.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).
    """
    def _normalize(value):
        # Missing values (NaN/None) and numbers cannot be substring-searched;
        # `'48' in value` would raise TypeError, so pass them through as-is.
        if not isinstance(value, str):
            return value
        if '48' in value:
            return 'St 48'
        if '37' in value:
            return 'St 37'
        return value

    dataframe[column_name] = dataframe[column_name].apply(_normalize)
    return dataframe

# Normalize the 'ID 1' labels. map_variants writes into the frame it receives, so
# data_mapped and df_test are the same, already normalized object.
data_mapped = map_variants(df_test, 'ID 1')

# Raise the row display limit, then show how many samples each label has.
pd.set_option('display.max_rows', 500)
print(df_test['ID 1'].value_counts())

# Feature matrix: every column except the label ('ID 1') and 'Grade'.
features = df_test.drop(columns=['ID 1', 'Grade'])

# log1p transform (log(1 + x)); the transformer returns a plain array, so it is wrapped
# back into a DataFrame with default integer column names.
scaler = FunctionTransformer(np.log1p, validate=True)
df_scaled = pd.DataFrame(scaler.fit_transform(features))
# Disabled exploratory check of the negative L1 distances:
#pd.DataFrame(-pairwise_distances(df_scaled,metric='l1').round(2)).mode(axis=1).min().min()

ID 1
St 48    229
St 37     24
StSi       9
Name: count, dtype: int64


# Affinity Propagation Algorithmus mit verschiedenen Evaluationsmetriken

Der Algorithmus kann auch mit PCA ausgeführt werden; bei den unten aufgeführten Testdurchläufen hat sich jedoch ergeben, dass ohne PCA in den meisten Fällen bessere Ergebnisse erzielt werden.

In [30]:
def affinity_propagation(data, damping=0.6, preference_scale=0.6, metric='l1', random_state=None):
    """Cluster `data` with Affinity Propagation and print evaluation metrics.

    The ground-truth labels are read from the module-level `df_test['ID 1']`,
    so `data` must contain one row per row of `df_test`, in the same order.
    To evaluate a PCA-reduced representation, pass the reduced array as `data`.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    damping : float, default=0.6
        Damping factor handed to AffinityPropagation.
    preference_scale : float, default=0.6
        Factor applied to the smallest negative pairwise distance to obtain
        the `preference` value.
    metric : str, default='l1'
        Metric for `pairwise_distances`. It is used only to derive the
        preference; no `affinity` is passed to AffinityPropagation, so the
        clustering itself uses the estimator's default similarity.
    random_state : int, RandomState instance or None, default=None
        Forwarded to AffinityPropagation; pass an int for repeatable runs.

    Returns
    -------
    None
        All results are printed.
    """
    # Integer codes of the true classes, taken from the global test frame.
    labels_true = df_test['ID 1'].astype('category').cat.codes

    # Preference = scaled most-negative similarity under the chosen metric.
    preference = preference_scale * np.min(-pairwise_distances(data, metric=metric))
    algorithm = AffinityPropagation(damping=damping, preference=preference, random_state=random_state)
    algorithm.fit(data)
    labels = algorithm.labels_

    n_clusters = len(np.unique(labels))
    print("Estimated number of clusters: %d" % n_clusters)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print(
        "Adjusted Mutual Information: %0.3f"
        % metrics.adjusted_mutual_info_score(labels_true, labels)
    )

    # The silhouette must be computed on the data that was actually clustered
    # (previously the global df_scaled was used regardless of `data`). It is
    # only defined for 2 .. n_samples - 1 distinct labels; silhouette_score
    # raises outside that range, e.g. when the fit did not converge.
    if 1 < n_clusters < len(labels):
        print(
            "Silhouette Coefficient: %0.3f"
            % metrics.silhouette_score(data, labels, metric="sqeuclidean")
        )
    else:
        print("Silhouette Coefficient: undefined for %d cluster(s)" % n_clusters)

In [31]:
# Cluster the log1p-transformed features (no PCA) and print the evaluation metrics.
affinity_propagation(df_scaled)

Estimated number of clusters: 3
Homogeneity: 0.773
Completeness: 0.612
V-measure: 0.683
Adjusted Rand Index: 0.722
Adjusted Mutual Information: 0.678
Silhouette Coefficient: 0.610


Da ich diese Optimierung nur auf den zwei vorhandenen Datensätzen ausprobiert habe und die Datensätze nicht sehr umfangreich sind, ist die Gefahr von Overfitting hier natürlich hoch. Wenn neue Datensätze dazukommen, müssen die Parameter entsprechend angepasst werden.

In [8]:
def optimize_affinity_propagation_advanced(data, candidate_metrics=None):
    """Grid-search Affinity Propagation settings that maximize the ARI.

    For every metric, damping factor and preference scale, the preference is
    set to `scale * min(-pairwise_distances(data, metric))`, the data are
    clustered, and the Adjusted Rand Index against the true labels from the
    module-level `df_test['ID 1']` is computed. The first combination with the
    strictly highest ARI wins.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix; rows must line up with `df_test`.
    candidate_metrics : sequence of str, optional
        Metric names for `pairwise_distances`. Defaults to the original list.
        Several default entries are aliases of each other (e.g. 'l1',
        'manhattan', 'cityblock'), so they repeat the same preference values.

    Returns
    -------
    tuple
        (best_metric, best_score, best_n_clusters, best_damping,
        best_preference, best_scale). If no combination reaches an ARI above
        0, the sentinels ("", 0, 0, 0, 0, 0) are returned.
    """
    true_labels = df_test['ID 1'].astype('category').cat.codes

    # Named so that it does not shadow the imported `sklearn.metrics` module.
    if candidate_metrics is None:
        candidate_metrics = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'chebyshev',
                             'hamming', 'matching', 'minkowski', 'nan_euclidean']

    best_score = 0
    best_metric = ""
    best_n_clusters = 0
    best_damping = 0
    best_preference = 0
    # Must be initialised too: otherwise the return statement raises
    # UnboundLocalError when no combination improves on best_score.
    best_scale = 0

    for metric in candidate_metrics:
        # The distance matrix depends only on the metric, so compute its
        # minimum once per metric instead of once per parameter combination.
        try:
            base_preference = np.min(-pairwise_distances(data, metric=metric))
        except Exception as e:
            print(f"Error with metric {metric}: {e}")
            continue

        for damping in np.arange(0.5, 0.7, 0.01):
            for preference_scale in np.arange(0.5, 1, 0.02):
                try:
                    preference = preference_scale * base_preference

                    # Apply Affinity Propagation
                    algorithm = AffinityPropagation(damping=damping, preference=preference, max_iter=200, convergence_iter=15)
                    algorithm.fit(data)
                    labels = algorithm.labels_
                    n_clusters = len(np.unique(labels))

                    # Agreement between the clustering and the true labels
                    ari_score = adjusted_rand_score(true_labels, labels)

                    # Strict '>' keeps the earliest combination on ties
                    if ari_score > best_score:
                        best_score = ari_score
                        best_metric = metric
                        best_n_clusters = n_clusters
                        best_damping = damping
                        best_preference = preference
                        best_scale = preference_scale

                except Exception as e:
                    print(f"Error with metric {metric}, damping {damping}, preference scale {preference_scale}: {e}")

    return best_metric, best_score, best_n_clusters, best_damping, best_preference, best_scale

# Run the grid search on the log1p-transformed features and keep every part of the result.
(
    best_metric,
    best_score,
    best_n_clusters,
    best_damping,
    best_preference,
    best_scale,
) = optimize_affinity_propagation_advanced(df_scaled)

# Last expression of the cell: display the winning configuration as a tuple.
(best_metric, best_score, best_n_clusters, best_damping, best_preference, best_scale)



('l1',
 0.7224176034057344,
 3,
 0.6000000000000001,
 -0.8936924343096005,
 0.6000000000000001)