Library imports
___

In [None]:
import math
from graphviz import Graph
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import squareform

In [None]:
# Read the raw samples from the CSV file.
data = pd.read_csv('./dataset.csv')

# Proximity scales for the categorical features, following the hybrid-data
# approach seen in class (code/metrics/hybrid_data).
# Each mapping goes from a position on the scale to a category label: labels
# sitting at close positions are treated as similar, labels sitting far apart
# are treated as different.
musics = {
    0: "other", 0.1: "classical", 0.2: "jazz",
    3: "hiphop", 3.1: "trap", 3.2: "rap",
    4.2: "rock", 4.3: "metal", 4.4: "technical death metal",
}

citys = {0: "lille", 0.25: "paris", 0.5: "toulouse", 0.75: "marseille", 6: "madrid"}

jobs = {
    0: "doctor", 2: "teacher", 5: "fireman", 7: "painter",
    10: "designer", 11: "developper", 12: "engineer",
}

# Column groups handed to the preprocessing step.
numerical_features = ['age', 'height']
categorical_features = ['job', 'city', 'favorite music style']

# Numerical columns are standardised, categorical columns are one-hot encoded.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the preprocessing on the whole dataset and keep the transformed matrix.
X_processed = preprocessor.fit_transform(data)

def dissimilarity_cat(cat_x1, cat_x2, data, indexColomn):
    """Return the capped distance between two samples on one categorical scale.

    Args:
        cat_x1: Categorical values of the first sample. Must support
            positional access through ``.iloc`` (e.g. a pandas Series).
        cat_x2: Categorical values of the second sample, same layout.
        data: Mapping from a numeric position on the scale to a category
            label.
        indexColomn: Position, inside ``cat_x1`` and ``cat_x2``, of the value
            to compare.

    Returns:
        The absolute difference between the two scale positions, or 6 when
        that difference is 3 or more. A label that is absent from ``data``
        is placed at position -1.
    """
    label_1 = cat_x1.iloc[indexColomn]
    label_2 = cat_x2.iloc[indexColomn]

    key1 = -1
    key2 = -1
    for key, value in data.items():
        # Compare with equality, not ``in``: on strings ``in`` is a substring
        # test, so "metal" would also match "technical death metal" and
        # "rap" would also match "trap".
        if label_1 == value:
            key1 = key
        if label_2 == value:
            key2 = key

    res = abs(key1 - key2)
    # Positions 3 or more apart belong to unrelated groups: use the maximum.
    return 6 if res >= 3 else res
    

def custom_dissimilarity(x1, x2):
    """Return the weighted dissimilarity between two samples.

    Both samples are read positionally: positions 1 and 2 are treated as the
    numerical features, positions 3, 4 and 5 as job, city and favorite music
    style. The categorical distances come from ``dissimilarity_cat`` applied
    to the module-level ``jobs``, ``citys`` and ``musics`` scales.

    A trace of the comparison is printed on standard output.

    Args:
        x1: First sample, a pandas Series (one row of the dataset).
        x2: Second sample, same layout as ``x1``.

    Returns:
        The square root of the weighted sum of the numerical, music, city
        and job distances.
    """
    # Positions 1 and 2 hold the numerical features. A slice end is
    # exclusive, so 1:3 is required to take both of them (1:2 would silently
    # drop the second one).
    num_x1 = x1.iloc[1:3].astype(float)
    num_x2 = x2.iloc[1:3].astype(float)

    # Positions 3, 4 and 5 hold the categorical features.
    cat_x1 = x1.iloc[3:].astype(str)
    cat_x2 = x2.iloc[3:].astype(str)

    # Euclidean distance on the numerical features, capped at 6 so that it
    # stays on the same range as the categorical distances.
    num_distance = np.linalg.norm((num_x1 - num_x2).to_numpy())
    if num_distance > 6:
        num_distance = 6

    # Distance on each categorical feature.
    music_distance = dissimilarity_cat(cat_x1, cat_x2, musics, 2)
    city_distance = dissimilarity_cat(cat_x1, cat_x2, citys, 1)
    job_distance = dissimilarity_cat(cat_x1, cat_x2, jobs, 0)

    # Weight of each component in the final score.
    num_weight = 5
    music_weight = 4
    city_weight = 5
    job_weight = 1

    # Two cities of the same country, or two musics of the same musical
    # style, are strictly between 0 and 1 apart on their scale: such pairs
    # get a fixed distance of 2.
    if 0 < city_distance < 1:
        city_distance = 2
    if 0 < music_distance < 1:
        music_distance = 2

    # Combine the distances with their weights.
    dissimilarity = math.sqrt(
        (num_weight * num_distance)
        + music_distance * music_weight
        + city_distance * city_weight
        + job_distance * job_weight
    )
    print("\n\n--------compare-----------")
    print(x1)
    print("------------with------------ ")
    print(x2)
    print("----------distance--------- ")
    print("num: ")
    print(num_distance)
    print("music: ")
    print(music_distance)
    print("city: ")
    print(city_distance)
    print("job: ")
    print(job_distance)
    print("--------dissimilarity------- ")
    print(dissimilarity)
    return dissimilarity

# Square matrix holding the dissimilarity of every ordered pair of samples.
num_samples = len(data.index)
dissimilarity_matrix = np.zeros((num_samples, num_samples))

# np.ndindex walks the cells row by row, like two nested range() loops.
for row, col in np.ndindex(num_samples, num_samples):
    dissimilarity_matrix[row, col] = custom_dissimilarity(
        data.iloc[row], data.iloc[col]
    )

# Summary statistics computed over all the cells of the matrix.
mean_dissimilarity = dissimilarity_matrix.mean()
std_dissimilarity = dissimilarity_matrix.std()

# Persist the matrix in a .npy file.
np.save('dissimilarity_matrix.npy', dissimilarity_matrix)

# Report the statistics, then read the file back and show its content.
print("Mean Dissimilarity:", mean_dissimilarity)
print("Standard Deviation of Dissimilarity:", std_dissimilarity)
print ("Loading file ...")
loaded_dissimilarity_matrix = np.load('dissimilarity_matrix.npy')
print(loaded_dissimilarity_matrix)


In [None]:
# Two samples are linked when their dissimilarity is strictly above this value.
threshold = 8.7

dot = Graph(comment="Graph created from complex data", strict=True)

# Label of every sample: position 5 of a row is the column in which
# "favorite music style" is saved.
music_labels = [data.loc[sample_id].iloc[5] for sample_id in range(num_samples)]

# One node per label.
for label in music_labels:
    dot.node(label)

# Walk every ordered pair of samples and draw an edge between the labels of
# the pairs that are dissimilar enough.
for sample_1_id, player_1_name in enumerate(music_labels):
    for sample_2_id, player_2_name in enumerate(music_labels):
        # A sample is never compared with itself.
        if sample_1_id == sample_2_id:
            continue
        if dissimilarity_matrix[sample_1_id, sample_2_id] > threshold:
            dot.edge(
                player_1_name,
                player_2_name,
                color="darkolivegreen4",
                penwidth="1.1",
            )

# Caption the graph with the threshold and write it to disk.
dot.attr(label=f"threshold {threshold}", fontsize="20")
graph_name = f"images/complex_data_threshold_{threshold}"
dot.render(graph_name)
