In [32]:
# Function to calculate mean
def mean(values):
    """Return the arithmetic mean of ``values``.

    Returns ``None`` when ``values`` is falsy (for example an empty list),
    so callers never hit a division by zero.
    """
    if not values:
        return None
    total = sum(values)
    return total / len(values)

# Function to calculate variance
def variance(values):
    """Return the population variance of ``values``.

    The squared deviations from the mean are averaged over ``len(values)``
    (not ``len(values) - 1``). Returns ``None`` when ``values`` is falsy.
    """
    centre = mean(values)
    if not values:
        return None
    squared_deviations = sum((item - centre) ** 2 for item in values)
    return squared_deviations / len(values)

# Function to calculate standard deviation
def standard_deviation(values):
    """Return the square root of ``variance(values)``.

    Returns ``None`` when ``values`` is falsy.
    """
    if not values:
        return None
    return variance(values) ** 0.5

# Function to calculate minimum
def minimum(values):
    """Return the smallest item of ``values``, or ``None`` when it is falsy."""
    if values:
        return min(values)
    return None

# Function to calculate maximum
def maximum(values):
    """Return the largest item of ``values``, or ``None`` when it is falsy."""
    if values:
        return max(values)
    return None

# Load data function
# Field names grouped by how their raw CSV text is converted.
_FLOAT_FIELDS = frozenset([
    'tempo', 'valence', 'popularity', 'acousticness', 'danceability', 'energy',
    'instrumentalness', 'liveness', 'speechiness', 'loudness', 'duration_ms',
])
_INT_FIELDS = frozenset(['year', 'key', 'explicit', 'mode'])
_LIST_FIELDS = frozenset(['artists', 'name', 'genres'])


def _parse_cell(field, raw):
    """Convert the raw text of one CSV cell according to its field name."""
    try:
        if field in _FLOAT_FIELDS:
            return float(raw)
        if field in _INT_FIELDS:
            return int(raw)
    except ValueError:
        # Invalid or missing numeric text is recorded as None.
        return None

    if field in _LIST_FIELDS:
        # The cell is split naively on ", " after removing the surrounding
        # brackets; double quotes are dropped and single quotes are trimmed
        # from each item.
        items = raw.replace('"', '').strip("[]").split(', ')
        return [item.strip("'") for item in items]
    if field == 'release_date':
        return raw.strip('"')
    return raw


def load_data(path, field_order, data):
    """Read a CSV file with a header row into per-column lists.

    Args:
        path: Path of the UTF-8 encoded CSV file.
        field_order: Field names in column order; the i-th name is applied
            to the i-th cell of every row. Extra cells in a row are ignored.
        data: Dict holding one list per name in ``field_order``. One value
            per data row is appended to each list.

    Returns:
        The same ``data`` dict, after appending.

    Conversion per field name:
        * numeric fields become ``float`` or ``int``; text that cannot be
          converted becomes ``None``;
        * ``artists``, ``name`` and ``genres`` become lists of strings;
        * every other field keeps its text.

    Blank lines are skipped. A row with fewer cells than ``field_order``
    gets ``None`` for the missing fields so that all columns stay the same
    length. A file with no header row leaves ``data`` unchanged.
    """
    import csv  # local import: the file has no top-level import block

    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # discard the header; tolerate an empty file
        for row in reader:
            if not row or (len(row) == 1 and not row[0].strip()):
                continue  # blank line

            for i, field in enumerate(field_order):
                if i < len(row):
                    value = _parse_cell(field, row[i])
                else:
                    value = None  # short row: keep columns aligned
                data[field].append(value)

    return data

# Function to compute similarity score between two music tracks
def similarity_between_tracks(track1, track2, method):
    """Compare two numeric feature vectors with the chosen ``method``.

    Supported methods:
        'euclidean' - square root of the summed squared differences.
        'manhattan' - sum of the absolute differences.
        'cosine'    - dot product divided by the product of the magnitudes;
                      ``None`` if either magnitude is zero.
        'jaccard'   - size of the set intersection over size of the set
                      union; ``None`` if the union is empty.
        'pearson'   - Pearson correlation coefficient; ``None`` if the
                      denominator is zero.

    Pairwise terms are built with ``zip``, so they cover only the shorter
    of the two sequences. Any other ``method`` value yields ``None``.
    """
    if method == 'euclidean':
        squared_gap = sum((x - y) ** 2 for x, y in zip(track1, track2))
        return squared_gap ** 0.5

    if method == 'manhattan':
        return sum(abs(x - y) for x, y in zip(track1, track2))

    if method == 'cosine':
        dot = sum(x * y for x, y in zip(track1, track2))
        norm1 = sum(x ** 2 for x in track1) ** 0.5
        norm2 = sum(y ** 2 for y in track2) ** 0.5
        if not (norm1 and norm2):
            return None
        return dot / (norm1 * norm2)

    if method == 'jaccard':
        shared = len(set(track1).intersection(set(track2)))
        combined = len(set(track1).union(set(track2)))
        if not combined:
            return None
        return shared / combined

    if method == 'pearson':
        avg1 = mean(track1)
        avg2 = mean(track2)
        co_deviation = sum((x - avg1) * (y - avg2) for x, y in zip(track1, track2))
        spread1 = sum((x - avg1) ** 2 for x in track1) ** 0.5
        spread2 = sum((y - avg2) ** 2 for y in track2) ** 0.5
        scale = spread1 * spread2
        if not scale:
            return None
        return co_deviation / scale

    # Unrecognised method names fall through to None.
    return None

# Function to compute similarity score between two artists
def similarity_between_artists(artist1, artist2, method, data):
    """Score two artists by comparing the values their entries select from ``data``.

    Each item of ``artist1`` and ``artist2`` is used as a key into ``data``;
    the looked-up values form the two vectors handed to
    ``similarity_between_tracks`` together with ``method``.

    NOTE(review): this presumably expects ``data`` to map each key to a
    single number - confirm against the caller.
    """
    vector_a = [data[key] for key in artist1]
    vector_b = [data[key] for key in artist2]
    return similarity_between_tracks(vector_a, vector_b, method)

# Query for genre with the best liveliness
def query_best_liveliness(data, genre_data):
    """Find the genre with the highest mean liveness.

    Args:
        data: Dict whose ``'liveness'`` list supplies the value for row ``i``.
        genre_data: Dict whose ``'genres'`` list supplies the genre names
            for row ``i``.

    Returns:
        A ``(genre, mean_liveness)`` tuple, or ``(None, None)`` when no row
        contributes a value. On a tie the genre seen first wins.

    Rows with no genres or with a ``None`` liveness are skipped, so a
    single missing value cannot break the averaging.

    NOTE(review): liveness is read from ``data`` at the row index of
    ``genre_data``; this presumably requires both to describe the same
    rows - confirm against the caller.
    """
    genre_liveliness = {}
    for i, genres in enumerate(genre_data['genres']):
        liveliness = data['liveness'][i]
        if not genres or liveliness is None:
            continue
        for g in genres:
            genre_liveliness.setdefault(g, []).append(liveliness)

    if not genre_liveliness:
        return None, None

    # Every list holds at least one value, so the division is safe.
    genre_mean_liveliness = {
        g: sum(values) / len(values) for g, values in genre_liveliness.items()
    }
    best_genre = max(genre_mean_liveliness, key=genre_mean_liveliness.get)
    return best_genre, genre_mean_liveliness[best_genre]

# Query for artist loudness
def query_artist_loudness(data, artist_data, query_type='highest'):
    """Find the artist with the highest or lowest mean loudness.

    Args:
        data: Dict whose ``'loudness'`` list supplies the value for row ``i``.
        artist_data: Dict whose ``'artists'`` list supplies the artist
            names for row ``i``.
        query_type: ``'highest'`` selects the largest mean; any other value
            selects the smallest.

    Returns:
        An ``(artist, mean_loudness)`` tuple, or ``(None, None)`` when no
        row contributes a value. On a tie the artist seen first wins.

    Rows with no artists or with a ``None`` loudness are skipped, so a
    single missing value cannot break the averaging.
    """
    artist_loudness = {}
    for i, artists in enumerate(artist_data['artists']):
        loudness = data['loudness'][i]
        if not artists or loudness is None:
            continue
        for a in artists:
            artist_loudness.setdefault(a, []).append(loudness)

    if not artist_loudness:
        return None, None

    # Every list holds at least one value, so the division is safe.
    artist_mean_loudness = {
        a: sum(values) / len(values) for a, values in artist_loudness.items()
    }
    pick = max if query_type == 'highest' else min
    artist = pick(artist_mean_loudness, key=artist_mean_loudness.get)
    return artist, artist_mean_loudness[artist]

# Column names for ./data.csv, listed in the positional order load_data
# applies them to each row.
field_order = [
    'valence', 'year', 'acousticness', 'artists', 'danceability',
    'duration_ms', 'energy', 'explicit', 'id', 'instrumentalness',
    'key', 'liveness', 'loudness', 'mode', 'name',
    'popularity', 'release_date', 'speechiness', 'tempo',
]

# One empty list per column, to be filled by load_data.
data = {column: [] for column in field_order}

# Column names for ./data_genres.csv, again in positional order.
field_order2 = [
    'mode', 'genres', 'acousticness', 'danceability', 'duration_ms',
    'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence', 'popularity', 'key',
]

data_genres = {column: [] for column in field_order2}

# Read both CSV files into their column dicts.
data = load_data('./data.csv', field_order, data)
data_genres = load_data('./data_genres.csv', field_order2, data_genres)


#Example






Euclidean Similarity: 0.17320508075688776
Manhattan Similarity: 0.30000000000000004
Cosine Similarity: 0.983992916902926
Jaccard Similarity: 0.0
Pearson Similarity: 0.9332565252573827
