In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Lift pandas' column-display limit so wide frames are shown in full rather than truncated.
pd.set_option('display.max_columns', None)

In [None]:
# Load the merged dataset into a DataFrame for inspection in the following cells.
# The MelodateRecommender class below reads the CSV again from its own path argument.
df = pd.read_csv('merged_dataset.csv')

In [None]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,user,age,status,gender,drinks,height,smokes,religion,genre,music_decade,music_vibe,listening_frequency,concert
0,user0,31,single,female,yes,168.0,no,hinduism,K-Pop,2020s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
1,user1,50,single,female,yes,175.0,no,buddhism,K-Pop,2020s,Upbeat and Energetic,Frequently,"Sometimes, depending on the artist or event"
2,user2,25,single,male,yes,188.0,no,confucianism,Traditional & Folk Music,2000s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
3,user3,39,single,male,yes,175.0,no,hinduism,Pop,2010s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
4,user4,22,single,male,yes,170.0,yes,buddhism,Indie/Alternative,1980s,Upbeat and Energetic,Frequently,"Sometimes, depending on the artist or event"


In [None]:
# Report per-column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   user                 300 non-null    object 
 1   age                  300 non-null    int64  
 2   status               300 non-null    object 
 3   gender               300 non-null    object 
 4   drinks               300 non-null    object 
 5   height               300 non-null    float64
 6   smokes               300 non-null    object 
 7   religion             300 non-null    object 
 8   genre                300 non-null    object 
 9   music_decade         300 non-null    object 
 10  music_vibe           300 non-null    object 
 11  listening_frequency  300 non-null    object 
 12  concert              300 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 30.6+ KB


In [None]:
class MelodateRecommender:
    """Two-stage user matcher: cluster on general attributes, rank by music taste.

    Pipeline (see ``process_all``):

    1. ``pre_process`` lower-cases the categorical columns, min-max scales the
       numeric columns and one-hot encodes the categorical ones.
    2. ``clustering`` runs KMeans on the ``general_pref`` feature columns.
    3. ``cos_similarity`` computes, inside each cluster, the cosine similarity
       of the ``music_pref`` feature columns and stores a ranked match list
       per user.

    Attributes:
        data: raw DataFrame read from ``data_path`` (categorical columns are
            lower-cased in place by ``pre_process``).
        preprocessor: fitted ``ColumnTransformer`` (``None`` until ``pre_process``).
        kmeans_model: fitted ``KMeans`` (``None`` until ``clustering``).
        preprocessed_data: encoded feature frame indexed by the ``user`` column;
            gains a ``cluster_label`` column after ``clustering``.
        user_similarities: ``{user: [(other_user, score), ...]}`` sorted by
            descending score (``None`` until ``cos_similarity``).
    """

    def __init__(self, data_path):
        """Read the dataset and declare which columns feed each stage.

        Args:
            data_path: anything ``pandas.read_csv`` accepts (path or buffer).
        """
        self.cat_columns = ['religion', 'smokes', 'drinks', 'genre', 'music_vibe', 'music_decade', 'listening_frequency', 'concert']
        self.num_columns = ['age', 'height']

        # Encoded feature names used for clustering (numeric + one-hot columns).
        self.general_pref = [
            'religion_buddhism', 'religion_confucianism', 'religion_hinduism', 'religion_islam',
            'religion_protestant christianity', 'religion_roman catholicism',
            'age', 'height',
            'smokes_yes', 'smokes_no', 'drinks_yes', 'drinks_no'
        ]

        # Encoded feature names used for the within-cluster cosine similarity.
        self.music_pref = [
            'genre_ballad', 'genre_blues', 'genre_classical', 'genre_country', 'genre_dangdut',
            'genre_edm', 'genre_edm (electronic dance music)', 'genre_hip-hop/rap', 'genre_indie/alternative',
            'genre_j-pop', 'genre_jazz', 'genre_k-pop', 'genre_metal', 'genre_pop', 'genre_pop indonesia',
            'genre_punk', 'genre_r&b', 'genre_reggae', 'genre_rock', 'genre_soul', 'genre_traditional & folk music',

            'music_vibe_dark and intense', 'music_vibe_emotional and deep', 'music_vibe_relaxing and chill',
            'music_vibe_romantic and smooth', 'music_vibe_upbeat and energetic',

            'music_decade_1970s', 'music_decade_1980s', 'music_decade_1990s', 'music_decade_2000s',
            'music_decade_2010s', 'music_decade_2020s',

            'listening_frequency_frequently', 'listening_frequency_never', 'listening_frequency_occasionally',
            'listening_frequency_only in specific situations', 'listening_frequency_rarely',

            'concert_no, i prefer not to attend concerts', 'concert_sometimes, depending on the artist or event', 'concert_yes, i love attending concerts'
        ]

        self.data = pd.read_csv(data_path)
        self.preprocessor = None
        self.kmeans_model = None
        self.preprocessed_data = None
        self.user_similarities = None

    def process_all(self, n_clusters=5):
        """Run preprocessing, clustering and similarity ranking in order.

        Args:
            n_clusters: number of KMeans clusters.

        Returns:
            ``self``, so the call can be chained.
        """
        self.pre_process()
        self.clustering(n_clusters)
        self.cos_similarity()
        return self

    def pre_process(self):
        """Scale numeric columns and one-hot encode categorical columns.

        Categorical values in ``self.data`` are lower-cased in place so the
        generated one-hot column names match the lower-case names listed in
        ``general_pref`` / ``music_pref``.

        Returns:
            The encoded DataFrame (also stored on ``self.preprocessed_data``),
            indexed by the ``user`` column.
        """
        for col in self.cat_columns:
            self.data[col] = self.data[col].str.lower()

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', MinMaxScaler(), self.num_columns),
                ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), self.cat_columns)
            ])

        preprocessed_data = preprocessor.fit_transform(self.data)

        # ColumnTransformer emits the numeric block first, then the one-hot block,
        # so the column names are assembled in that same order.
        onehot_categories = preprocessor.named_transformers_['cat'].get_feature_names_out(self.cat_columns)
        feature_names = self.num_columns + list(onehot_categories)

        preprocessed_df = pd.DataFrame(
            preprocessed_data,
            columns=feature_names,
            index=self.data['user']
        )

        self.preprocessor = preprocessor
        self.preprocessed_data = preprocessed_df

        return preprocessed_df

    def _select_features(self, frame, columns):
        """Return ``columns`` from ``frame``, zero-filling any the encoder did not produce.

        A one-hot column that is absent simply means no row had that category,
        so an all-zero column is the faithful representation. When every
        requested column exists the result equals ``frame[columns]``.
        """
        return frame.reindex(columns=columns, fill_value=0.0)

    def clustering(self, n_clusters=5):
        """Fit KMeans on the general-preference features and label every user.

        Adds/overwrites the ``cluster_label`` column on ``self.preprocessed_data``.

        Args:
            n_clusters: number of clusters to fit.

        Returns:
            Array of cluster labels, one per row of ``self.preprocessed_data``.

        Raises:
            RuntimeError: if ``pre_process`` has not been run yet.
        """
        if self.preprocessed_data is None:
            raise RuntimeError("pre_process() must be called before clustering().")

        cluster_data = self._select_features(self.preprocessed_data, self.general_pref)

        # random_state pins the initialisation; n_init is left at the library
        # default, which differs between scikit-learn releases.
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        cluster_labels = kmeans.fit_predict(cluster_data)

        self.kmeans_model = kmeans
        self.preprocessed_data['cluster_label'] = cluster_labels

        return cluster_labels

    def cos_similarity(self):
        """Rank, for every user, the other users of the same cluster by music similarity.

        Returns:
            ``{user: [(other_user, score), ...]}`` with each list sorted by
            descending cosine similarity and excluding the user themself
            (also stored on ``self.user_similarities``).

        Raises:
            RuntimeError: if ``clustering`` has not been run yet.
        """
        if self.preprocessed_data is None or 'cluster_label' not in self.preprocessed_data.columns:
            raise RuntimeError("clustering() must be called before cos_similarity().")

        self.user_similarities = {}

        for cluster_label in self.preprocessed_data['cluster_label'].unique():
            cluster_users = self.preprocessed_data[
                self.preprocessed_data['cluster_label'] == cluster_label
            ]
            members = cluster_users.index

            music_cluster_data = self._select_features(cluster_users, self.music_pref).values

            # Square matrix: row i holds member i's similarity to every member.
            similarities = cosine_similarity(music_cluster_data)

            for idx, user in enumerate(members):
                # Drop the self-match, then order best-first (sorted() is stable,
                # so ties keep the cluster's row order).
                self.user_similarities[user] = sorted(
                    [(other, score) for other, score in zip(members, similarities[idx]) if other != user],
                    key=lambda pair: pair[1],
                    reverse=True
                )

        return self.user_similarities

    def save_model(self, filepath='melodate_recommender.joblib'):
        """Persist the fitted artefacts to ``filepath`` with joblib.

        The dumped object is a dict with the keys ``preprocessor``,
        ``kmeans_model``, ``preprocessed_data`` and ``user_similarities``.

        Args:
            filepath: destination file.
        """
        # Imported here because the import cell visible at the top of the
        # notebook does not bring joblib into scope; without this the call
        # raises NameError on a fresh kernel.
        import joblib

        joblib.dump({
            'preprocessor': self.preprocessor,
            'kmeans_model': self.kmeans_model,
            'preprocessed_data': self.preprocessed_data,
            'user_similarities': self.user_similarities
        }, filepath)
        print(f"Model saved to {filepath}")

In [None]:
# Build the recommender and run the whole pipeline in one expression
# (process_all returns the instance), then persist the fitted artefacts.
recommender = MelodateRecommender('merged_dataset.csv').process_all(n_clusters=5)

recommender.save_model('melodate_recommender.joblib')

Model saved to melodate_recommender.joblib


In [None]:
# User whose ranked matches should be displayed
username = 'user0'

if username not in recommender.user_similarities:
    print(f"User '{username}' not found in the similarities data.")
else:
    # Each entry is a (username, similarity score) pair, best match first
    matches = recommender.user_similarities[username]

    # Keep the names and scores available as parallel lists
    match_usernames = [name for name, _ in matches]
    match_scores = [value for _, value in matches]

    print(f"{username}'s top matches:")
    for user, score in matches:
        print(f"- {user}: Similarity score = {score:.4f}")


user0's top matches:
- user69: Similarity score = 0.8000
- user3: Similarity score = 0.6000
- user6: Similarity score = 0.6000
- user16: Similarity score = 0.6000
- user26: Similarity score = 0.6000
- user151: Similarity score = 0.6000
- user175: Similarity score = 0.6000
- user182: Similarity score = 0.6000
- user205: Similarity score = 0.6000
- user10: Similarity score = 0.4000
- user18: Similarity score = 0.4000
- user28: Similarity score = 0.4000
- user33: Similarity score = 0.4000
- user36: Similarity score = 0.4000
- user42: Similarity score = 0.4000
- user43: Similarity score = 0.4000
- user44: Similarity score = 0.4000
- user51: Similarity score = 0.4000
- user58: Similarity score = 0.4000
- user64: Similarity score = 0.4000
- user70: Similarity score = 0.4000
- user96: Similarity score = 0.4000
- user101: Similarity score = 0.4000
- user109: Similarity score = 0.4000
- user137: Similarity score = 0.4000
- user138: Similarity score = 0.4000
- user145: Similarity score = 0.4000
