# Melodate



In [1]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.layers import Input
from tensorflow.keras.optimizers import Adam

In [2]:
# Load dataset
dataset_path = 'merged_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,user,age,status,gender,drinks,height,smokes,religion,genre,music_decade,music_vibe,listening_frequency,concert
0,user0,31,single,female,yes,168.0,no,hinduism,K-Pop,2020s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
1,user1,50,single,female,yes,175.0,no,buddhism,K-Pop,2020s,Upbeat and Energetic,Frequently,"Sometimes, depending on the artist or event"
2,user2,25,single,male,yes,188.0,no,confucianism,Traditional & Folk Music,2000s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
3,user3,39,single,male,yes,175.0,no,hinduism,Pop,2010s,Relaxing and Chill,Frequently,"No, I prefer not to attend concerts"
4,user4,22,single,male,yes,170.0,yes,buddhism,Indie/Alternative,1980s,Upbeat and Energetic,Frequently,"Sometimes, depending on the artist or event"


In [3]:
# Selecting categorical columns for one-hot encoding
categorical_cols = ['status', 'gender', 'drinks', 'smokes', 'religion', 'genre', 'music_decade', 'music_vibe', 'listening_frequency', 'concert']

# Fit and transform the categorical data
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(data[categorical_cols])

# Create a DataFrame with the encoded data
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

# Merge the encoded features with the numeric features
numeric_cols = ['age', 'height']
final_dataset = pd.concat([data[numeric_cols], encoded_df], axis=1)

final_dataset.head()


Unnamed: 0,age,height,status_in a relationship,status_single,gender_female,gender_male,drinks_no,drinks_yes,smokes_no,smokes_yes,...,music_vibe_Romantic and Smooth,music_vibe_Upbeat and Energetic,listening_frequency_Frequently,listening_frequency_Never,listening_frequency_Occasionally,listening_frequency_Only in specific situations,listening_frequency_Rarely,"concert_No, I prefer not to attend concerts","concert_Sometimes, depending on the artist or event","concert_Yes, I love attending concerts"
0,31,168.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,50,175.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,25,188.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,39,175.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,22,170.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
# Normalize numeric features
scaler = MinMaxScaler()
final_dataset[['age', 'height']] = scaler.fit_transform(final_dataset[['age', 'height']])

# Split the data into training and testing sets
X_train, X_test = train_test_split(final_dataset, test_size=0.2, random_state=42)

X_train.head()

Unnamed: 0,age,height,status_in a relationship,status_single,gender_female,gender_male,drinks_no,drinks_yes,smokes_no,smokes_yes,...,music_vibe_Romantic and Smooth,music_vibe_Upbeat and Energetic,listening_frequency_Frequently,listening_frequency_Never,listening_frequency_Occasionally,listening_frequency_Only in specific situations,listening_frequency_Rarely,"concert_No, I prefer not to attend concerts","concert_Sometimes, depending on the artist or event","concert_Yes, I love attending concerts"
232,0.06383,0.543478,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
59,0.531915,0.434783,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6,0.212766,0.543478,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
185,0.276596,0.217391,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
173,0.319149,0.543478,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Define model architecture

input_tensor = Input(shape=(X_train.shape[1],))

model = Sequential([
    input_tensor,
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(X_train.shape[1], activation='sigmoid')  # Output layer size matches the input feature size
])

model.summary()

In [6]:
# Compile model
model.compile(optimizer=Adam(learning_rate=0.01), loss='mse')

In [7]:
# Train model
history = model.fit(X_train, X_train, epochs=30, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - loss: 0.1941 - val_loss: 0.0952
Epoch 2/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0956 - val_loss: 0.0910
Epoch 3/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0857 - val_loss: 0.0812
Epoch 4/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.0773 - val_loss: 0.0753
Epoch 5/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.0696 - val_loss: 0.0680
Epoch 6/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 0.0619 - val_loss: 0.0629
Epoch 7/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0561 - val_loss: 0.0570
Epoch 8/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0503 - val_loss: 0.0546
Epoch 9/30
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [8]:
# Evaluate model
loss = model.evaluate(X_test, X_test)
print('Test Loss:', loss)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0369 
Test Loss: 0.03712114319205284


In [10]:
# Save model
model.save('melodate_model.keras')

#Predict with simulate user input

In [11]:
new_user_input = [{
    'age': 21,
    'status': 'single',
    'gender': 'male',
    'drinks': 'yes',
    'height': 172.0,
    'smokes': 'yes',
    'religion': 'roman catholicism',
    'genre': 'edm',
    'music_decade': '2000s',
    'music_vibe': 'Upbeat and Energetic',
    'listening_frequency': 'Only in specific situations',
    'concert': 'No, I prefer not to attend concerts'
}]

new_user_df = pd.DataFrame(new_user_input)

In [12]:
new_user_encoded = encoder.transform(new_user_df[categorical_cols])
new_user_encoded_df = pd.DataFrame(new_user_encoded, columns=encoder.get_feature_names_out(categorical_cols))

In [15]:
new_user_df[['age', 'height']] = scaler.transform(new_user_df[['age', 'height']])

final_new_user_df = pd.concat([new_user_df[['age', 'height']], new_user_encoded_df], axis=1)

In [16]:
# Predict and Sort Top 10 Matches
predictions = model.predict(final_new_user_df)
top_10_indices = predictions.argsort()[0][-10:][::-1]  # Gets indices of top 10 values

# Output top 10 probabilities
top_10_probabilities = predictions[0][top_10_indices]
print(top_10_indices, top_10_probabilities)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step
[ 7  3  4 67 61  9 10  0 65  8] [1.         1.         0.98063123 0.97168356 0.96285254 0.95313233
 0.2775125  0.24776824 0.21278547 0.14437816]


In [18]:
recommended_users = data.iloc[top_10_indices]
print(recommended_users)

      user  age  status  gender drinks  height smokes           religion  \
7    user7   30  single    male    yes   178.0     no  roman catholicism   
3    user3   39  single    male    yes   175.0     no           hinduism   
4    user4   22  single    male    yes   170.0    yes           buddhism   
67  user67   32  single  female    yes   163.0    yes       confucianism   
61  user61   27  single    male    yes   183.0     no       confucianism   
9    user9   51  single  female    yes   180.0     no  roman catholicism   
10  user10   27  single    male    yes   185.0     no           hinduism   
0    user0   31  single  female    yes   168.0     no           hinduism   
65  user65   29  single    male    yes   173.0     no  roman catholicism   
8    user8   34  single    male    yes   183.0     no           hinduism   

                       genre music_decade            music_vibe  \
7                        Pop        2000s    Emotional and Deep   
3                        Pop 