In [1]:
#First we need to load our datasets and see how they are look like
import pandas as pd
import numpy as np

#we are using .dat file which is seperated by tab
#so we use '\t' as delimiter
user_data = pd.read_csv('/Users/user/Downloads/hetrec2011-lastfm-2k/user_artists.dat', delimiter='\t')
product_data = pd.read_csv('/Users/user/Downloads/hetrec2011-lastfm-2k/artists.dat', delimiter='\t')

In [2]:
#let's have a look our datasets to understand
#structure of data

user_data.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


So, we have *userID*, *artistID* which identifies artist and *weight* which defines the count of the artist listened by user. Maybe we need something different like *rating* instead of *weight*. We will decide later

In [3]:
#let's get some information about the dataframe
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92834 entries, 0 to 92833
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   userID    92834 non-null  int64
 1   artistID  92834 non-null  int64
 2   weight    92834 non-null  int64
dtypes: int64(3)
memory usage: 2.1 MB


It looks clean, no missing values i think.

In [4]:
#now it's turn to product_data
product_data.head()

Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


So, we have four columns. And I think the most important ones for our task are *id* and *name*.

In [5]:
product_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17632 entries, 0 to 17631
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          17632 non-null  int64 
 1   name        17632 non-null  object
 2   url         17632 non-null  object
 3   pictureURL  17188 non-null  object
dtypes: int64(1), object(3)
memory usage: 551.1+ KB


No null values in important ones.

In [6]:
#So, I think it is time to next steps
#Let's create a function to load data incase we need in the future
def load_data(user_data_file, product_data_file):
    
    #we are using .dat file which is seperated by tab
    #so we use '\t' as delimiter
    user_data = pd.read_csv(user_data_file, delimiter='\t')
    product_data = pd.read_csv(product_data_file, delimiter='\t')
    
    # Here we create 'rating' column which might be more appropriate
    user_data['rating'] = user_data['weight']*5/max(user_data['weight'])
    
    return user_data, product_data

In [8]:
#Let's try to create collaborative filtering and
#test its performance
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from surprise.dataset import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD

#Loading data
user_data, product_data = load_data('/Users/user/Downloads/hetrec2011-lastfm-2k/user_artists.dat', '/Users/user/Downloads/hetrec2011-lastfm-2k/artists.dat')

#Splitting data into train and test sets so that we can accuracy of predictions
train_set, test_set = train_test_split(user_data, test_size=0.2, random_state=42)


# Collaborative filtering module
def collaborative_filtering(train_data):
    reader = Reader(rating_scale=(1, 5)) 
    data = Dataset.load_from_df(train_data[['userID', 'artistID', 'rating']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    
    return algo

#Training the module
collab_model = collaborative_filtering(train_set)

#Testing and showing results
predictions = []
actual_ratings = []
for _, row in test_set.iterrows():
    user_id = row['userID']
    item_id = row['artistID']
    rating = row['rating']
    pred = collab_model.predict(user_id, item_id)
    predictions.append(pred.est)
    actual_ratings.append(rating)
rmse = mean_squared_error(actual_ratings, predictions, squared=False)
mae = mean_absolute_error(actual_ratings, predictions)


print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 0.9906
MAE: 0.9901


In [9]:
from scipy.sparse import csr_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from surprise import SVD, Dataset, Reader
from sklearn.metrics.pairwise import cosine_similarity

# Content-based filtering module with deep learning
def content_based_filtering(product_data):
    # Text preprocessing
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(product_data['name'])
    sequences = tokenizer.texts_to_sequences(product_data['name'])
    max_length = max([len(seq) for seq in sequences])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')
    
    # Deep learning model for product representation
    embedding_dim = 128
    vocab_size = len(tokenizer.word_index) + 1
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(product_data.shape[0], activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    model.fit(padded_sequences, np.eye(product_data.shape[0]), epochs=10, batch_size=32)
    
    # Compute product similarity matrix
    product_representations = model.predict(padded_sequences)
    product_similarity_matrix = cosine_similarity(product_representations)
    
    return product_similarity_matrix

# Hybrid recommendation system
def hybrid_recommendation(user_id, user_data, product_data, collab_model, content_model, alpha=0.5):
    user_interactions = user_data[user_data['userID'] == user_id]
    
    # Collaborative filtering predictions
    collab_predictions = [collab_model.predict(user_id, product_id)[3] for product_id in product_data['id']]
    
    # Content-based filtering predictions
    user_liked_products = user_interactions[user_interactions['rating'] > 3]['artistID']
    user_liked_products_indices = [product_data[product_data['id'] == product_id].index[0] for product_id in user_liked_products]
    content_predictions = content_model[user_liked_products_indices].sum(axis=0) / len(user_liked_products_indices)
    
    # Hybrid recommendations
    hybrid_predictions = alpha * np.array(collab_predictions) + (1 - alpha) * np.array(content_predictions)
    hybrid_recommendations = product_data.iloc[hybrid_predictions.argsort()[::-1]]
    
    return hybrid_recommendations

# Example usage
user_data, product_data = load_data('/Users/user/Downloads/hetrec2011-lastfm-2k/user_artists.dat', '/Users/user/Downloads/hetrec2011-lastfm-2k/artists.dat')
collab_model = collaborative_filtering(user_data)
content_model = content_based_filtering(product_data)

user_id = 123  # Example user ID
hybrid_recommendations = hybrid_recommendation(user_id, user_data, product_data, collab_model, content_model)
print(hybrid_recommendations)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
          id                             name  \
17631  18745                 Grzegorz Tomczak   
5879    6004                       Mr. Scruff   
5873    5998  Madonna feat. Justin Timberlake   
5874    5999                           Zombie   
5875    6000               LanzamientosMp3.es   
...      ...                              ...   
11748  12174                     Leo e Junior   
11747  12173            Gustavo Moura & Rafae   
11746  12172          César Menotti & Fabiano   
11745  12171                  Rodrigo Del Arc   
0          1                     MALICE MIZER   

                                                     url  \
17631          http://www.last.fm/music/Grzegorz+Tomczak   
5879                 http://www.last.fm/music/Mr.+Scruff   
5873   http://www.last.fm/music/Madonna+feat.+Justin+...   
5874                     http://www.last.fm/music/Zombie   


  content_predictions = content_model[user_liked_products_indices].sum(axis=0) / len(user_liked_products_indices)
