# **Музыкальная рекомендательная система**

МПИ-23-1-2, Антонов И. А., Исаченко М. К., Матяш Д. С., Парчиев Р. Б.

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
 
import warnings
warnings.filterwarnings('ignore')

## Данные

In [9]:
data = pd.read_csv('../data/tcc_ceds_music.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [10]:
data.isnull().sum()

Unnamed: 0                  0
artist_name                 0
track_name                  0
release_date                0
genre                       0
lyrics                      0
len                         0
dating                      0
violence                    0
world/life                  0
night/time                  0
shake the audience          0
family/gospel               0
romantic                    0
communication               0
obscene                     0
music                       0
movement/places             0
light/visual perceptions    0
family/spiritual            0
like/girls                  0
sadness                     0
feelings                    0
danceability                0
loudness                    0
acousticness                0
instrumentalness            0
valence                     0
energy                      0
topic                       0
age                         0
dtype: int64

In [14]:
data.drop(columns=['Unnamed: 0', 'lyrics'], inplace=True)

In [15]:
data['track_name'].nunique(), data.shape

(23689, (28372, 29))

In [16]:
data.drop_duplicates(subset=['track_name'], inplace=True)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23689 entries, 0 to 28371
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   artist_name               23689 non-null  object 
 1   track_name                23689 non-null  object 
 2   release_date              23689 non-null  int64  
 3   genre                     23689 non-null  object 
 4   len                       23689 non-null  int64  
 5   dating                    23689 non-null  float64
 6   violence                  23689 non-null  float64
 7   world/life                23689 non-null  float64
 8   night/time                23689 non-null  float64
 9   shake the audience        23689 non-null  float64
 10  family/gospel             23689 non-null  float64
 11  romantic                  23689 non-null  float64
 12  communication             23689 non-null  float64
 13  obscene                   23689 non-null  float64
 14  music      

In [27]:
song_vectorizer = CountVectorizer()
song_vectorizer.fit(data['genre'])
song_vectorizer.fit(data['topic'])

In [28]:
def get_similarities(song_name, data):
   
  # Getting vector for the input song.
  text_array1 = song_vectorizer.transform(data[data['track_name']==song_name]['genre']).toarray()
  num_array1 = data[data['track_name']==song_name].select_dtypes(include=np.number).to_numpy()
   
  # We will store similarity for each row of the dataset.
  sim = []
  for idx, row in data.iterrows():
    name = row['track_name']
     
    # Getting vector for current song.
    text_array2 = song_vectorizer.transform(data[data['track_name']==name]['genre']).toarray()
    num_array2 = data[data['track_name']==name].select_dtypes(include=np.number).to_numpy()
 
    # Calculating similarities for text as well as numeric features
    text_sim = cosine_similarity(text_array1, text_array2)[0][0]
    num_sim = cosine_similarity(num_array1, num_array2)[0][0]
    sim.append(text_sim + num_sim)
     
  return sim

In [29]:
def recommend_songs(song_name, data=data):
  # Base case
  if data[data['track_name'] == song_name].shape[0] == 0:
    print('This song is either not so popular or you\
    have entered invalid_name.\n Some songs you may like:\n')
     
    for song in data.sample(n=5)['track_name'].values:
      print(song)
    return
   
  data['similarity_factor'] = get_similarities(song_name, data)
 
  data.sort_values(by=['similarity_factor'],
                   ascending = [False],
                   inplace=True)
   
  # First song will be the input song itself as the similarity will be highest.
  display(data[['track_name', 'artist_name']][2:7])

In [31]:
recommend_songs("timber i'm falling")

Unnamed: 0,track_name,artist_name
15301,roadworn and weary,supersuckers
6685,one call away,charlie puth
10242,fourteen minutes old,doug stone
8471,heaven's just a sin away,the kendalls
4480,polar opposites,modest mouse
