# **Music Recommendation System**

# **Mounting Google Drive**

In [6]:
from google.colab import drive
drive.mount ('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Dataset**

In [7]:
TRAIN_PATH = "/content/drive/MyDrive/SpotifyDataset/TrainData"
TEST_PATH = "/content/drive/MyDrive/SpotifyDataset/TestData"

# **Importing Libraries** **bold text**

In [8]:
import numpy as np
#Pandas is one of the tools in Machine Learning which is used for data cleaning and analysis. It has features which are used for exploring, cleaning, transforming and visualizing from data.
import pandas as pd
#creating a NumPy array and calculating its mean using the “np” shorthand for concise code.
import numpy as np
#Seaborn is a library that uses Matplotlib underneath to plot graphs. It will be used to visualize random distributions.
import seaborn as sns
#*os* and *os.path* modules include many functions to interact with the file system.
import os

# **Data Preprocessing**

In [9]:
# make sure the file is already there
print(os.listdir(TRAIN_PATH))

['data.csv', 'data_by_artist.csv', 'data_by_year.csv', 'data_w_genres.csv', 'data_by_genres.csv', 'ex.csv']


In [10]:
#To access data from the CSV file, we require a function read_csv() from Pandas that retrieves data in the form of the data frame.
df =pd.read_csv("/content/drive/MyDrive/SpotifyDataset/TrainData/data.csv")
#(show the dataset)df is a variable that holds the reference file to the pandas DataFrame
df

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.98200,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878000,10,0.6650,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.9630,1921,0.73200,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.000000,7,0.1600,-12.441,1,Clancy Lowered the Boom,5,1921,0.4150,60.936
2,0.0394,1921,0.96100,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913000,3,0.1010,-14.850,1,Gati Bali,5,1921,0.0339,110.339
3,0.1650,1921,0.96700,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,0.000028,5,0.3810,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.2530,1921,0.95700,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,0.000002,3,0.2290,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.0380,101.665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170648,0.6080,2020,0.08460,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029
170649,0.7340,2020,0.20600,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.000000,7,0.1010,-6.020,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936
170650,0.6370,2020,0.10100,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,0.000009,4,0.2580,-2.226,0,AYA,76,2020-11-03,0.0809,91.688
170651,0.1950,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,0.000008,2,0.6430,-7.161,1,Darkness,70,2020-01-17,0.3080,75.055


#**Data Checking**

In [11]:
#Check the empty values
df.isnull()
df.isnull().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

In [12]:
#boolen output
df.isnull().any()

valence             False
year                False
acousticness        False
artists             False
danceability        False
duration_ms         False
energy              False
explicit            False
id                  False
instrumentalness    False
key                 False
liveness            False
loudness            False
mode                False
name                False
popularity          False
release_date        False
speechiness         False
tempo               False
dtype: bool

# **Data Handling**

In [13]:
missing_value =["N/A","na",np.nan]
df = pd.read_csv("/content/drive/MyDrive/SpotifyDataset/TrainData/data.csv", na_values=missing_value)
df.isnull().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

#**Data Visualization**

In [None]:
sns.heatmap(df.isnull(),yticklabels=False, annot=True)

# **Data Filtering**

In [14]:
#The dropna() method removes the rows that contains NULL values.
#The dropna() method returns a new DataFrame object unless the inplace parameter is set to True, in that case the dropna() method does the removing in the original DataFrame instead
df.dropna(inplace=True)

In [15]:
df.duplicated().sum()
#if there is a duplication values
#and want to remove
df=df.drop_duplicates()

# **Data Analysis**

In [16]:
# display the first 4 rows
df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [17]:
#Shape of the current data frame shape (row,column)
df.shape

(170653, 19)

In [18]:
#read and display the user-rating
df=pd.read_csv(r"/content/drive/MyDrive/SpotifyDataset/TrainData/ex.csv")
df['User-Rating']

0       8.8/10
1       9.0/10
2       9.7/10
3       9.1/10
4       9.2/10
         ...  
2415    6.2/10
2416    7.2/10
2417    7.5/10
2418    6.5/10
2419    6.6/10
Name: User-Rating, Length: 2420, dtype: object

# **Deployement of Model (Model Building)**

In [19]:
# Initialize an empty list
# Iterating over a column User-Rating in a DataFrame
# For each element i in the 'User-Rating' column, you are appending a substring i[:3] to user_list
user_list=[]
for i in df['User-Rating']:
  user_list.append(i[:3])
user_list

['8.8',
 '9.0',
 '9.7',
 '9.1',
 '9.2',
 '9.1',
 '9.2',
 '9.5',
 '9.2',
 '9.6',
 '8.8',
 '8.4',
 '9.1',
 '9.2',
 '9.0',
 '9.7',
 '9.1',
 '9.0',
 '9.0',
 '9.1',
 '9.2',
 '9.2',
 '9.5',
 '8.6',
 '9.1',
 '9.0',
 '9.0',
 '9.2',
 '9.5',
 '9.4',
 '8.7',
 '9.3',
 '9.3',
 '9.5',
 '9.1',
 '9.0',
 '9.0',
 '9.1',
 '8.4',
 '8.4',
 '8.8',
 '9.2',
 '9.2',
 '9.6',
 '9.6',
 '9.0',
 '9.1',
 '8.5',
 '9.5',
 '8.5',
 '9.7',
 '8.9',
 '9.6',
 '8.5',
 '9.1',
 '8.7',
 '8.6',
 '9.3',
 '9.7',
 '8.9',
 '9.0',
 '9.0',
 '9.3',
 '9.3',
 '9.1',
 '9.0',
 '9.7',
 '9.2',
 '9.1',
 '9.0',
 '8.9',
 '9.5',
 '9.5',
 '9.7',
 '9.0',
 '8.1',
 '8.1',
 '9.5',
 '9.1',
 '8.6',
 '9.4',
 '9.0',
 '9.0',
 '9.6',
 '9.3',
 '9.0',
 '9.2',
 '9.3',
 '9.1',
 '8.6',
 '9.5',
 '9.5',
 '9.0',
 '9.4',
 '9.2',
 '9.6',
 '9.3',
 '7.7',
 '9.2',
 '9.3',
 '7.7',
 '9.0',
 '9.5',
 '9.4',
 '9.0',
 '9.2',
 '9.0',
 '9.4',
 '9.4',
 '9.1',
 '9.3',
 '9.2',
 '9.1',
 '9.0',
 '9.8',
 '8.9',
 '6.8',
 '8.9',
 '8.2',
 '8.7',
 '8.2',
 '9.4',
 '9.0',
 '8.9',
 '9.9',


In [20]:
df['User-Rating']=user_list
df

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
0,Aankh Marey,"Kumar Sanu, Mika Singh, Neha Kakkar",BollywoodDance,Simmba,8.8
1,Coca Cola,"Neha Kakkar, Tony Kakkar",BollywoodDanceRomantic,Luka Chuppi,9.0
2,Apna Time Aayega,Ranveer Singh,BollywoodDance,Gully Boy,9.7
3,Mungda,"Jyotica Tangri, Shaan, Subhro Ganguly",BollywoodDance,Total Dhamaal,9.1
4,Tere Bin,"Asees Kaur, Rahat Fateh Ali Khan, Tanishk Bagchi",BollywoodRomantic,Simmba,9.2
...,...,...,...,...,...
2415,Jana Tumhare Pyar Mein,Mukesh,BollywoodDance,Sasural,6.2
2416,Tum Jaise Bigde Babu Se,Lata Mangeshkar,BollywoodDance,Jab Pyar Kisi Se Hota Hai,7.2
2417,O Yaad Nahi Bhool Gaya,"Lata Mangeshkar, Suresh Wadkar",BollywoodDance,Lamhe,7.5
2418,Ladi Re Ladi Tujhse Aankh Jo Ladi,Jagjit Kaur,BollywoodDance,Shola Aur Shabnam,6.5


In [21]:
#presenting the data in good way
df['Album/Movie'] = df['Album/Movie'].str.replace(' ','')
df['Singer/Artists'] = df['Singer/Artists'].str.replace(' ','')
df

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating
0,Aankh Marey,"KumarSanu,MikaSingh,NehaKakkar",BollywoodDance,Simmba,8.8
1,Coca Cola,"NehaKakkar,TonyKakkar",BollywoodDanceRomantic,LukaChuppi,9.0
2,Apna Time Aayega,RanveerSingh,BollywoodDance,GullyBoy,9.7
3,Mungda,"JyoticaTangri,Shaan,SubhroGanguly",BollywoodDance,TotalDhamaal,9.1
4,Tere Bin,"AseesKaur,RahatFatehAliKhan,TanishkBagchi",BollywoodRomantic,Simmba,9.2
...,...,...,...,...,...
2415,Jana Tumhare Pyar Mein,Mukesh,BollywoodDance,Sasural,6.2
2416,Tum Jaise Bigde Babu Se,LataMangeshkar,BollywoodDance,JabPyarKisiSeHotaHai,7.2
2417,O Yaad Nahi Bhool Gaya,"LataMangeshkar,SureshWadkar",BollywoodDance,Lamhe,7.5
2418,Ladi Re Ladi Tujhse Aankh Jo Ladi,JagjitKaur,BollywoodDance,SholaAurShabnam,6.5


In [22]:
#put all rows in one variable as i can loop over it
df['tags']=df['Singer/Artists']+' '+df['Genre']+' '+df['Album/Movie']+' '+df['User-Rating']
df['tags'][0]

'KumarSanu,MikaSingh,NehaKakkar BollywoodDance Simmba 8.8'

# **Data Preparation**

In [23]:
new_df=df[['Song-Name','tags']]
new_df

Unnamed: 0,Song-Name,tags
0,Aankh Marey,"KumarSanu,MikaSingh,NehaKakkar BollywoodDance ..."
1,Coca Cola,"NehaKakkar,TonyKakkar BollywoodDanceRomantic L..."
2,Apna Time Aayega,RanveerSingh BollywoodDance GullyBoy 9.7
3,Mungda,"JyoticaTangri,Shaan,SubhroGanguly BollywoodDan..."
4,Tere Bin,"AseesKaur,RahatFatehAliKhan,TanishkBagchi Boll..."
...,...,...
2415,Jana Tumhare Pyar Mein,Mukesh BollywoodDance Sasural 6.2
2416,Tum Jaise Bigde Babu Se,LataMangeshkar BollywoodDance JabPyarKisiSeHot...
2417,O Yaad Nahi Bhool Gaya,"LataMangeshkar,SureshWadkar BollywoodDance Lam..."
2418,Ladi Re Ladi Tujhse Aankh Jo Ladi,JagjitKaur BollywoodDance SholaAurShabnam 6.5


In [24]:
new_df['tags'] = new_df['tags'].astype(str)
new_df['tags']= new_df['tags'].apply(lambda x:x.lower())
new_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(lambda x:x.lower())


Unnamed: 0,Song-Name,tags
0,Aankh Marey,"kumarsanu,mikasingh,nehakakkar bollywooddance ..."
1,Coca Cola,"nehakakkar,tonykakkar bollywooddanceromantic l..."
2,Apna Time Aayega,ranveersingh bollywooddance gullyboy 9.7
3,Mungda,"jyoticatangri,shaan,subhroganguly bollywooddan..."
4,Tere Bin,"aseeskaur,rahatfatehalikhan,tanishkbagchi boll..."
...,...,...
2415,Jana Tumhare Pyar Mein,mukesh bollywooddance sasural 6.2
2416,Tum Jaise Bigde Babu Se,latamangeshkar bollywooddance jabpyarkisisehot...
2417,O Yaad Nahi Bhool Gaya,"latamangeshkar,sureshwadkar bollywooddance lam..."
2418,Ladi Re Ladi Tujhse Aankh Jo Ladi,jagjitkaur bollywooddance sholaaurshabnam 6.5


# **Importing the CountVectorizer**

In [25]:
#Utilities to extract features from text data.
#Converts text data into a "bag of words" representation. In this representation, the text is transformed into a matrix where each row corresponds to a document and each column corresponds to a word (or token). The value in each cell represents the count of the word in the corresponding document.
#plotting confusion metrics
from sklearn.feature_extraction.text import CountVectorizer
#Create an instance of CountVectorizer with max_features=2000
#Reducing the dimensionality of the feature space
cv=CountVectorizer(max_features=2000)

# **Fitting and Transforming the Data**

In [26]:
#Learns the vocabulary from the text data in the tags column and then transforms the text data into a document-term matrix a sparse matrix where each row corresponds to a document and each column corresponds to a term from the vocabulary, with the values representing term frequencies
vectors = cv.fit_transform(new_df['tags']).toarray()

In [27]:
#There are 2420 documents and 1698 unique terms (features) in the resulting matrix
vectors.shape

(2420, 1698)

In [28]:
!pip install --upgrade scikit-learn
feature_names = cv.get_feature_names_out()



In [29]:
# Ensure that the cell defining `similarity` is executed first
!pip install scikit-learn==1.1.3
!pip install --upgrade scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
# Learns the vocabulary from the text data in the tags column and then transforms the text data into a document-term matrix a sparse matrix where each row corresponds to a document and each column corresponds to a term from the vocabulary, with the values representing term frequencies
vectors = cv.fit_transform(new_df['tags']).toarray()
print(vectors)
similarity = cosine_similarity(vectors)

Collecting scikit-learn==1.1.3
  Using cached scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.0
    Uninstalling scikit-learn-1.5.0:
      Successfully uninstalled scikit-learn-1.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.6.0 requires scikit-learn>=1.2.2, but you have scikit-learn 1.1.3 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.1.3


Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.1.3
    Uninstalling scikit-learn-1.1.3:
      Successfully uninstalled scikit-learn-1.1.3
Successfully installed scikit-learn-1.5.0
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [30]:
similarity = cosine_similarity(vectors)
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])

[(0, 0.9999999999999999),
 (5, 0.5163977794943223),
 (34, 0.5163977794943223),
 (184, 0.5163977794943223),
 (271, 0.5163977794943223),
 (375, 0.5163977794943223),
 (469, 0.5163977794943223),
 (544, 0.5163977794943223),
 (591, 0.5163977794943223),
 (610, 0.5163977794943223),
 (825, 0.5163977794943223),
 (925, 0.5163977794943223),
 (1406, 0.5163977794943223),
 (1620, 0.5163977794943223),
 (1955, 0.5163977794943223),
 (2173, 0.5163977794943223),
 (2300, 0.5163977794943223),
 (2387, 0.5163977794943223),
 (11, 0.4472135954999579),
 (35, 0.4472135954999579),
 (40, 0.4472135954999579),
 (88, 0.4472135954999579),
 (115, 0.4472135954999579),
 (117, 0.4472135954999579),
 (136, 0.4472135954999579),
 (142, 0.4472135954999579),
 (242, 0.4472135954999579),
 (243, 0.4472135954999579),
 (264, 0.4472135954999579),
 (304, 0.4472135954999579),
 (311, 0.4472135954999579),
 (345, 0.4472135954999579),
 (366, 0.4472135954999579),
 (379, 0.4472135954999579),
 (390, 0.4472135954999579),
 (410, 0.44721359549995

In [31]:
new_df.rename(columns={'Song-Name':'title'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'Song-Name':'title'},inplace=True)


In [32]:
def recommend(music):
    music_index=new_df[new_df['title']==music].index[0]
    distances=similarity[music_index]
    music_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in music_list:
        print(new_df.iloc[i[0]].title)

In [34]:
recommend('Proper Patola')

Abhi Toh Party Shuru Hui Hai
Bhare Bazaar
Tareefan Remix
Pant Mein Gun
Pee Paa Pee Paa


In [35]:
df.head(50)

Unnamed: 0,Song-Name,Singer/Artists,Genre,Album/Movie,User-Rating,tags
0,Aankh Marey,"KumarSanu,MikaSingh,NehaKakkar",BollywoodDance,Simmba,8.8,"KumarSanu,MikaSingh,NehaKakkar BollywoodDance ..."
1,Coca Cola,"NehaKakkar,TonyKakkar",BollywoodDanceRomantic,LukaChuppi,9.0,"NehaKakkar,TonyKakkar BollywoodDanceRomantic L..."
2,Apna Time Aayega,RanveerSingh,BollywoodDance,GullyBoy,9.7,RanveerSingh BollywoodDance GullyBoy 9.7
3,Mungda,"JyoticaTangri,Shaan,SubhroGanguly",BollywoodDance,TotalDhamaal,9.1,"JyoticaTangri,Shaan,SubhroGanguly BollywoodDan..."
4,Tere Bin,"AseesKaur,RahatFatehAliKhan,TanishkBagchi",BollywoodRomantic,Simmba,9.2,"AseesKaur,RahatFatehAliKhan,TanishkBagchi Boll..."
5,Gali Gali,NehaKakkar,BollywoodDance,KGF,9.1,NehaKakkar BollywoodDance KGF 9.1
6,Chamma Chamma,"Arun,Ikka,NehaKakkar,Romy",BollywoodDance,FraudSaiyaan,9.2,"Arun,Ikka,NehaKakkar,Romy BollywoodDance Fraud..."
7,Mere Gully Mein,RanveerSingh,BollywoodDance,GullyBoy,9.5,RanveerSingh BollywoodDance GullyBoy 9.5
8,Kamariya,DarshanRaval,BollywoodDance,Mitron,9.2,DarshanRaval BollywoodDance Mitron 9.2
9,Ek Ladki Ko Dekha Toh Aisa Laga,"DarshanRaval,RochakKohli",BollywoodRomantic,EkLadkiKoDekhaTohAisaLaga,9.6,"DarshanRaval,RochakKohli BollywoodRomantic EkL..."


In [39]:
import pickle
pickle.dump(new_df,open('musicrec.pkl','wb'))

In [38]:
pickle.dump(similarity,open('similarities.pkl','wb'))