# Importing the Dependencies

In [None]:
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Loading the Dataset

In [None]:
# music.zip file downloaded from Kaggle.
# Forward slashes work on every OS and avoid the invalid "\d" / "\m" escape
# sequences that the backslash-separated literal contained.
# The archive is extracted next to itself so the CSV ends up in the same
# folder the notebook reads it from.
with zipfile.ZipFile('song_recommendation/data/music.zip', 'r') as zip_ref:
    zip_ref.extractall('song_recommendation/data')

In [None]:
# Load the song dataset. The path uses forward slashes: the previous
# backslash literal relied on invalid escape sequences ("\d", "\s") and only
# resolved on Windows.
df = pd.read_csv('song_recommendation/data/spotify_millsongdata.csv')

# Understanding the Data

In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Summary of columns, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Ten artists with the most rows (songs) in the dataset.
# value_counts() sorts by frequency, so the first ten entries are the top ten.
top_artists = df['artist'].value_counts().iloc[:10]
print("Top 10", top_artists)


In [None]:
# Keep at most 28,000 songs: the full dataset is too large to compute a
# pairwise cosine-similarity matrix for.
# - random_state pins the sample so every run (and every artifact saved from
#   it) is built from the same songs.
# - min() guards against the ValueError sample() raises when asked for more
#   rows than the frame contains.
df = df.sample(n=min(28000, len(df)), random_state=42)

# Drop the 'link' column and renumber rows 0..n-1, so that index labels equal
# row positions after the random sampling.
df = df.drop(columns='link').reset_index(drop=True)

In [None]:
# Check the sampled frame: 'link' should be gone and the index renumbered.
df.head()

In [None]:
# Word cloud over all lyrics: merge every non-missing lyric into one string,
# then let WordCloud compute word frequencies from it.
all_lyrics = " ".join(df['text'].dropna())
wordcloud = WordCloud(
    background_color="white",
    width=800,
    height=400,
)
wordcloud.generate(all_lyrics)

In [None]:
# Render the word cloud on a single 12x6 axes without ticks or frame.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Word Cloud for Song Lyrics", fontsize=16)
plt.show()

# Data Preprocessing


In [None]:
# Fetch the NLTK resources used below: tokenizer models and the stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# English stop words, stored as a set so the per-token membership test in
# preprocess_text is a constant-time lookup.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Normalize raw lyrics into a space-separated string of content words.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lower-cased, tokenized with ``word_tokenize`` and tokens found in
    the module-level ``stop_words`` set are dropped.

    Args:
        text: Raw lyrics. Any value that is not a ``str`` (for example ``None``
            or the float NaN pandas uses for missing cells) is treated as
            empty text.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when nothing
        is left.
    """
    # Missing lyrics arrive as NaN/None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    # Remove special characters and numbers, then convert to lowercase.
    text = re.sub(r"[^a-zA-Z\s]", "", text).lower()

    # Only whitespace left: the result is empty, so skip tokenization.
    if not text.strip():
        return ""

    # Tokenize and remove stopwords.
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_words)

In [None]:
# Clean every lyric element-wise and keep the result in a new column.
df['cleaned_text'] = df['text'].map(preprocess_text)

In [None]:
# Inspect the frame with the new 'cleaned_text' column.
df.head()

In [None]:
# Vectorize the cleaned lyrics with TF-IDF. max_features caps the vocabulary
# at 5000 terms, which bounds the number of columns in the resulting matrix.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
# Learn the vocabulary and produce one TF-IDF row per song in df.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

In [None]:
# `vector` is just another name for tfidf_matrix (no copy is made).
vector = tfidf_matrix
# Number of explicitly stored entries in the TF-IDF matrix.
print(vector.nnz)

In [None]:
# Matrix dimensions: (number of songs, vocabulary size).
print(tfidf_matrix.shape)

In [None]:
# Pairwise cosine similarity between every pair of songs. With a single
# argument the matrix is compared against itself.
# NOTE(review): the result is a dense n_songs x n_songs array, so memory grows
# quadratically with the sample size chosen earlier.
sim_scores = cosine_similarity(tfidf_matrix)

In [None]:
def recommend_song(input_title, sim_scores=sim_scores, df=df, top_k=5):
    """Return the ``top_k`` songs whose lyrics are most similar to a given song.

    Args:
        input_title: Title of the query song; matched case-insensitively
            against ``df['song']``. If several rows share the title, the first
            one is used.
        sim_scores: Square similarity matrix in which row/column ``i``
            corresponds to the ``i``-th row (by position) of ``df``. Defaults
            to the module-level matrix as it existed when this function was
            defined.
        df: DataFrame with at least the ``'song'`` and ``'artist'`` columns.
            Defaults to the module-level frame at definition time.
        top_k: Maximum number of recommendations to return.

    Returns:
        A DataFrame holding the ``'song'`` and ``'artist'`` columns of the
        recommended rows, most similar first, or an explanatory string when
        ``input_title`` is not present in ``df``.
    """
    # Locate the query song by *position*: sim_scores is indexed positionally,
    # so index labels must not be used here.
    title_mask = df['song'].str.lower().eq(input_title.lower())
    matches = np.flatnonzero(title_mask.to_numpy(dtype=bool, na_value=False))

    # Bail out when the title is not in the dataset.
    if matches.size == 0:
        return f"Song '{input_title}' not found in the dataset."
    query_pos = int(matches[0])

    # Similarity of the query song to every song (itself included).
    scores = np.asarray(sim_scores[query_pos]).ravel()

    # Rank by descending similarity. The stable sort keeps ties in row order,
    # the same order sorted(..., reverse=True) would produce.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query song explicitly rather than assuming it ranks first: a
    # song with identical lyrics (or an all-zero vector) can outrank it.
    ranked = ranked[ranked != query_pos][:max(top_k, 0)]

    # Return the recommended songs with their artists.
    return df[['song', 'artist']].iloc[ranked]

In [None]:
# Quick look at the frame the recommender works on.
df.head()

In [None]:
# Example: ask for the ten songs most similar to "Demonizer" and print them
# under a header line.
recommendations = recommend_song("Demonizer", top_k=10)
print("\nRecommendations for the song :")
print(recommendations)

In [None]:
import joblib
from pathlib import Path

# Persist the cleaned dataset and the fitted artifacts for later reuse.
data_dir = Path("song_recommendation/data")
models_dir = Path("song_recommendation/models")

# Neither to_csv nor joblib.dump creates missing folders, so make sure both
# target directories exist before writing.
for directory in (data_dir, models_dir):
    directory.mkdir(parents=True, exist_ok=True)

df.to_csv(data_dir / "cleaned_songs.csv", index=False)
joblib.dump(tfidf_vectorizer, models_dir / "tfidf_vectorizer.pkl")
joblib.dump(tfidf_matrix, models_dir / "tfidf_matrix.pkl")
# NOTE(review): sim_scores is a dense square matrix, so this file can be very
# large — confirm it is really needed on disk.
joblib.dump(sim_scores, models_dir / "cosine_sim.pkl")