In [1]:
%pip install -q pandas scikit-learn numpy Streamlit llama-index langchain langchain-community gigachat llama-index-embeddings-huggingface ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from langchain.schema import HumanMessage, SystemMessage
from langchain.chat_models.gigachat import GigaChat

from dotenv import load_dotenv

import os

In [3]:
# Read environment variables (e.g. API credentials) from the local .env file.
load_dotenv(dotenv_path=".env")

True

In [4]:
# Load the track dataset and preview five randomly chosen rows.
data = pd.read_csv(filepath_or_buffer="data/music_dataset.csv")
data.sample(n=5)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
19030,58952,kenny kirkland,chance,1991,jazz,still lose voyage wait remember sing realize s...,77,0.001074,0.105867,0.114985,...,0.412713,0.001074,0.33716,0.470528,0.834337,0.884615,0.307502,0.271249,sadness,0.414286
24876,74571,the hooters,500 miles,1985,rock,miss train know go hear whistle blow hear whis...,22,0.002632,0.002632,0.002632,...,0.002632,0.095658,0.511535,0.727303,0.5502,5e-05,0.666117,0.732724,music,0.5
24785,74340,the smiths,still ill,1984,rock,decree today life simply take give england owe...,65,0.001032,0.001032,0.456101,...,0.001032,0.001032,0.442218,0.724816,0.000148,0.079049,0.925804,0.930929,world/life,0.514286
6256,18048,major lazer,watch out for this (bumaye),2013,pop,pull pull sound bounce place sexy gyal gyal du...,166,0.000572,0.313498,0.000572,...,0.000572,0.091227,0.82021,0.808707,0.019678,7e-06,0.882523,0.759752,violence,0.1
12608,40047,aretha franklin,he will wash you white as snow,1956,blues,accord unfailing accord great compassion blot ...,62,0.001949,0.403509,0.001949,...,0.362119,0.001949,0.484458,0.766146,0.98996,6.4e-05,0.602226,0.531517,violence,0.914286


In [5]:
# Columns used as model features, split into audio descriptors and
# lyric-topic scores.
audio_features = [
    'danceability',
    'loudness',
    'acousticness',
    'instrumentalness',
    'valence',
    'energy',
]
lyric_features = [
    'dating',
    'violence',
    'world/life',
    'night/time',
    'shake the audience',
    'family/gospel',
    'romantic',
    'communication',
    'obscene',
    'music',
    'movement/places',
    'light/visual perceptions',
    'family/spiritual',
    'like/girls',
    'sadness',
    'feelings',
]

In [6]:
# Standardize the numeric features, one feature group at a time.
scaler = StandardScaler()
scaled_audio_features = scaler.fit_transform(X=data[audio_features])
# NOTE(review): the same scaler instance is re-fit on the lyric columns here,
# so after this cell it no longer holds the audio-feature statistics.
scaled_lyric_features = scaler.fit_transform(X=data[lyric_features])

In [7]:
# Combine both scaled feature groups into one frame (audio columns first).
audio_frame = pd.DataFrame(scaled_audio_features, columns=audio_features)
lyric_frame = pd.DataFrame(scaled_lyric_features, columns=lyric_features)
features = audio_frame.join(lyric_frame)

In [8]:
# Reduce dimensionality to make the neighbour search more efficient.
n_pca_components = 20
pca = PCA(n_components=n_pca_components)
reduced_features = pca.fit_transform(X=features)

# Создание модели для поиска похожих треков

In [9]:
# Nearest-neighbour index over the reduced feature vectors; each query
# returns 25 neighbours by default.
model = NearestNeighbors(algorithm='kd_tree', n_neighbors=25)
model.fit(X=reduced_features)

In [10]:
def recommend_tracks(track_name):
    """Return metadata of the tracks closest to ``track_name`` in feature space.

    The first row of ``data`` whose ``track_name`` equals the argument is used
    as the query point for the fitted nearest-neighbour ``model``.

    Args:
        track_name: Exact track title to look up in ``data['track_name']``.

    Returns:
        A list of dicts, one per neighbour in the order returned by
        ``model.kneighbors``, with the keys ``artist_name``, ``track_name``,
        ``genre`` and ``release_date``.

    Raises:
        IndexError: If no row of ``data`` has the requested title.
    """
    # Work with row *positions*, not index labels: ``reduced_features`` is a
    # plain array in the row order of ``data``, so labels would only be valid
    # while ``data`` keeps a default 0..n-1 index.
    matches = (data['track_name'] == track_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"track {track_name!r} not found in the dataset")
    track_pos = matches[0]

    _, indices = model.kneighbors([reduced_features[track_pos]])
    recommendations = data.iloc[indices[0]]
    return recommendations[['artist_name', 'track_name', 'genre', 'release_date']].to_dict(orient='records')

In [11]:
# Example query: neighbours of the first track titled "love".
recommend_tracks(track_name="love")

[{'artist_name': 'margo guryan',
  'track_name': 'love',
  'genre': 'pop',
  'release_date': 1968},
 {'artist_name': 'new order',
  'track_name': 'chosen time',
  'genre': 'pop',
  'release_date': 1981},
 {'artist_name': 'built to spill',
  'track_name': 'the plan',
  'genre': 'pop',
  'release_date': 1999},
 {'artist_name': 'rush',
  'track_name': 'anthem',
  'genre': 'rock',
  'release_date': 1975},
 {'artist_name': 'the smashing pumpkins',
  'track_name': 'bullet with butterfly wings',
  'genre': 'pop',
  'release_date': 1995},
 {'artist_name': 'acid king',
  'track_name': 'electric machine',
  'genre': 'rock',
  'release_date': 1999},
 {'artist_name': 'the connells',
  'track_name': 'stone cold yesterday',
  'genre': 'pop',
  'release_date': 1990},
 {'artist_name': 'natalie merchant',
  'track_name': 'kind & generous',
  'genre': 'pop',
  'release_date': 1998},
 {'artist_name': 'the smashing pumpkins',
  'track_name': 'muzzle',
  'genre': 'rock',
  'release_date': 1995},
 {'artist_

# Рекомендации с Retrieval-Augmented Generation (RAG)

In [12]:
# Load the contents of the data directory as documents for the RAG index.
directory_reader = SimpleDirectoryReader(input_dir="data")
documents = directory_reader.load_data()

In [13]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Register a HuggingFace embedding model as the llama-index global embedder.
embedding_model_name = "BAAI/bge-small-en-v1.5"
Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)

In [14]:
class RAGRecomendator:
    """Retrieval-augmented track recommender backed by a chat LLM.

    Documents are indexed once at construction time. Each query retrieves the
    ``top_k`` most similar chunks, formats them into a prompt and sends it to
    the chat model together with the conversation so far.
    """

    def __init__(self, documents, llm, top_k=5):
        """Build the vector index and start the conversation history.

        Args:
            documents: Documents passed to ``VectorStoreIndex.from_documents``.
            llm: Chat model. Either an object exposing ``invoke(messages)`` or
                a plain callable taking the message list; the reply must
                expose a ``content`` attribute.
            top_k: Number of chunks retrieved per query.
        """
        index = VectorStoreIndex.from_documents(
            documents, embed_model=Settings.embed_model)
        self.retriever = index.as_retriever(
            similarity_top_k=top_k, verbose=True)
        self.prompt_template = "Порекоммендуй похожие треки для трека {track_name} на основе контекста: {context}"
        # Running chat history; it starts with the system instruction and
        # grows by one request/reply pair per successful query.
        self.messages = [
            SystemMessage(
                content="Ты эксперт по музыке, помогаешь подбирать музыку по контексту."
            )
        ]
        self.llm = llm

    @staticmethod
    def _format_context(retrieved):
        """Render retrieved nodes as plain text, one chunk per paragraph."""
        chunks = []
        for item in retrieved:
            # Retrieved nodes expose their text via ``get_content()``; fall
            # back to ``str`` for anything that does not.
            get_content = getattr(item, "get_content", None)
            chunks.append(get_content() if callable(get_content) else str(item))
        return "\n\n".join(chunks)

    def get_rag_recommendations(self, track_name):
        """Ask the LLM for tracks similar to ``track_name``.

        Args:
            track_name: Free-text track description used both as the
                retrieval query and inside the prompt.

        Returns:
            The ``content`` of the model's reply.

        Raises:
            Whatever the retriever or the LLM raises; in that case the
            conversation history is left unchanged.
        """
        retrieved = self.retriever.retrieve(track_name)
        # Put the chunk text into the prompt, not the repr of the node list.
        prompt = self.prompt_template.format(
            track_name=track_name, context=self._format_context(retrieved))
        request = HumanMessage(content=prompt)

        # Prefer ``invoke`` when the model provides it; keep supporting plain
        # callables for backward compatibility.
        invoke = getattr(self.llm, "invoke", None)
        send = invoke if callable(invoke) else self.llm
        res = send(self.messages + [request])

        # Commit to the history only after a successful call, so a failed
        # request does not leave a dangling human message behind.
        # TODO: maybe track_names and llm recommendations should be in separate lists?
        self.messages.extend([request, res])
        return res.content

In [None]:
# The GigaChat credentials are read from the SB_AUTH_DATA environment variable.
giga_key = os.getenv("SB_AUTH_DATA")
giga = GigaChat(
    model="GigaChat",
    credentials=giga_key,
    timeout=30,
    # NOTE(review): TLS certificate verification is disabled here — confirm
    # this is intentional before using the notebook outside experiments.
    verify_ssl_certs=False,
)
# Building the recommender took over 20 minutes on the author's device;
# trying a different embedder may be worthwhile.
recommendator = RAGRecomendator(documents=documents, llm=giga)

In [20]:
# Example RAG query; print the reply so its line breaks render as plain text.
rag_answer = recommendator.get_rag_recommendations("Du hast — Rammstein")
print(rag_answer)

Похожие треки для Du hast — Rammstein:

1. "Bent" — Rammstein
2. "Mein Herz Brennt" — Rammstein
3. "Engel" — Rammstein
4. "Ohne Dich" — Rammstein
5. "Seemann" — Rammstein

Эти треки имеют схожую атмосферу и стиль исполнения, что может сделать их подходящими для вашего контекста.
