In [1]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sklearn.preprocessing import StandardScaler


In [2]:
import pandas as pd

features_df = pd.read_csv("track_full.csv")
meta_df = features_df.drop_duplicates(subset="track_name", keep="first")

meta_df.to_csv("track_full.csv", index=False)

In [3]:
len(meta_df)

660

In [4]:
scaler = StandardScaler()

VECTOR_COLS = [
    "danceability", "energy", "loudness", "speechiness",
    "acousticness", "instrumentalness", "liveness",
    "valence", "tempo"
]
COLLECTION_NAME = "music_recommendation"

In [5]:
audio_df = pd.read_csv("track_full.csv")
audio_df[VECTOR_COLS] = scaler.fit_transform(audio_df[VECTOR_COLS])
audio_df["vector"] = audio_df[VECTOR_COLS].values.tolist()

audio_df.head()

Unnamed: 0,track_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,external_urls_artists,track_album_id,album_name,release_date,image_album,external_urls_albums,duration_ms_y,explicit,genres,vector
0,0W4NhJhcqKCqEP2GIpDCDq,0.192651,-1.390044,0.591375,-0.813137,-0.354967,-0.163472,-0.589958,-1.041481,-0.086112,...,https://open.spotify.com/artist/1vfezMIyCr4XUd...,4zEAsP0RIfbGARdtsgAaUV,The Way It Is,2005-01-01,https://i.scdn.co/image/ab67616d0000b2732cc232...,https://open.spotify.com/album/4zEAsP0RIfbGARd...,255333,False,"hip hop, R&B, pop, r&b","[0.19265127649443695, -1.39004425311897, 0.591..."
1,0qcjuYtMWhBjXg0Xwt5SzS,1.943854,-0.641122,0.99092,-0.634743,-0.38985,-0.162936,0.692828,1.93694,0.072883,...,https://open.spotify.com/artist/7dGJo4pcD2V6oG...,1kTlYbs28MXw7hwO0NLYif,Encore (Deluxe Version),2004-11-12,https://i.scdn.co/image/ab67616d0000b273726d48...,https://open.spotify.com/album/1kTlYbs28MXw7hw...,248680,True,"hip hop, rap","[1.9438542226908004, -0.6411224514385454, 0.99..."
2,2aibwv5hGXSgw7Yru8IYTO,-1.621095,1.112931,0.928095,-0.603718,-0.047528,-0.16324,-0.455301,0.25811,-0.533143,...,https://open.spotify.com/artist/0L8ExT028jH3dd...,7xl50xr9NDkd3i2kBbzsNZ,Stadium Arcadium,2006-05-09,https://i.scdn.co/image/ab67616d0000b27309fd83...,https://open.spotify.com/album/7xl50xr9NDkd3i2...,334666,False,"alternative rock, funk rock, rock","[-1.621094632066082, 1.112931241970871, 0.9280..."
3,3GhuNU58hVUuzKY7LHmFRB,2.145382,-0.194397,-1.182054,0.424954,0.094368,-0.142115,0.125851,1.673322,0.087303,...,https://open.spotify.com/artist/2wIVse2owClT7g...,20t54K6C80QQH7vbcpfJcP,Miss E... So Addictive,2001-05-14,https://i.scdn.co/image/ab67616d0000b2732e3969...,https://open.spotify.com/album/20t54K6C80QQH7v...,289373,True,"hip hop, R&B, pop","[2.1453815458641916, -0.19439716622566067, -1...."
4,3xgT3xIlFGqZjYW9QlhJWp,0.56096,-1.383475,-1.187565,-0.451502,1.927182,-0.163446,0.274682,-0.90736,-0.594569,...,https://open.spotify.com/artist/2wY79sveU1sp5g...,5XCBX16KNYsAe7V5hQV9mC,Love Goes,2020-10-30,https://i.scdn.co/image/ab67616d0000b2730d1f39...,https://open.spotify.com/album/5XCBX16KNYsAe7V...,171029,False,"soft pop, pop","[0.5609598326389105, -1.3834747636305453, -1.1..."


In [6]:
meta_df = pd.read_csv("track_full.csv", on_bad_lines='skip')

meta_df[["track_id", "track_name", "artist_name"]].head()

Unnamed: 0,track_id,track_name,artist_name
0,0W4NhJhcqKCqEP2GIpDCDq,Love,Keyshia Cole
1,0qcjuYtMWhBjXg0Xwt5SzS,Just Lose It,Eminem
2,2aibwv5hGXSgw7Yru8IYTO,Snow (Hey Oh),Red Hot Chili Peppers
3,3GhuNU58hVUuzKY7LHmFRB,4 My People (feat. Eve),Missy Elliott
4,3xgT3xIlFGqZjYW9QlhJWp,Dancing With A Stranger (with Normani),Sam Smith


In [7]:
client = QdrantClient(host="qdrant", port=6333)

def create_collection():
    client.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(
            size=len(VECTOR_COLS),
            distance=Distance.COSINE
        )
    )
    
    points = [
        PointStruct(
            id=int(i),
            vector=row["vector"],
            payload={"track_id": row["track_id"]}
        )
        for i, row in audio_df.iterrows()
    ]
    
    client.upsert(collection_name=COLLECTION_NAME, points=points)

In [8]:
def search_similar_tracks(track_id, k=5):
    vector = audio_df[audio_df["track_id"] == track_id]["vector"].values[0]
    results = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=vector,
        limit=k + 1
    )
    return [
        (r.payload["track_id"], r.score)
        for r in results
        if r.payload["track_id"] != track_id
    ][:k]


In [9]:
def get_track_info(track_id):
    return meta_df[meta_df["track_id"] == track_id].iloc[0].to_dict()
def search_by_name(name):
    return meta_df[meta_df["track_name"].str.contains(name, case=False)]

In [10]:
create_collection()

query = input("Nhập tên bài hát: ")
matches = search_by_name(query)

if matches.empty:
    print("Không tìm thấy bài hát.")
else:
    print("Đang phát bài đầu tiên khớp:")
    main_track = matches.iloc[0]
    print(f"{main_track['track_name']} - {main_track['artist_name']}")

    selected_track_id = main_track["track_id"]
    similar_tracks = search_similar_tracks(selected_track_id)

    print("Gợi ý bài hát tương tự:")
    for track_id, score in similar_tracks:
        info = get_track_info(track_id)
        print(f"- {info['track_name']} – {info['artist_name']} (Score: {score:.4f})")


  client.recreate_collection(


Nhập tên bài hát:  we


Đang phát bài đầu tiên khớp:
We Made You - Eminem
Gợi ý bài hát tương tự:
- Forgot About Dre – Dr. Dre (Score: 0.9360)
- The Monster – Eminem (Score: 0.9050)
- China – Anuel AA (Score: 0.8960)
- My Way – Calvin Harris (Score: 0.8956)
- One Kiss (with Dua Lipa) – Calvin Harris (Score: 0.8913)


  results = client.search(
