In [2]:
import glob
import os
import pickle

import pandas as pd
import torch
from sentence_transformers import InputExample, SentenceTransformer, losses
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Directory (relative to the working directory) that is searched for lyric CSV files.
# Built with os.path.join instead of a backslash literal: "\d" and "\c" are invalid
# escape sequences in a normal string, and a hard-coded "\" separator only resolves
# on Windows.
file_path = os.path.join("backend", "data", "csv")

In [7]:
# Load every CSV found under `file_path`, clean each one, and merge the results into
# a single frame (`lyrics_df`) with one row per distinct lyric text.
all_lyrics = []
min_word = 5  # a lyric needs at least this many whitespace-separated words to be kept

# Sorted so the result is reproducible: glob returns files in arbitrary order, and
# drop_duplicates below keeps the FIRST row seen for each duplicated lyric.
csv_file = sorted(glob.glob(os.path.join(file_path, "*.csv")))
if not csv_file:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No CSV files found in {file_path!r}")

for file in csv_file:
    try:
        df = pd.read_csv(file)
        # Keep only the columns used downstream and drop rows missing any of them.
        df = df[["Artist", "Title", "Album", "Lyric"]].dropna()

        df["Album"] = df["Album"].astype(str).str.strip().str.lower()
        df["Lyric"] = df["Lyric"].astype(str).str.strip()

        # Album is already lower-cased and null-free, so a plain substring test suffices.
        is_unreleased = df["Album"].str.contains("unreleased", regex=False)
        df_cleaned = df[~is_unreleased].copy()
        # Discard lyrics that are too short.
        df_cleaned = df_cleaned[df_cleaned["Lyric"].str.split().str.len() >= min_word]

        all_lyrics.append(df_cleaned)
    except Exception as e:
        # Deliberately best-effort: one unreadable or malformed file (missing column,
        # parse error, ...) is reported and skipped rather than aborting the whole load.
        print(f"Error reading {file}: {e}")

if not all_lyrics:
    raise ValueError(
        f"None of the {len(csv_file)} CSV files in {file_path!r} could be loaded; "
        "see the errors printed above"
    )

# Rebuilt from scratch on every run (no in-place mutation), so re-running is idempotent.
lyrics_df = pd.concat(all_lyrics, ignore_index=True).drop_duplicates(subset=["Lyric"])
print(f"Loaded {len(lyrics_df)} unique lyrics")

Loaded 3608 unique lyrics


In [17]:
# Sanity check: number of rows currently held in lyrics_df.
print(lyrics_df.shape[0])

5640


In [18]:
# Build the training pairs: each sufficiently long lyric is paired once with its
# title and once with its artist, giving up to two InputExample objects per row.
train_examples = []

# Lyrics must have strictly MORE than this many words to produce pairs.
# NOTE(review): the loading cell keeps lyrics with >= 5 words (min_word = 5) while this
# check is > 5, so lyrics of exactly 5 words are loaded but never used here — confirm
# which threshold is intended.
PAIR_MIN_WORDS = 5

# Iterate the three needed columns directly; iterrows() builds a Series per row and is
# much slower for the same result.
for artist, title, lyric in zip(
    lyrics_df["Artist"], lyrics_df["Title"], lyrics_df["Lyric"]
):
    # str() guards against non-string cells (e.g. a purely numeric title that pandas
    # parsed as a number), which would otherwise fail on .strip().
    artists = str(artist).strip().lower()
    lyrics = str(lyric).strip().lower()
    titles = str(title).strip().lower()

    if len(lyrics.split()) <= PAIR_MIN_WORDS:
        continue

    # An empty string carries no signal as the second text of a pair, so skip it.
    if titles:
        train_examples.append(InputExample(texts=[lyrics, titles]))
    if artists:
        train_examples.append(InputExample(texts=[lyrics, artists]))

In [19]:
# Sanity check: how many training pairs were generated.
pair_count = len(train_examples)
print(pair_count)

11196


In [20]:
# Persist the generated pairs to disk so the training step can read them back
# without rebuilding them.
with open("training_data.pkl", "wb") as pkl_out:
    pickle.dump(train_examples, pkl_out)

print(f"Saved {len(train_examples)} training pairs to training_data.pkl")

Saved 11196 training pairs to training_data.pkl


In [21]:
# Fine-tune a pre-trained sentence-embedding model on the saved training pairs and
# write the result to the "lyrics_sbert_model" directory.

# Seed torch's global RNG so the DataLoader shuffle order is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Load training examples
# NOTE: pickle.load can execute arbitrary code — only read a file this notebook wrote.
with open("training_data.pkl", "rb") as f:
    train_examples = pickle.load(f)

if not train_examples:
    # Clearer than the sampler error a shuffled DataLoader raises on an empty dataset.
    raise ValueError("training_data.pkl contains no training pairs; nothing to fine-tune on")

# Load pre-trained model
model = SentenceTransformer("all-MiniLM-L6-v2")

# DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Loss function
# MultipleNegativesRankingLoss uses the other pairs in a batch as negatives.
# NOTE(review): pairs sharing the same artist text can land in one batch and then act
# as false negatives; consider sentence_transformers.datasets.NoDuplicatesDataLoader.
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=100,
    output_path="lyrics_sbert_model"
)

print("\n✅ Model fine-tuned and saved to 'lyrics_sbert_model'")

Iteration: 100%|██████████| 700/700 [01:09<00:00, 10.03it/s]
Iteration: 100%|██████████| 700/700 [01:09<00:00, 10.08it/s]
Epoch: 100%|██████████| 2/2 [02:19<00:00, 69.61s/it]


✅ Model fine-tuned and saved to 'lyrics_sbert_model'



