In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import joblib

In [2]:
# Load dataset
data = pd.read_json('katalagu-indonesia-2000an.json')

In [3]:
if isinstance(data['lyrics'].iloc[0], str):
    data['cleaned_lyrics'] = data['lyrics'].str.lower()
elif isinstance(data['lyrics'].iloc[0], list):
    data['cleaned_lyrics'] = data['lyrics'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '').str.lower()
elif isinstance(data['lyrics'].iloc[0], dict):
    data['cleaned_lyrics'] = data['lyrics'].apply(lambda x: x.get('text', '') if isinstance(x, dict) else '')


In [4]:
print(data.head())

                                              lyrics cleaned_lyrics
0  {'artist': 'Acha Septriasa', 'lyric': 'Disini ...               
1  {'artist': 'Acha Septriasa', 'lyric': 'Embun D...               
2  {'artist': 'Ada Band', 'lyric': '[Verse 1]
Bag...               
3  {'artist': 'Ada Band', 'lyric': 'Betapa dalam,...               
4  {'artist': 'Ada Band', 'lyric': '[Verse 1]
Lek...               


In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   lyrics          265 non-null    object
 1   cleaned_lyrics  265 non-null    object
dtypes: object(2)
memory usage: 4.3+ KB
None


In [6]:
data['cleaned_lyrics'] = data['cleaned_lyrics'].fillna('')

In [7]:
print(data['lyrics'].head(10))

0    {'artist': 'Acha Septriasa', 'lyric': 'Disini ...
1    {'artist': 'Acha Septriasa', 'lyric': 'Embun D...
2    {'artist': 'Ada Band', 'lyric': '[Verse 1]
Bag...
3    {'artist': 'Ada Band', 'lyric': 'Betapa dalam,...
4    {'artist': 'Ada Band', 'lyric': '[Verse 1]
Lek...
5    {'artist': 'Ada Band', 'lyric': 'Malam kehadir...
6    {'artist': 'Ada Band', 'lyric': 'Ingin kumerai...
7    {'artist': 'Ada Band', 'lyric': 'Manja...kau d...
8    {'artist': 'Ada Band', 'lyric': '[Verse 1]
Dah...
9    {'artist': 'Ada Band', 'lyric': 'Rasa cinta ya...
Name: lyrics, dtype: object


In [8]:
data['cleaned_lyrics'] = data['lyrics'].apply(lambda x: x['lyric'].lower() if isinstance(x, dict) and 'lyric' in x else '')
data['cleaned_lyrics'] = data['cleaned_lyrics'].fillna('')
print(data['cleaned_lyrics'].head(10))

0    disini kau dan aku\nterbiasa bersama\nmenjalan...
1    embun di pagi buta\nmenebarkan bau asa\ndetik ...
2    [verse 1]\nbagaimana mestinya\nmembuatmu jatuh...
3    betapa dalam, rasa kasihku padamu\nyang slalu ...
4    [verse 1]\nlekuk indah hadirkan pesona\nkemuli...
5    malam kehadiran cinta sambut jiwa baru\ntelah ...
6    ingin kumeraih bintang\nkuingin seperti bintan...
7    manja...kau dambaku\nselalu mengusik hatiku\ns...
8    [verse 1]\ndahulu terasa indah\ntak ingin lupa...
9    rasa cinta yang dulu tlah hilang\nkini bersemi...
Name: cleaned_lyrics, dtype: object


In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['cleaned_lyrics'])

In [10]:
def search_songs(query, vectorizer, X, data):
    # Transform the query to the same TF-IDF space
    query_vec = vectorizer.transform([query])
    
    # Calculate cosine similarity between the query and the song lyrics
    cosine_similarities = np.dot(X, query_vec.T).toarray().flatten()
    
    # Get indices of the songs sorted by similarity
    similar_indices = cosine_similarities.argsort()[::-1]
    
    # Get the most similar songs (you can adjust the number)
    top_n = 5  # Number of top results to return
    similar_songs = data.iloc[similar_indices[:top_n]]
    
    return similar_songs

In [11]:
model = NearestNeighbors(n_neighbors=5, metric='cosine').fit(X)

In [12]:
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(model, 'reff_model.pkl')

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.
