# TF-IDF with ngram

## Simple

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import math
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

# Fetch the NLTK tokenizer data packages (presumably required by word_tokenize below).
nltk.download('punkt')
nltk.download('punkt_tab')


# Song table; later cells read its 'no', 'artist', 'album', 'year', 'title' and 'lyrics' columns.
# NOTE(review): absolute Colab paths -- the notebook only runs where these files exist.
lyrics_df = pd.read_excel('/content/databersih (2).xlsx')
# Ground truth for the "simple" queries; later cells read 'query', 'song_id' and 'total'.
ground_truth_df = pd.read_excel('/content/GT UAS NLP_simple.xlsx')

# Shared stemmer and the list of tokens dropped (before stemming) by clean_lyrics
# and preprocess_query.
stemmer = PorterStemmer()
remove_words = ["song", "with", "lyrics", "from", "the", "album", "released", "in", "before", "after", "since", "s"]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def clean_lyrics(text):
    """Normalise one lyrics string into a space-joined sequence of stems.

    Strips ``[...]`` and ``(...)`` spans, punctuation and digits, lower-cases the
    text, tokenises it, drops tokens listed in ``remove_words`` and stems the rest.
    Non-string input (for example a missing cell) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    stripped = text
    # Order matters: bracketed spans must go before punctuation is removed.
    for pattern in (r'\[.*?\]', r'\(.*?\)', r'[^\w\s]', r'\d+'):
        stripped = re.sub(pattern, '', stripped)

    kept_stems = []
    for token in word_tokenize(stripped.lower()):
        if token in remove_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

# Add the cleaned/stemmed lyrics as a new column; this is the text that TF-IDF is fitted on.
lyrics_df['Processed_Lyrics'] = lyrics_df['lyrics'].apply(clean_lyrics)

# Preview the first rows to sanity-check the cleaning.
print("\nCleaned Lyrics Sample:")
print(lyrics_df[['title', 'Processed_Lyrics']].head())

def parse_song_ids(song_id_entry):
    """Parse one ground-truth ``song_id`` cell into a list of integer song IDs.

    Accepted inputs:
    - missing values (``None`` / NaN) -> ``[]``
    - a single integer (Python or NumPy) -> ``[id]``
    - a single integral float such as ``129.0`` -> ``[129]``; a column that mixes
      single IDs with empty cells is loaded as float, and ``str(129.0)`` would
      otherwise fail the digit check and silently drop the query
    - a comma-separated string such as ``"129, 1433"`` -> ``[129, 1433]``;
      pieces that are not plain digits are skipped

    Returns:
        list[int]: the parsed IDs, in the order they appear.
    """
    if pd.isnull(song_id_entry):
        return []
    if isinstance(song_id_entry, (int, np.integer)):
        return [int(song_id_entry)]
    if isinstance(song_id_entry, (float, np.floating)):
        as_float = float(song_id_entry)
        # Non-integral floats (1.5, inf) cannot be a song ID.
        return [int(as_float)] if as_float.is_integer() else []

    song_ids = []
    for piece in str(song_id_entry).split(','):
        piece = piece.strip()
        if piece.isdigit():
            song_ids.append(int(piece))
    return song_ids

# Turn the raw 'song_id' cells into lists of ints.
ground_truth_df['Relevant_Song_IDs'] = ground_truth_df['song_id'].apply(parse_song_ids)
# Drop queries that ended up with no relevant IDs and renumber the remaining rows.
ground_truth_df = ground_truth_df[ground_truth_df['Relevant_Song_IDs'].map(len) > 0].reset_index(drop=True)
print("\nParsed Ground Truth:")
print(ground_truth_df[['query', 'Relevant_Song_IDs', 'total']].head())

def preprocess_query(query):
    """Normalise a query string the same way lyrics tokens are normalised.

    Removes punctuation and digits, lower-cases, tokenises, drops tokens listed in
    ``remove_words`` and stems what is left. Non-string input is returned unchanged.
    """
    if not isinstance(query, str):
        return query

    cleaned = re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', query)).lower()
    stems = []
    for token in word_tokenize(cleaned):
        if token in remove_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Stemmed query text; this is what gets vectorised against 'Processed_Lyrics'.
ground_truth_df['Processed_Query'] = ground_truth_df['query'].apply(preprocess_query)


Cleaned Lyrics Sample:
                   title                                   Processed_Lyrics
0      Chasing Pavements  ive made up my mind dont need to think it over...
1          Cold Shoulder  you say it all my head and thing i think just ...
2         Hometown Glory  ive been walk same way as i did miss out crack...
3  Make You Feel My Love  when rain is blow your face and whole world is...
4                My Same  aye aye ayeay aye aye ayeay aye aye ayeay aye ...

Parsed Ground Truth:
           query                                  Relevant_Song_IDs  total
0    love you in          [129, 1433, 1977, 2978, 3214, 3320, 4514]      7
1   home tonight  [561, 1019, 1455, 1671, 1686, 2428, 2434, 3126...     12
2  something new  [20, 484, 671, 783, 999, 1463, 2556, 2607, 264...     18
3     i held you                             [44, 1520, 4082, 4503]      4
4  stars tonight                                   [95, 2142, 3177]      3


In [None]:
# Display the parsed ground truth, including 'Relevant_Song_IDs' and 'Processed_Query'.
ground_truth_df

Unnamed: 0,query,song_id,total,Relevant_Song_IDs,Processed_Query
0,love you in,"129, 1433, 1977, 2978, 3214, 3320, 4514",7,"[129, 1433, 1977, 2978, 3214, 3320, 4514]",love you
1,home tonight,"561, 1019, 1455, 1671, 1686, 2428, 2434, 3126,...",12,"[561, 1019, 1455, 1671, 1686, 2428, 2434, 3126...",home tonight
2,something new,"20, 484, 671, 783, 999, 1463, 2556, 2607, 2642...",18,"[20, 484, 671, 783, 999, 1463, 2556, 2607, 264...",someth new
3,i held you,"44, 1520, 4082, 4503",4,"[44, 1520, 4082, 4503]",i held you
4,stars tonight,"95, 2142, 3177",3,"[95, 2142, 3177]",star tonight
5,know about me,"51, 1261, 1293, 2721, 2729, 3426, 3858, 4865",8,"[51, 1261, 1293, 2721, 2729, 3426, 3858, 4865]",know about me
6,stick together,"227, 2325",2,"[227, 2325]",stick togeth
7,easy come,"239, 530, 2137, 3407, 4471",5,"[239, 530, 2137, 3407, 4471]",easi come
8,haunt me,"257, 723, 966, 1290, 1498, 2245, 2618, 3433",8,"[257, 723, 966, 1290, 1498, 2245, 2618, 3433]",haunt me
9,i want your love,"542, 1036, 1992, 3811",4,"[542, 1036, 1992, 3811]",i want your love


In [None]:
# Display the song table with the added 'Processed_Lyrics' column.
lyrics_df

Unnamed: 0,no,artist,album,year,title,lyrics,Processed_Lyrics
0,1,Adele,19,2008,Chasing Pavements,I've made up my mind\nDon't need to think it o...,ive made up my mind dont need to think it over...
1,2,Adele,19,2008,Cold Shoulder,\nYou say it's all in my head\nAnd the things ...,you say it all my head and thing i think just ...
2,3,Adele,19,2008,Hometown Glory,I've been walking in the same way as I did\nMi...,ive been walk same way as i did miss out crack...
3,4,Adele,19,2008,Make You Feel My Love,When the rain is blowing in your face\nAnd the...,when rain is blow your face and whole world is...
4,5,Adele,19,2008,My Same,"Aye, aye, aye-aye\nAye, aye, aye-aye\nAye, aye...",aye aye ayeay aye aye ayeay aye aye ayeay aye ...
...,...,...,...,...,...,...,...
4907,4908,XXXTentacion,The Fall,2014,The Fall,Fool's gold is a common man's trash\nI've seen...,fool gold is a common man trash ive seen god i...
4908,4909,XXXTentacion,The Fall,2014,​ghost,Shtuom ruo neewteb seid ecapS tuo dna ni revO\...,shtuom ruo neewteb seid ecap tuo dna ni revo m...
4909,4910,XXXTentacion,The Fall,2014,​white girl,"Haha\nYou know, gang, gang, bitch\nXXX, pussy ...",haha you know gang gang bitch xxx pussi boy fa...
4910,4911,XXXTentacion,Willy Wonka Was a Child Murderer,2016,Willy Wonka Was a Child Murderer,"Yeah\nIt's all in my, it's all in my head\nIt'...",yeah it all my it all my head it all my it all...


In [None]:
# Alias, not a copy: `data` and `lyrics_df` refer to the same DataFrame object.
data = lyrics_df

In [None]:
import networkx as nx
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Undirected knowledge graph linking every song to its artist, lyrics, album and year,
# plus song-to-song edges for lyrics whose TF-IDF cosine similarity exceeds `threshold`.
kg = nx.Graph()

for _, row in data.iterrows():
    song_id = row['no']

    # NOTE(review): nodes are keyed by their raw value, so a song title that equals an
    # artist/album name (or another song's title) shares a single node and the later
    # add_node call overwrites its `type` attribute.
    kg.add_node(row['title'], type='song', song_id=song_id)
    kg.add_node(row['artist'], type='artist')
    kg.add_edge(row['title'], row['artist'], relation='sung_by')

    kg.add_node(row['lyrics'], type='lyrics')
    kg.add_edge(row['title'], row['lyrics'], relation='has_lyric')

    if pd.notnull(row.get('album', None)):
        album = row['album']
        kg.add_node(album, type='album')
        kg.add_edge(row['title'], album, relation='has_album')
        kg.add_edge(row['artist'], album, relation='created_album')

    if pd.notnull(row.get('year', None)):
        year = row['year']
        kg.add_node(year, type='year')
        kg.add_edge(row['title'], year, relation='released_in')
        kg.add_edge(row['artist'], year, relation='active_in')

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['Processed_Lyrics'])

cosine_sim = cosine_similarity(tfidf_matrix)

threshold = 0.1

# Look titles up once instead of building a row Series with .iloc for every pair.
titles = data['title'].tolist()

for i in range(len(data)):
    # Only songs after i are compared, so each unordered pair is visited once.
    later_scores = cosine_sim[i, i + 1:]
    # The threshold test is vectorised per row; indices come back in ascending
    # order, so edges are added in the same (i, j) order as a nested loop would.
    for offset in (later_scores > threshold).nonzero()[0]:
        j = i + 1 + int(offset)
        score = later_scores[offset]
        song1 = titles[i]
        song2 = titles[j]

        # Existing edges (e.g. title == artist relations) are never overwritten.
        if not kg.has_edge(song1, song2):
            kg.add_edge(song1, song2, relation='related_to', similarity_score=score)

nx.write_graphml(kg, "K_Graph.graphml")

print("Graph berhasil disimpan.")


Graph berhasil disimpan.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def precision_at_k(relevant, retrieved, k):
    """Calculate precision at k.

    Args:
        relevant: Collection (list or set) of relevant document IDs. Items may
            also be dicts, in which case they are compared by their items.
        retrieved: Ranked list of retrieved document IDs (or dicts).
        k: Number of top retrieved documents to evaluate.

    Returns:
        float: number of distinct relevant documents among the first ``k``
        retrieved, divided by ``k``. Returns 0.0 when ``k`` is not positive or
        when either collection is empty, instead of raising IndexError.
    """
    if k <= 0:
        return 0.0

    retrieved_k = list(retrieved[:k])
    relevant = list(relevant)
    # Nothing retrieved or nothing relevant: no hits are possible, and the
    # element-type probes below would index into an empty list.
    if not retrieved_k or not relevant:
        return 0.0

    # Dicts are unhashable; compare them through a hashable tuple of their items.
    if isinstance(retrieved_k[0], dict):
        retrieved_k = [tuple(d.items()) for d in retrieved_k]
    if isinstance(relevant[0], dict):
        relevant = [tuple(d.items()) for d in relevant]

    relevant_retrieved = set(retrieved_k).intersection(relevant)
    return len(relevant_retrieved) / k


def recall_at_k(relevant, retrieved, k):
    """Calculate recall at k.

    relevant: Collection of relevant document IDs.
    retrieved: Ranked list of retrieved document IDs.
    k: Number of top retrieved documents to evaluate.

    Returns the share of ``relevant`` found among the first ``k`` retrieved
    documents, or 0.0 when ``relevant`` is empty.
    """
    total_relevant = len(relevant)
    if total_relevant == 0:
        return 0.0

    top_k_ids = set(retrieved[:k])
    matched = top_k_ids & set(relevant)
    return len(matched) / total_relevant


def f1_at_k(precision, recall):
    """Combine a precision and a recall value into their harmonic mean (F1).

    precision: Precision value.
    recall: Recall value.

    Returns 0.0 when both inputs sum to zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total


def average_precision_func(relevant, retrieved):
    """Calculate average precision for one query.

    relevant: Collection of relevant document IDs.
    retrieved: Ranked list of retrieved document IDs.

    Sums precision at every rank where a relevant document appears and divides
    by the number of relevant documents; returns 0.0 when there are none.
    """
    if len(relevant) == 0:
        return 0.0

    hits = 0
    running_total = 0.0
    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id not in relevant:
            continue
        hits += 1
        running_total += hits / rank

    return running_total / len(relevant)


def evaluate_query(relevant_ids, retrieved_docs, k_values=[3, 6, 10]):
    """Compute all ranking metrics for a single query.

    relevant_ids: Collection of relevant document IDs.
    retrieved_docs: Ranked list of retrieved document IDs.
    k_values: Cut-off points at which precision, recall and F1 are computed.

    Returns a dict with 'Precision@k', 'Recall@k' and 'F1@k' for every k, plus
    'Average Precision' over the whole retrieved list.
    """
    evaluation = {}
    for cutoff in k_values:
        precision = precision_at_k(relevant_ids, retrieved_docs, cutoff)
        recall = recall_at_k(relevant_ids, retrieved_docs, cutoff)
        evaluation.update({
            f'Precision@{cutoff}': precision,
            f'Recall@{cutoff}': recall,
            f'F1@{cutoff}': f1_at_k(precision, recall),
        })

    evaluation['Average Precision'] = average_precision_func(relevant_ids, retrieved_docs)
    return evaluation


def retrieve_with_tfidf(data_df, query, ngram_range=(1, 1), top_k=10, min_score=None):
    """Retrieve the top-k documents ranked by TF-IDF cosine similarity to ``query``.

    Args:
        data_df: DataFrame with a 'Processed_Lyrics' text column and a 'no' ID
            column; 'artist' and 'album' are copied into the result when present.
        query: The (already preprocessed) query string.
        ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``.
        top_k: Maximum number of documents to return.
        min_score: Optional exclusive lower bound on the similarity. ``None``
            (default) keeps every top-k document, including documents with
            similarity 0.0 that share no n-gram with the query; pass ``0`` to
            drop those.

    Returns:
        list[dict]: one dict per document with keys 'id', 'artist', 'album' and
        'tfidf_score', ordered from most to least similar.

    Note:
        The vectorizer is refitted on ``data_df`` on every call.
    """
    if top_k <= 0:
        # `argsort()[-0:]` slices the whole ranking rather than zero rows.
        return []

    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_df['Processed_Lyrics'])

    query_vector = tfidf_vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_k positions and reverse them.
    top_indices = cosine_sim.argsort()[-top_k:][::-1]

    retrieved_docs = []
    for idx in top_indices:
        score = cosine_sim[idx]
        if min_score is not None and score <= min_score:
            continue
        doc = data_df.iloc[idx]
        retrieved_docs.append({
            'id': doc['no'],
            'artist': doc.get('artist', None),
            'album': doc.get('album', None),
            'tfidf_score': score
        })
    return retrieved_docs


In [None]:
def get_artist_graph_score(artist, kg, query):
    """Score whether an artist is both in the graph and mentioned in the query.

    Args:
        artist: Artist label used as the node key in ``kg``.
        kg: Graph object exposing ``has_node``.
        query: Query text to search for the artist name.

    Returns:
        int: 1 when ``artist`` is a node of ``kg`` and its lower-cased name is a
        substring of the lower-cased query, otherwise 0. A missing artist or a
        non-string query scores 0 instead of raising.

    Note:
        The match is a plain substring test, so a stemmed query (e.g. "adel")
        does not contain the full name ("adele").
    """
    if artist is None or not isinstance(query, str):
        return 0
    if not kg.has_node(artist):
        return 0
    # str() keeps non-string labels (e.g. numeric names) from raising on .lower().
    return 1 if str(artist).lower() in query.lower() else 0

def get_album_graph_score(album, kg, query):
    """Score whether an album is both in the graph and mentioned in the query.

    Args:
        album: Album label used as the node key in ``kg``.
        kg: Graph object exposing ``has_node``.
        query: Query text to search for the album name.

    Returns:
        int: 1 when ``album`` is a node of ``kg`` and its lower-cased name is a
        substring of the lower-cased query, otherwise 0. A missing album or a
        non-string query scores 0 instead of raising.
    """
    if album is None or not isinstance(query, str):
        return 0
    if not kg.has_node(album):
        return 0
    # str() keeps numeric album labels (e.g. 19) from raising on .lower().
    return 1 if str(album).lower() in query.lower() else 0


def parse_query_for_graph(query, data):
    """Find the first known artist, album and year mentioned in a query.

    Args:
        query: Query text. Artist and album names are matched case-insensitively
            as substrings; years are matched as substrings of the raw query.
        data: DataFrame with 'artist', 'album' and 'year' columns.

    Returns:
        tuple: ``(artist, album, year)`` holding the first value (in row order)
        of each column that occurs in the query, or ``None`` where nothing
        matched. A non-string query yields ``(None, None, None)``.
    """
    artist = None
    album = None
    year = None

    if not isinstance(query, str):
        return artist, album, year

    query_lower = query.lower()

    # dropna() skips missing cells (which have no .lower()); drop_duplicates()
    # keeps first-occurrence order, so the first match is the same as when
    # scanning every row.
    for known_artist in data['artist'].dropna().drop_duplicates():
        if str(known_artist).lower() in query_lower:
            artist = known_artist
            break

    for known_album in data['album'].dropna().drop_duplicates():
        if str(known_album).lower() in query_lower:
            album = known_album
            break

    for known_year in data['year'].dropna().drop_duplicates():
        # A year column containing NaN is float: compare "2008", not "2008.0".
        if isinstance(known_year, float) and known_year.is_integer():
            year_text = str(int(known_year))
        else:
            year_text = str(known_year)
        if year_text in query:
            year = known_year
            break

    return artist, album, year

In [None]:
import pandas as pd

# Evaluate TF-IDF retrieval re-ranked with knowledge-graph artist/album bonuses,
# once per n-gram size (1..5), and save per-query and macro-averaged metrics.
k_values = [3, 6, 10]
top_k = 10
all_ngram_evaluations = []

# Metric columns produced by evaluate_query, in reporting order (derived from
# k_values so the macro table stays in sync with the cut-offs).
metric_names = [f'{metric}@{k}' for k in k_values for metric in ('Precision', 'Recall', 'F1')]
metric_names.append('Average Precision')

for n in range(1, 6):
    evaluation_results = []

    for idx, row in ground_truth_df.iterrows():
        query = row['Processed_Query']
        # Graph matching uses the original wording: Processed_Query is stemmed and
        # has its digits stripped, so artist names (e.g. "adele" vs "adel") and
        # years cannot be found in it.
        graph_query = row['query']
        relevant_ids = set(row['Relevant_Song_IDs'])

        # NOTE(review): these three values are not used anywhere else in this cell.
        artist, album, year = parse_query_for_graph(graph_query, lyrics_df)

        tfidf_retrieved_docs = retrieve_with_tfidf(
            lyrics_df, query, ngram_range=(n, n), top_k=top_k
        )

        for doc in tfidf_retrieved_docs:
            doc_id = doc['id']
            # Defaults guarantee every doc has a combined_score, so the sort
            # below cannot raise KeyError when the lookup fails.
            doc['artist_score'] = 0
            doc['album_score'] = 0

            lyric_row = lyrics_df.loc[lyrics_df['no'] == doc_id]
            if lyric_row.empty:
                print(f"Warning: doc_id {doc_id} not found in lyrics_df")
            else:
                artist_name = lyric_row['artist'].values[0] if not lyric_row['artist'].isnull().all() else None
                album_name = lyric_row['album'].values[0] if not lyric_row['album'].isnull().all() else None

                if artist_name:
                    doc['artist_score'] = get_artist_graph_score(artist_name, kg, graph_query)
                else:
                    print(f"Warning: artist_name is invalid for doc_id {doc_id}")

                if album_name:
                    doc['album_score'] = get_album_graph_score(album_name, kg, graph_query)
                else:
                    print(f"Warning: album_name is invalid for doc_id {doc_id}")

            doc['combined_score'] = (
                doc.get('tfidf_score', 0)
                + doc['artist_score']
                + doc['album_score']
            )

        # Re-rank the TF-IDF candidates by the combined score (stable for ties).
        combined_retrieved_docs = sorted(
            tfidf_retrieved_docs, key=lambda x: x['combined_score'], reverse=True
        )[:top_k]
        combined_retrieved_ids = [doc['id'] for doc in combined_retrieved_docs]

        evaluation = evaluate_query(relevant_ids, combined_retrieved_ids, k_values)
        evaluation['Query'] = row['Processed_Query']
        evaluation['N-gram'] = n
        evaluation_results.append(evaluation)

    eval_df = pd.DataFrame(evaluation_results)
    print(f"\nEvaluation Results for {n}-gram:")
    print(eval_df.head())

    macro_results = {
        "Metric": list(metric_names),
        "Macro Average": [eval_df[name].mean() for name in metric_names]
    }
    macro_df = pd.DataFrame(macro_results)
    macro_df['N-gram'] = n
    print(f"\nMacro Averages for {n}-gram:")
    print(macro_df)

    eval_df.to_excel(f'evaluation_combined_tfidf_graph_{n}gram.xlsx', index=False)
    macro_df.to_excel(f'macro_averages_combined_graph_{n}gram.xlsx', index=False)

    all_ngram_evaluations.append(macro_df)

combined_macro = pd.concat(all_ngram_evaluations, ignore_index=True)
combined_macro.to_excel('combined_macro_averages_all_ngrams_with_graph.xlsx', index=False)
print("\nAll macro averages for all n-grams have been saved to 'combined_macro_averages_all_ngrams_with_graph.xlsx'.")



Evaluation Results for 1-gram:
   Precision@3  Recall@3      F1@3  Precision@6  Recall@6      F1@6  \
0     0.000000  0.000000  0.000000     0.000000  0.000000  0.000000   
1     0.000000  0.000000  0.000000     0.000000  0.000000  0.000000   
2     0.333333  0.055556  0.095238     0.333333  0.111111  0.166667   
3     0.000000  0.000000  0.000000     0.000000  0.000000  0.000000   
4     0.000000  0.000000  0.000000     0.166667  0.333333  0.222222   

   Precision@10  Recall@10     F1@10  Average Precision         Query  N-gram  
0           0.0   0.000000  0.000000           0.000000      love you       1  
1           0.0   0.000000  0.000000           0.000000  home tonight       1  
2           0.2   0.111111  0.142857           0.074074    someth new       1  
3           0.1   0.250000  0.142857           0.031250    i held you       1  
4           0.1   0.333333  0.153846           0.066667  star tonight       1  

Macro Averages for 1-gram:
              Metric  Macro Avera

In [None]:
from collections import defaultdict

def rrf_fusion(ranked_lists, rrf_k=60):
    """
    Fuse several rankings into one using Reciprocal Rank Fusion (RRF).

    Parameters:
    - ranked_lists: List of rankings; each ranking holds document IDs (or dicts
      with an 'id' key) ordered from most to least relevant.
    - rrf_k: Constant added to the rank; larger values flatten rank differences.

    Returns:
    - List of document IDs ordered by descending RRF score. Ties keep the order
      in which the IDs were first seen.
    """
    scores = defaultdict(float)
    for ranking in ranked_lists:
        for rank, entry in enumerate(ranking):
            key = entry.get('id') if isinstance(entry, dict) else entry
            # rank is 0-based, hence the +1 to make the best position count as 1.
            scores[key] += 1.0 / (rrf_k + rank + 1)

    # sorted() is stable, so equal scores stay in insertion order.
    return sorted(scores, key=scores.get, reverse=True)


# Fuse the 1..5-gram TF-IDF rankings of every query with RRF and evaluate the result.
rrf_evaluations = []

for idx, row in ground_truth_df.iterrows():
    query = row['Processed_Query']
    relevant_ids = set(row['Relevant_Song_IDs'])

    ranked_lists = []
    for n in range(1, 6):
        retrieved_docs = retrieve_with_tfidf(lyrics_df, query, ngram_range=(n,n), top_k=top_k)
        # When the query has fewer than n tokens (or shares no n-gram with any
        # song), every similarity is 0 and the "top" documents are arbitrary.
        # The same arbitrary documents then recur across n-gram sizes and would
        # outrank the real matches after fusion, so they are dropped here.
        matched_docs = [doc for doc in retrieved_docs if doc['tfidf_score'] > 0]
        if matched_docs:
            ranked_lists.append(matched_docs)

    fused_docs = rrf_fusion(ranked_lists, rrf_k=60)

    if fused_docs:
        rrf_eval = evaluate_query(relevant_ids, fused_docs, k_values)
    else:
        # No n-gram size matched anything: report zeros with the same keys and
        # key order that evaluate_query produces.
        rrf_eval = {
            f'{metric}@{k}': 0.0
            for k in k_values
            for metric in ('Precision', 'Recall', 'F1')
        }
        rrf_eval['Average Precision'] = 0.0
    rrf_eval['Query'] = row['query']
    rrf_evaluations.append(rrf_eval)

rrf_df = pd.DataFrame(rrf_evaluations)
print("\nRRF Fused Evaluation Results:")
print(rrf_df.head())

# (report label, rrf_df column) pairs. The unsuffixed "Precision", "Recall" and
# "F1" rows are the @10 values.
rrf_macro_rows = [
    ("Precision", 'Precision@10'),
    ("Recall", 'Recall@10'),
    ("F1", 'F1@10'),
    ("Precision@3", 'Precision@3'),
    ("Precision@6", 'Precision@6'),
    ("Precision@10", 'Precision@10'),
    ("Average Precision", 'Average Precision'),
]

rrf_macro_results = {
    "Metric": [label for label, _ in rrf_macro_rows],
    "Macro Average": [rrf_df[column].mean() for _, column in rrf_macro_rows],
}
rrf_macro_df = pd.DataFrame(rrf_macro_results)
rrf_macro_df['N-gram'] = 'RRF Fusion'
print("\nMacro Averages for RRF Fusion:")
print(rrf_macro_df)

# Persist the per-query metrics and the macro averages.
rrf_df.to_excel('evaluation_tfidf_rrf_fusion.xlsx', index=False)
rrf_macro_df.to_excel('macro_averages_rrf_fusion.xlsx', index=False)
print("\nRRF Fused evaluation results have been saved to 'evaluation_tfidf_rrf_fusion.xlsx'.")
print("Macro averages for RRF Fusion have been saved to 'macro_averages_rrf_fusion.xlsx'.")



RRF Fused Evaluation Results:
   Precision@3  Recall@3  F1@3  Precision@6  Recall@6  F1@6  Precision@10  \
0          0.0       0.0   0.0          0.0       0.0   0.0           0.0   
1          0.0       0.0   0.0          0.0       0.0   0.0           0.0   
2          0.0       0.0   0.0          0.0       0.0   0.0           0.0   
3          0.0       0.0   0.0          0.0       0.0   0.0           0.0   
4          0.0       0.0   0.0          0.0       0.0   0.0           0.0   

   Recall@10  F1@10  Average Precision          Query  
0        0.0    0.0           0.000000    love you in  
1        0.0    0.0           0.201063   home tonight  
2        0.0    0.0           0.149886  something new  
3        0.0    0.0           0.148310     i held you  
4        0.0    0.0           0.148252  stars tonight  

Macro Averages for RRF Fusion:
              Metric  Macro Average      N-gram
0          Precision       0.088000  RRF Fusion
1             Recall       0.187857  RRF F

## Complex

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import math
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings("ignore")

# Fetch the NLTK tokenizer data packages (presumably required by word_tokenize below).
nltk.download('punkt')
nltk.download('punkt_tab')


# Reload the song table; this replaces the earlier lyrics_df, so 'Processed_Lyrics'
# has to be recomputed below.
# NOTE(review): absolute Colab paths -- the notebook only runs where these files exist.
lyrics_df = pd.read_excel('/content/databersih (2).xlsx')
# Ground truth for the "complex" queries; later cells read 'query', 'song_id' and 'total'.
ground_truth_df = pd.read_excel('/content/GT UAS NLP_complex.xlsx')

# Shared stemmer and the list of tokens dropped (before stemming) by clean_lyrics
# and preprocess_query.
stemmer = PorterStemmer()
remove_words = ["song", "with", "lyrics", "from", "the", "album", "released", "in", "before", "after", "since", "s"]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def clean_lyrics(text):
    """Normalise one lyrics string into a space-joined sequence of stems.

    Strips ``[...]`` and ``(...)`` spans, punctuation and digits, lower-cases the
    text, tokenises it, drops tokens listed in ``remove_words`` and stems the rest.
    Non-string input (for example a missing cell) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    stripped = text
    # Order matters: bracketed spans must go before punctuation is removed.
    for pattern in (r'\[.*?\]', r'\(.*?\)', r'[^\w\s]', r'\d+'):
        stripped = re.sub(pattern, '', stripped)

    kept_stems = []
    for token in word_tokenize(stripped.lower()):
        if token in remove_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

# Add the cleaned/stemmed lyrics as a new column; this is the text that TF-IDF is fitted on.
lyrics_df['Processed_Lyrics'] = lyrics_df['lyrics'].apply(clean_lyrics)

# Preview the first rows to sanity-check the cleaning.
print("\nCleaned Lyrics Sample:")
print(lyrics_df[['title', 'Processed_Lyrics']].head())

def parse_song_ids(song_id_entry):
    """Parse one ground-truth ``song_id`` cell into a list of integer song IDs.

    Accepted inputs:
    - missing values (``None`` / NaN) -> ``[]``
    - a single integer (Python or NumPy) -> ``[id]``
    - a single integral float such as ``129.0`` -> ``[129]``; a column that mixes
      single IDs with empty cells is loaded as float, and ``str(129.0)`` would
      otherwise fail the digit check and silently drop the query
    - a comma-separated string such as ``"129, 1433"`` -> ``[129, 1433]``;
      pieces that are not plain digits are skipped

    Returns:
        list[int]: the parsed IDs, in the order they appear.
    """
    if pd.isnull(song_id_entry):
        return []
    if isinstance(song_id_entry, (int, np.integer)):
        return [int(song_id_entry)]
    if isinstance(song_id_entry, (float, np.floating)):
        as_float = float(song_id_entry)
        # Non-integral floats (1.5, inf) cannot be a song ID.
        return [int(as_float)] if as_float.is_integer() else []

    song_ids = []
    for piece in str(song_id_entry).split(','):
        piece = piece.strip()
        if piece.isdigit():
            song_ids.append(int(piece))
    return song_ids

# Turn the raw 'song_id' cells into lists of ints.
ground_truth_df['Relevant_Song_IDs'] = ground_truth_df['song_id'].apply(parse_song_ids)
# Drop queries that ended up with no relevant IDs and renumber the remaining rows.
ground_truth_df = ground_truth_df[ground_truth_df['Relevant_Song_IDs'].map(len) > 0].reset_index(drop=True)
print("\nParsed Ground Truth:")
print(ground_truth_df[['query', 'Relevant_Song_IDs', 'total']].head())

def preprocess_query(query):
    """Normalise a query string the same way lyrics tokens are normalised.

    Removes punctuation and digits, lower-cases, tokenises, drops tokens listed in
    ``remove_words`` and stems what is left. Non-string input is returned unchanged.
    """
    if not isinstance(query, str):
        return query

    cleaned = re.sub(r'\d+', '', re.sub(r'[^\w\s]', '', query)).lower()
    stems = []
    for token in word_tokenize(cleaned):
        if token in remove_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Stemmed query text; this is what gets vectorised against 'Processed_Lyrics'.
ground_truth_df['Processed_Query'] = ground_truth_df['query'].apply(preprocess_query)


Cleaned Lyrics Sample:
                   title                                   Processed_Lyrics
0      Chasing Pavements  ive made up my mind dont need to think it over...
1          Cold Shoulder  you say it all my head and thing i think just ...
2         Hometown Glory  ive been walk same way as i did miss out crack...
3  Make You Feel My Love  when rain is blow your face and whole world is...
4                My Same  aye aye ayeay aye aye ayeay aye aye ayeay aye ...

Parsed Ground Truth:
                                               query Relevant_Song_IDs  total
0               Adele's song with lyrics love you in            [1433]      1
1    Billie Eilish's song with lyrics stick together             [227]      1
2              Bruno Mars' song with lyrics kiss you            [2129]      1
3    Jennifer Lopez's song with lyrics you feel left             [788]      1
4  Song released in 2024 with lyrics hopeless rom...            [3019]      1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def precision_at_k(relevant, retrieved, k):
    """Calculate precision at k.

    Args:
        relevant: Collection (list or set) of relevant document IDs. Items may
            also be dicts, in which case they are compared by their items.
        retrieved: Ranked list of retrieved document IDs (or dicts).
        k: Number of top retrieved documents to evaluate.

    Returns:
        float: number of distinct relevant documents among the first ``k``
        retrieved, divided by ``k``. Returns 0.0 when ``k`` is not positive or
        when either collection is empty, instead of raising IndexError.
    """
    if k <= 0:
        return 0.0

    retrieved_k = list(retrieved[:k])
    relevant = list(relevant)
    # Nothing retrieved or nothing relevant: no hits are possible, and the
    # element-type probes below would index into an empty list.
    if not retrieved_k or not relevant:
        return 0.0

    # Dicts are unhashable; compare them through a hashable tuple of their items.
    if isinstance(retrieved_k[0], dict):
        retrieved_k = [tuple(d.items()) for d in retrieved_k]
    if isinstance(relevant[0], dict):
        relevant = [tuple(d.items()) for d in relevant]

    relevant_retrieved = set(retrieved_k).intersection(relevant)
    return len(relevant_retrieved) / k


def recall_at_k(relevant, retrieved, k):
    """Calculate recall at k.

    relevant: Collection of relevant document IDs.
    retrieved: Ranked list of retrieved document IDs.
    k: Number of top retrieved documents to evaluate.

    Returns the share of ``relevant`` found among the first ``k`` retrieved
    documents, or 0.0 when ``relevant`` is empty.
    """
    total_relevant = len(relevant)
    if total_relevant == 0:
        return 0.0

    top_k_ids = set(retrieved[:k])
    matched = top_k_ids & set(relevant)
    return len(matched) / total_relevant


def f1_at_k(precision, recall):
    """Combine a precision and a recall value into their harmonic mean (F1).

    precision: Precision value.
    recall: Recall value.

    Returns 0.0 when both inputs sum to zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * (precision * recall) / total


def average_precision_func(relevant, retrieved):
    """Calculate average precision for one query.

    relevant: Collection of relevant document IDs.
    retrieved: Ranked list of retrieved document IDs.

    Sums precision at every rank where a relevant document appears and divides
    by the number of relevant documents; returns 0.0 when there are none.
    """
    if len(relevant) == 0:
        return 0.0

    hits = 0
    running_total = 0.0
    for rank, doc_id in enumerate(retrieved, start=1):
        if doc_id not in relevant:
            continue
        hits += 1
        running_total += hits / rank

    return running_total / len(relevant)


def evaluate_query(relevant_ids, retrieved_docs, k_values=[3, 6, 10]):
    """Compute all ranking metrics for a single query.

    relevant_ids: Collection of relevant document IDs.
    retrieved_docs: Ranked list of retrieved document IDs.
    k_values: Cut-off points at which precision, recall and F1 are computed.

    Returns a dict with 'Precision@k', 'Recall@k' and 'F1@k' for every k, plus
    'Average Precision' over the whole retrieved list.
    """
    evaluation = {}
    for cutoff in k_values:
        precision = precision_at_k(relevant_ids, retrieved_docs, cutoff)
        recall = recall_at_k(relevant_ids, retrieved_docs, cutoff)
        evaluation.update({
            f'Precision@{cutoff}': precision,
            f'Recall@{cutoff}': recall,
            f'F1@{cutoff}': f1_at_k(precision, recall),
        })

    evaluation['Average Precision'] = average_precision_func(relevant_ids, retrieved_docs)
    return evaluation


def retrieve_with_tfidf(data_df, query, ngram_range=(1, 1), top_k=10, min_score=None):
    """Retrieve the top-k documents ranked by TF-IDF cosine similarity to ``query``.

    Args:
        data_df: DataFrame with a 'Processed_Lyrics' text column and a 'no' ID
            column; 'artist' and 'album' are copied into the result when present.
        query: The (already preprocessed) query string.
        ngram_range: ``(min_n, max_n)`` passed to ``TfidfVectorizer``.
        top_k: Maximum number of documents to return.
        min_score: Optional exclusive lower bound on the similarity. ``None``
            (default) keeps every top-k document, including documents with
            similarity 0.0 that share no n-gram with the query; pass ``0`` to
            drop those.

    Returns:
        list[dict]: one dict per document with keys 'id', 'artist', 'album' and
        'tfidf_score', ordered from most to least similar.

    Note:
        The vectorizer is refitted on ``data_df`` on every call.
    """
    if top_k <= 0:
        # `argsort()[-0:]` slices the whole ranking rather than zero rows.
        return []

    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_df['Processed_Lyrics'])

    query_vector = tfidf_vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending: take the last top_k positions and reverse them.
    top_indices = cosine_sim.argsort()[-top_k:][::-1]

    retrieved_docs = []
    for idx in top_indices:
        score = cosine_sim[idx]
        if min_score is not None and score <= min_score:
            continue
        doc = data_df.iloc[idx]
        retrieved_docs.append({
            'id': doc['no'],
            'artist': doc.get('artist', None),
            'album': doc.get('album', None),
            'tfidf_score': score
        })
    return retrieved_docs


In [None]:
def get_artist_graph_score(artist, kg, query):
    """Score whether an artist is both in the graph and mentioned in the query.

    Args:
        artist: Artist label used as the node key in ``kg``.
        kg: Graph object exposing ``has_node``.
        query: Query text to search for the artist name.

    Returns:
        int: 1 when ``artist`` is a node of ``kg`` and its lower-cased name is a
        substring of the lower-cased query, otherwise 0. A missing artist or a
        non-string query scores 0 instead of raising.

    Note:
        The match is a plain substring test, so a stemmed query (e.g. "adel")
        does not contain the full name ("adele").
    """
    if artist is None or not isinstance(query, str):
        return 0
    if not kg.has_node(artist):
        return 0
    # str() keeps non-string labels (e.g. numeric names) from raising on .lower().
    return 1 if str(artist).lower() in query.lower() else 0

def get_album_graph_score(album, kg, query):
    """Score whether an album is both in the graph and mentioned in the query.

    Args:
        album: Album label used as the node key in ``kg``.
        kg: Graph object exposing ``has_node``.
        query: Query text to search for the album name.

    Returns:
        int: 1 when ``album`` is a node of ``kg`` and its lower-cased name is a
        substring of the lower-cased query, otherwise 0. A missing album or a
        non-string query scores 0 instead of raising.
    """
    if album is None or not isinstance(query, str):
        return 0
    if not kg.has_node(album):
        return 0
    # str() keeps numeric album labels (e.g. 19) from raising on .lower().
    return 1 if str(album).lower() in query.lower() else 0


def parse_query_for_graph(query, data):
    """Find the first known artist, album and year mentioned in a query.

    Args:
        query: Query text. Artist and album names are matched case-insensitively
            as substrings; years are matched as substrings of the raw query.
        data: DataFrame with 'artist', 'album' and 'year' columns.

    Returns:
        tuple: ``(artist, album, year)`` holding the first value (in row order)
        of each column that occurs in the query, or ``None`` where nothing
        matched. A non-string query yields ``(None, None, None)``.
    """
    artist = None
    album = None
    year = None

    if not isinstance(query, str):
        return artist, album, year

    query_lower = query.lower()

    # dropna() skips missing cells (which have no .lower()); drop_duplicates()
    # keeps first-occurrence order, so the first match is the same as when
    # scanning every row.
    for known_artist in data['artist'].dropna().drop_duplicates():
        if str(known_artist).lower() in query_lower:
            artist = known_artist
            break

    for known_album in data['album'].dropna().drop_duplicates():
        if str(known_album).lower() in query_lower:
            album = known_album
            break

    for known_year in data['year'].dropna().drop_duplicates():
        # A year column containing NaN is float: compare "2008", not "2008.0".
        if isinstance(known_year, float) and known_year.is_integer():
            year_text = str(int(known_year))
        else:
            year_text = str(known_year)
        if year_text in query:
            year = known_year
            break

    return artist, album, year

In [None]:
import pandas as pd

# Evaluate TF-IDF retrieval re-ranked with knowledge-graph artist/album bonuses,
# once per n-gram size (1..5), and save per-query and macro-averaged metrics.
# `kg` is the graph object built earlier in the notebook.
k_values = [3, 6, 10]
top_k = 10
all_ngram_evaluations = []

# Metric columns produced by evaluate_query, in reporting order (derived from
# k_values so the macro table stays in sync with the cut-offs).
metric_names = [f'{metric}@{k}' for k in k_values for metric in ('Precision', 'Recall', 'F1')]
metric_names.append('Average Precision')

for n in range(1, 6):
    evaluation_results = []

    for idx, row in ground_truth_df.iterrows():
        query = row['Processed_Query']
        # Graph matching uses the original wording: Processed_Query is stemmed and
        # has its digits stripped, so artist names (e.g. "adele" vs "adel") and
        # years cannot be found in it.
        graph_query = row['query']
        relevant_ids = set(row['Relevant_Song_IDs'])

        # NOTE(review): these three values are not used anywhere else in this cell.
        artist, album, year = parse_query_for_graph(graph_query, lyrics_df)

        tfidf_retrieved_docs = retrieve_with_tfidf(
            lyrics_df, query, ngram_range=(n, n), top_k=top_k
        )

        for doc in tfidf_retrieved_docs:
            doc_id = doc['id']
            # Defaults guarantee every doc has a combined_score, so the sort
            # below cannot raise KeyError when the lookup fails.
            doc['artist_score'] = 0
            doc['album_score'] = 0

            lyric_row = lyrics_df.loc[lyrics_df['no'] == doc_id]
            if lyric_row.empty:
                print(f"Warning: doc_id {doc_id} not found in lyrics_df")
            else:
                artist_name = lyric_row['artist'].values[0] if not lyric_row['artist'].isnull().all() else None
                album_name = lyric_row['album'].values[0] if not lyric_row['album'].isnull().all() else None

                if artist_name:
                    doc['artist_score'] = get_artist_graph_score(artist_name, kg, graph_query)
                else:
                    print(f"Warning: artist_name is invalid for doc_id {doc_id}")

                if album_name:
                    doc['album_score'] = get_album_graph_score(album_name, kg, graph_query)
                else:
                    print(f"Warning: album_name is invalid for doc_id {doc_id}")

            doc['combined_score'] = (
                doc.get('tfidf_score', 0)
                + doc['artist_score']
                + doc['album_score']
            )

        # Re-rank the TF-IDF candidates by the combined score (stable for ties).
        combined_retrieved_docs = sorted(
            tfidf_retrieved_docs, key=lambda x: x['combined_score'], reverse=True
        )[:top_k]
        combined_retrieved_ids = [doc['id'] for doc in combined_retrieved_docs]

        evaluation = evaluate_query(relevant_ids, combined_retrieved_ids, k_values)
        evaluation['Query'] = row['Processed_Query']
        evaluation['N-gram'] = n
        evaluation_results.append(evaluation)

    eval_df = pd.DataFrame(evaluation_results)
    print(f"\nEvaluation Results for {n}-gram:")
    print(eval_df.head())

    macro_results = {
        "Metric": list(metric_names),
        "Macro Average": [eval_df[name].mean() for name in metric_names]
    }
    macro_df = pd.DataFrame(macro_results)
    macro_df['N-gram'] = n
    print(f"\nMacro Averages for {n}-gram:")
    print(macro_df)

    eval_df.to_excel(f'evaluation_combined_complex_tfidf_graph_{n}gram.xlsx', index=False)
    macro_df.to_excel(f'macro_averages_combined_complex_graph_{n}gram.xlsx', index=False)

    all_ngram_evaluations.append(macro_df)

combined_macro = pd.concat(all_ngram_evaluations, ignore_index=True)
combined_macro.to_excel('combined_macro_averages_complex_all_ngrams_with_graph.xlsx', index=False)
# The message names the file that is actually written above.
print("\nAll macro averages for all n-grams have been saved to 'combined_macro_averages_complex_all_ngrams_with_graph.xlsx'.")



Evaluation Results for 1-gram:
   Precision@3  Recall@3  F1@3  Precision@6  Recall@6      F1@6  Precision@10  \
0     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
1     0.333333       1.0   0.5     0.166667       1.0  0.285714           0.1   
2     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
3     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
4     0.000000       0.0   0.0     0.166667       1.0  0.285714           0.1   

   Recall@10     F1@10  Average Precision                       Query  N-gram  
0        0.0  0.000000           0.000000               adel love you       1  
1        1.0  0.181818           0.333333   billi eilish stick togeth       1  
2        0.0  0.000000           0.000000          bruno mar kiss you       1  
3        0.0  0.000000           0.000000  jennif lopez you feel left       1  
4        1.0  0.181818           0.166667             hopeless romant       1  



In [None]:
from collections import defaultdict

def rrf_fusion(ranked_lists, rrf_k=60):
    """
    Merge several rankings into one using Reciprocal Rank Fusion (RRF).

    Every appearance of a document at 0-based position ``rank`` in a list
    adds ``1 / (rrf_k + rank + 1)`` to that document's score.

    Parameters:
    - ranked_lists: Iterable of rankings, best match first. Each entry is
      either a document ID or a dict, in which case its 'id' value is used.
    - rrf_k: Constant added to the rank in the denominator.

    Returns:
    - List of document IDs sorted by descending fused score. Documents with
      equal scores stay in the order they were first encountered.
    """
    totals = defaultdict(float)
    for ranking in ranked_lists:
        for rank, entry in enumerate(ranking):
            key = entry.get('id') if isinstance(entry, dict) else entry
            totals[key] += 1.0 / (rrf_k + rank + 1)
    # sorted() is stable, so tied scores keep dict insertion order.
    return sorted(totals, key=totals.get, reverse=True)


rrf_evaluations = []

# Build one evaluation record per ground-truth query.
for _, gt_row in ground_truth_df.iterrows():
    # Rankings from retrieve_with_tfidf for ngram_range (1, 1) and (2, 2),
    # in that order.
    ranked_lists = [
        retrieve_with_tfidf(
            lyrics_df,
            gt_row['Processed_Query'],
            ngram_range=(size, size),
            top_k=top_k,
        )
        for size in range(1, 3)
    ]
    fused_docs = rrf_fusion(ranked_lists, rrf_k=60)

    # Score the fused ranking against the relevant song IDs, and label the
    # record with the value of the row's 'query' column.
    rrf_eval = evaluate_query(set(gt_row['Relevant_Song_IDs']), fused_docs, k_values)
    rrf_eval['Query'] = gt_row['query']
    rrf_evaluations.append(rrf_eval)

rrf_df = pd.DataFrame(rrf_evaluations)
print("\nRRF Fused Evaluation Results:")
print(rrf_df.head())

# (row label, rrf_df column) pairs for the macro-average summary.
# The unsuffixed "Precision" / "Recall" / "F1" rows report the @10 columns,
# so the Precision@10 mean appears twice in the table.
rrf_metric_columns = [
    ("Precision", 'Precision@10'),
    ("Recall", 'Recall@10'),
    ("F1", 'F1@10'),
    ("Precision@3", 'Precision@3'),
    ("Precision@6", 'Precision@6'),
    ("Precision@10", 'Precision@10'),
    ("Average Precision", 'Average Precision'),
]
rrf_macro_results = {
    "Metric": [label for label, _ in rrf_metric_columns],
    # Mean over all evaluated queries for each selected column.
    "Macro Average": [rrf_df[column].mean() for _, column in rrf_metric_columns],
}
rrf_macro_df = pd.DataFrame(rrf_macro_results)
rrf_macro_df['N-gram'] = 'RRF Fusion'
print("\nMacro Averages for RRF Fusion:")
print(rrf_macro_df)

# Export the per-query results and the macro summary, then confirm the paths.
rrf_df.to_excel('evaluation_tfidf_rrf_fusion.xlsx', index=False)
rrf_macro_df.to_excel('macro_averages_rrf_fusion.xlsx', index=False)
print("\nRRF Fused evaluation results have been saved to 'evaluation_tfidf_rrf_fusion.xlsx'.")
print("Macro averages for RRF Fusion have been saved to 'macro_averages_rrf_fusion.xlsx'.")



RRF Fused Evaluation Results:
   Precision@3  Recall@3  F1@3  Precision@6  Recall@6      F1@6  Precision@10  \
0     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
1     0.333333       1.0   0.5     0.166667       1.0  0.285714           0.1   
2     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
3     0.000000       0.0   0.0     0.000000       0.0  0.000000           0.0   
4     0.333333       1.0   0.5     0.166667       1.0  0.285714           0.1   

   Recall@10     F1@10  Average Precision  \
0        0.0  0.000000                0.0   
1        1.0  0.181818                0.5   
2        0.0  0.000000                0.0   
3        0.0  0.000000                0.0   
4        1.0  0.181818                0.5   

                                               Query  
0               Adele's song with lyrics love you in  
1    Billie Eilish's song with lyrics stick together  
2              Bruno Mars' song with lyrics ki