# BM25L

## Simple

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict
import math

# Silence every warning raised for the rest of the session.
warnings.filterwarnings("ignore")
# Fetch the NLTK 'punkt' / 'punkt_tab' packages (presumably the tokenizer
# data that word_tokenize relies on -- confirm against the NLTK version in use).
nltk.download('punkt')
nltk.download('punkt_tab')

# Song table; the columns read later in this file are 'lyrics', 'title',
# 'artist', 'album', 'year' and 'id'.
lyrics_df = pd.read_excel('/content/databersih.xlsx')
# Ground truth for the "simple" queries; the columns read later are
# 'query', 'song_id' and 'total'.
ground_truth_df = pd.read_excel('/content/GT UAS NLP_simple.xlsx')

# Shared stemmer used for both lyrics and queries.
stemmer = PorterStemmer()

# Tokens dropped (before stemming) from lyrics and queries alike.
remove_words = ["song", "with", "lyrics", "from", "the", "album", "released",
               "in", "before", "after", "since", "s"]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def clean_lyrics(text):
    """Normalise one lyrics string into a space-separated list of stems.

    Square-bracketed spans, parenthesised spans, punctuation and digits are
    removed (in that order), the text is lower-cased and tokenised, tokens
    listed in ``remove_words`` are dropped and the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Non-string input (for example a missing cell) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Order matters: bracketed spans must go before the punctuation pass
    # strips the brackets themselves.
    for pattern in (r'\[.*?\]', r'\(.*?\)', r'[^\w\s]', r'\d+'):
        text = re.sub(pattern, '', text)

    stems = []
    for token in word_tokenize(text.lower()):
        if token not in remove_words:
            stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Clean every song's lyrics once; this column is the retrieval corpus.
lyrics_df['Processed_Lyrics'] = lyrics_df['lyrics'].apply(clean_lyrics)

# Quick visual sanity check of the cleaned text.
print("\nCleaned Lyrics Sample:")
print(lyrics_df[['title', 'Processed_Lyrics']].head())

def parse_song_ids(song_id_entry):
    """Turn one ground-truth ``song_id`` cell into a list of integer ids.

    Accepted cell contents:

    * missing value (``None`` / NaN)      -> ``[]``
    * integer (Python or NumPy)           -> ``[id]``
    * float with an integral value        -> ``[id]``  (e.g. ``129.0``)
    * anything else is converted to ``str`` and split on commas; pieces that
      are not purely digits are ignored.

    Returns:
        list[int]: the parsed ids, in the order they appear in the cell.
    """
    if pd.isnull(song_id_entry):
        return []
    if isinstance(song_id_entry, (int, np.integer)):
        return [int(song_id_entry)]
    if isinstance(song_id_entry, (float, np.floating)):
        # A numeric column that contains blanks is read as float, so a single
        # id arrives as e.g. 129.0. Going through str() would give "129.0",
        # which fails isdigit() and silently loses the id.
        value = float(song_id_entry)
        return [int(value)] if value.is_integer() else []

    pieces = (piece.strip() for piece in str(song_id_entry).split(','))
    return [int(piece) for piece in pieces if piece.isdigit()]

# Parse the raw 'song_id' cells into lists of integer ids.
ground_truth_df['Relevant_Song_IDs'] = ground_truth_df['song_id'].apply(parse_song_ids)
# Drop queries that ended up with no relevant song at all; they cannot be evaluated.
ground_truth_df = ground_truth_df[ground_truth_df['Relevant_Song_IDs'].map(len) > 0].reset_index(drop=True)
print("\nParsed Ground Truth:")
print(ground_truth_df[['query', 'Relevant_Song_IDs', 'total']].head())


Cleaned Lyrics Sample:
                   title                                   Processed_Lyrics
0      Chasing Pavements  ive made up my mind dont need to think it over...
1          Cold Shoulder  you say it all my head and thing i think just ...
2         Hometown Glory  ive been walk same way as i did miss out crack...
3  Make You Feel My Love  when rain is blow your face and whole world is...
4                My Same  aye aye ayeay aye aye ayeay aye aye ayeay aye ...

Parsed Ground Truth:
           query                                  Relevant_Song_IDs  total
0    love you in          [129, 1433, 1977, 2978, 3214, 3320, 4514]      7
1   home tonight  [561, 1019, 1455, 1671, 1686, 2428, 2434, 3126...     12
2  something new  [20, 484, 671, 783, 999, 1463, 2556, 2607, 264...     18
3     i held you                             [44, 1520, 4082, 4503]      4
4  stars tonight                                   [95, 2142, 3177]      3


### BM25 Implementation


In [None]:
class BM25:
    """Okapi BM25 scorer over a corpus of whitespace-tokenised documents.

    The inverse document frequency is
    ``log((N - df + 0.5) / (df + 0.5) + 1)`` and a term's contribution is
    ``idf * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))``.
    No additional lower-bound shift (as used by BM25L / BM25+) is applied.

    Parameters:
    - corpus: sequence of documents, each a string of space-separated tokens.
      Entries that are not strings are treated as empty documents.
    - k1: term-frequency saturation parameter.
    - b: document-length normalisation strength.

    Attributes:
    - corpus, k1, b: the constructor arguments.
    - doc_len: token count of every document.
    - avgdl: mean of ``doc_len`` (0.0 for an empty corpus).
    - term_freq: one ``{term: count}`` dict per document.
    - doc_freq: ``{term: number of documents containing it}``.
    - idf: ``{term: inverse document frequency}``.
    """

    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        # Tokenise each document exactly once. Non-string entries (e.g. NaN
        # from a missing lyrics cell) become empty documents instead of
        # crashing on ``.split()``.
        self._tokens = [doc.split() if isinstance(doc, str) else [] for doc in corpus]
        self.doc_len = [len(tokens) for tokens in self._tokens]
        # Guard the empty corpus, which used to raise ZeroDivisionError.
        self.avgdl = sum(self.doc_len) / len(self.doc_len) if self.doc_len else 0.0
        self.term_freq = []
        self.doc_freq = defaultdict(int)
        self.idf = {}
        self._initialize()

    def _initialize(self):
        """Fill ``term_freq``, ``doc_freq`` and ``idf`` from the tokenised corpus."""
        for words in self._tokens:
            tf = {}
            for w in words:
                tf[w] = tf.get(w, 0) + 1
            self.term_freq.append(tf)
            # Each distinct term counts once per document.
            for word in tf:
                self.doc_freq[word] += 1

        N = len(self._tokens)
        for word, df in self.doc_freq.items():
            self.idf[word] = math.log((N - df + 0.5) / (df + 0.5) + 1)

    def _score_terms(self, words, doc_index):
        """Score an already tokenised query against one document."""
        tf = self.term_freq[doc_index]
        dl = self.doc_len[doc_index]
        # avgdl is 0 only when every document is empty; nothing can match then.
        length_ratio = dl / self.avgdl if self.avgdl else 0.0
        # Length normalisation depends on the document only, so compute it once.
        norm = self.k1 * (1 - self.b + self.b * length_ratio)

        score = 0.0
        for w in words:
            f = tf.get(w, 0)
            # Absent terms contribute exactly 0; skipping them also avoids a
            # 0/0 when k1 == 0.
            if f and w in self.idf:
                score += self.idf[w] * ((f * (self.k1 + 1)) / (f + norm))
        return score

    def score(self, query, doc_index):
        """Return the BM25 score of ``query`` for the document at ``doc_index``.

        ``query`` is split on whitespace; repeated query terms are counted
        once per occurrence. Terms unknown to the corpus are ignored.
        """
        return self._score_terms(query.split(), doc_index)

    def get_scores(self, query):
        """Return a list with the score of ``query`` for every document, in corpus order."""
        words = query.split()  # split once, not once per document
        return [self._score_terms(words, i) for i in range(len(self.term_freq))]


### Preprocessing Queries


In [None]:
def preprocess_query(query, lyrics_df):
    """Reduce a free-text query to the stemmed keywords left after removing metadata.

    Every artist name, album name and year present in ``lyrics_df`` is removed
    from the query as a whole-word match (names case-insensitively). The rest
    is stripped of punctuation and digits, lower-cased, tokenised, filtered
    through ``remove_words`` and stemmed with the module-level ``stemmer``.

    Parameters:
    - query: the raw query; non-string values are returned unchanged.
    - lyrics_df: DataFrame providing the 'artist', 'album' and 'year' columns.

    Returns:
    - The processed query as a space-separated string of stems.
    """
    if not isinstance(query, str):
        return query

    def _known_names(column):
        # Missing cells would make re.escape raise TypeError, and blank names
        # would delete the spaces between words, so both are skipped.
        names = lyrics_df[column].dropna().astype(str).str.lower().unique()
        return [name for name in names if name.strip()]

    for artist in _known_names('artist'):
        query = re.sub(r'\b' + re.escape(artist) + r'\b', '', query, flags=re.IGNORECASE)
    for album in _known_names('album'):
        query = re.sub(r'\b' + re.escape(album) + r'\b', '', query, flags=re.IGNORECASE)
    # dropna: a missing year would otherwise become the pattern \bnan\b and
    # remove the literal word "nan" from queries.
    for year in lyrics_df['year'].dropna().unique():
        query = re.sub(r'\b' + re.escape(str(year)) + r'\b', '', query)

    query = re.sub(r'[^\w\s]', '', query)
    query = re.sub(r'\d+', '', query)
    query = query.lower()
    tokens = word_tokenize(query)
    tokens = [stemmer.stem(word) for word in tokens if word not in remove_words]
    return ' '.join(tokens)

# Store the cleaned form of every ground-truth query; the retrieval loop reads this column.
ground_truth_df['Processed_Query'] = ground_truth_df['query'].apply(lambda q: preprocess_query(q, lyrics_df))


### Helper Functions for N-grams and Evaluation


In [None]:
def generate_ngrams(query, n):
    """Return every run of ``n`` consecutive whitespace-separated words of ``query``.

    Each n-gram is returned as a single space-joined string, in order of
    appearance. The result is empty when the query has fewer than ``n`` words.
    """
    tokens = query.split()
    grams = []
    for start in range(len(tokens) - n + 1):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

def precision_at_k(y_true, y_pred, k):
    """Fraction of relevant items among the ``k`` highest-scored ones.

    ``y_true`` holds 0/1 relevance labels and ``y_pred`` the matching scores,
    position by position. The hit count is always divided by ``k`` itself,
    even when fewer than ``k`` items exist.
    """
    labels = np.array(y_true)
    scores = np.array(y_pred)
    # Highest score first, then keep the first k positions.
    best_first = np.argsort(scores)[::-1]
    hits = labels[best_first[:k]]
    return np.sum(hits) / k

def average_precision_func(y_true, y_pred):
    """Average precision of the ranking obtained by sorting ``y_pred`` descending.

    ``y_true`` holds 0/1 relevance labels aligned with the scores in
    ``y_pred``. The precision at each relevant position is summed and divided
    by the total number of relevant labels; 0 is returned when there are none.
    """
    labels = np.array(y_true)
    scores = np.array(y_pred)

    ranked_labels = labels[np.argsort(scores)[::-1]]

    hits = 0
    running_total = 0
    for position, label in enumerate(ranked_labels, start=1):
        if label == 1:
            hits += 1
            running_total += hits / position

    total_relevant = np.sum(labels)
    if total_relevant > 0:
        return running_total / total_relevant
    return 0

### RRF Fusion Function


In [None]:
def rrf_fusion(ranked_lists, rrf_k=60):
    """
    Merge several rankings into one with Reciprocal Rank Fusion (RRF).

    A document at zero-based position ``rank`` in a list earns
    ``1 / (rrf_k + rank + 1)``; its contributions from all lists are summed.

    Parameters:
    - ranked_lists: List of lists, each holding document IDs ordered best first.
    - rrf_k: Constant added to every rank; larger values flatten the influence of rank.

    Returns:
    - List of document IDs sorted by decreasing fused score. Documents with
      equal scores keep the order in which they were first encountered.
    """
    totals = {}
    for ranking in ranked_lists:
        for rank, doc_id in enumerate(ranking):
            totals[doc_id] = totals.get(doc_id, 0.0) + 1.0 / (rrf_k + rank + 1)
    # sorted() is stable, so ties retain dict insertion order.
    return sorted(totals, key=totals.get, reverse=True)


### Main Retrieval and Evaluation


In [None]:
# Build the BM25 index over the cleaned lyrics (one document per song).
corpus = lyrics_df['Processed_Lyrics'].tolist()
bm25 = BM25(corpus)

# Length of the ranked list kept per query and n-gram size. The name was read
# below without ever being assigned (NameError on a fresh kernel); the RRF
# evaluation cell reads it as well.
top_k = 10

song_id_to_index = {song_id: idx for idx, song_id in enumerate(lyrics_df['id'])}
# Song ids in corpus order, so relevance labels come from one vectorised
# membership test instead of a per-row ``iloc`` lookup for every query.
corpus_song_ids = lyrics_df['id'].to_numpy()

metric_names = [
    "Precision",
    "Recall",
    "F1-Score",
    "Precision@3",
    "Precision@6",
    "Precision@10",
    "Average Precision",
]

all_ngrams_results = []

# query text -> n-gram size -> ids of the top_k songs, best first.
query_retrievals = defaultdict(lambda: defaultdict(list))

for n in range(1, 6):
    print(f"Evaluating {n}-gram approach...")
    evaluation_results = []

    for _, row in ground_truth_df.iterrows():
        query = row['Processed_Query']
        relevant_ids = row['Relevant_Song_IDs']

        ngrams = generate_ngrams(query, n)
        if not ngrams:
            # The query has fewer than n tokens: record an all-zero row and
            # store no ranking for this n.
            evaluation_results.append({
                'Query': row['query'],
                'N-gram': n,
                **dict.fromkeys(metric_names, 0.0),
            })
            continue

        # A document's score is the sum of its BM25 scores over all n-grams.
        doc_scores = np.zeros(len(corpus))
        for ng in ngrams:
            doc_scores += np.array(bm25.get_scores(ng))

        y_true = np.isin(corpus_song_ids, list(relevant_ids)).astype(int)
        y_pred = doc_scores

        # NOTE(review): thresholding at the median labels at least half of the
        # corpus as "retrieved", which drives Precision/F1 towards zero and
        # Recall towards one. Kept as-is so reported numbers stay comparable.
        threshold = np.median(y_pred)
        binary_pred = y_pred >= threshold

        evaluation_results.append({
            'Query': row['query'],
            'N-gram': n,
            'Precision': precision_score(y_true, binary_pred, zero_division=0),
            'Recall': recall_score(y_true, binary_pred, zero_division=0),
            'F1-Score': f1_score(y_true, binary_pred, zero_division=0),
            'Precision@3': precision_at_k(y_true, y_pred, k=3),
            'Precision@6': precision_at_k(y_true, y_pred, k=6),
            'Precision@10': precision_at_k(y_true, y_pred, k=10),
            'Average Precision': average_precision_func(y_true, y_pred)
        })

        # Remember the top_k songs (best first) for the later rank fusion.
        sorted_doc_indices = np.argsort(doc_scores)[-top_k:][::-1]
        retrieved_docs = lyrics_df.iloc[sorted_doc_indices]['id'].tolist()
        query_retrievals[row['query']][n] = retrieved_docs

    evaluation_df = pd.DataFrame(evaluation_results)
    print(evaluation_df.head())

    # Macro average: unweighted mean of each metric over all queries.
    macro_df = pd.DataFrame({
        "Metric": metric_names,
        "Macro Average": [evaluation_df[name].mean() for name in metric_names],
    })
    macro_df['N-gram'] = n
    print(f"\nMacro Averages for {n}-grams:")
    print(macro_df)

    evaluation_df.to_excel(f'evaluation_bm25_{n}gram.xlsx', index=False)
    macro_df.to_excel(f'macro_averages_{n}gram.xlsx', index=False)
    print(f"\nSaved evaluation_bm25_{n}gram.xlsx and macro_averages_{n}gram.xlsx")

    all_ngrams_results.append(macro_df)

combined_macro = pd.concat(all_ngrams_results, ignore_index=True)
combined_macro.to_excel('combined_macro_averages_all_ngrams.xlsx', index=False)
print("\nAll combined macro averages have been saved to 'combined_macro_averages_all_ngrams.xlsx'.")


Evaluating 1-gram approach...
           Query  N-gram  Precision    Recall  F1-Score  Precision@3  \
0    love you in       1   0.002036  0.714286  0.004060     0.000000   
1   home tonight       1   0.002443  1.000000  0.004874     0.000000   
2  something new       1   0.003664  1.000000  0.007302     0.666667   
3     i held you       1   0.001629  1.000000  0.003252     0.333333   
4  stars tonight       1   0.000611  1.000000  0.001221     0.333333   

   Precision@6  Precision@10  Average Precision  
0     0.000000           0.0           0.003241  
1     0.166667           0.3           0.162615  
2     0.500000           0.4           0.327808  
3     0.166667           0.1           0.166020  
4     0.500000           0.3           0.666667  

Macro Averages for 1-grams:
              Metric  Macro Average  N-gram
0          Precision       0.001873       1
1             Recall       0.988571       1
2           F1-Score       0.003737       1
3        Precision@3       0.333

In [None]:
print("\nStarting Reciprocal Rank Fusion (RRF) Evaluation...")

# Number of fused documents treated as "retrieved" for Precision/Recall/F1.
# Must equal the list length used when the per-n-gram rankings were stored.
top_k = 10

# Song ids in corpus order, for vectorised label construction.
corpus_song_ids = lyrics_df['id'].to_numpy()

rrf_metric_names = [
    "Precision",
    "Recall",
    "F1",
    "Precision@3",
    "Precision@6",
    "Precision@10",
    "Average Precision",
]

rrf_evaluations = []

for _, row in ground_truth_df.iterrows():
    original_query = row['query']
    relevant_ids = set(row['Relevant_Song_IDs'])

    # One ranking per n-gram size; sizes skipped by the retrieval loop yield [].
    ranked_lists = [query_retrievals[original_query].get(n, []) for n in range(1, 6)]

    fused_docs = rrf_fusion(ranked_lists, rrf_k=60)
    retrieved = set(fused_docs[:top_k])

    y_true = np.isin(corpus_song_ids, list(relevant_ids)).astype(int)
    y_pred = np.isin(corpus_song_ids, list(retrieved)).astype(int)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Rank-based metrics are computed directly on the fused ranking.
    # Previously the song ids themselves were handed to precision_at_k as if
    # they were scores (and with a different length than y_true), and the AP
    # scores grew with rank, i.e. the ranking was evaluated upside down.
    hits = [1 if doc_id in relevant_ids else 0 for doc_id in fused_docs]
    p_at_3 = sum(hits[:3]) / 3
    p_at_6 = sum(hits[:6]) / 6
    p_at_10 = sum(hits[:10]) / 10

    # Average precision over the fused list, normalised by the number of
    # relevant songs present in the corpus; relevant songs that were never
    # retrieved contribute zero.
    found = 0
    precision_sum = 0.0
    for rank, hit in enumerate(hits, start=1):
        if hit:
            found += 1
            precision_sum += found / rank
    n_relevant = int(y_true.sum())
    ap = precision_sum / n_relevant if n_relevant > 0 else 0.0

    rrf_evaluations.append({
        'Query': original_query,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Precision@3': p_at_3,
        'Precision@6': p_at_6,
        'Precision@10': p_at_10,
        'Average Precision': ap
    })

rrf_df = pd.DataFrame(rrf_evaluations)
print("\nRRF Fused Evaluation Results:")
print(rrf_df.head())

# Macro average: unweighted mean of each metric over all queries.
rrf_macro_df = pd.DataFrame({
    "Metric": rrf_metric_names,
    "Macro Average": [rrf_df[name].mean() for name in rrf_metric_names],
})
rrf_macro_df['N-gram'] = 'RRF Fusion'
print("\nMacro Averages for RRF Fusion:")
print(rrf_macro_df)

rrf_df.to_excel('evaluation_bm25_rrf_fusion.xlsx', index=False)
rrf_macro_df.to_excel('macro_averages_rrf_fusion.xlsx', index=False)
print("\nRRF Fused evaluation results have been saved to 'evaluation_bm25_rrf_fusion.xlsx'.")
print("Macro averages for RRF Fusion have been saved to 'macro_averages_rrf_fusion.xlsx'.")



Starting Reciprocal Rank Fusion (RRF) Evaluation...

RRF Fused Evaluation Results:
           Query  Precision    Recall        F1  Precision@3  Precision@6  \
0    love you in        0.0  0.000000  0.000000          0.0          0.0   
1   home tonight        0.3  0.250000  0.272727          0.0          0.0   
2  something new        0.4  0.222222  0.285714          0.0          0.0   
3     i held you        0.1  0.250000  0.142857          0.0          0.0   
4  stars tonight        0.3  1.000000  0.461538          0.0          0.0   

   Precision@10  Average Precision  
0           0.0           0.001525  
1           0.0           0.215311  
2           0.0           0.075769  
3           0.0           0.032119  
4           0.0           0.261905  

Macro Averages for RRF Fusion:
              Metric  Macro Average      N-gram
0          Precision       0.168000  RRF Fusion
1             Recall       0.344270  RRF Fusion
2                 F1       0.207371  RRF Fusion
3      

## Complex

In [None]:
import pandas as pd
import numpy as np
import re
import warnings
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import defaultdict
import math

# Silence every warning raised for the rest of the session.
warnings.filterwarnings("ignore")
# Fetch the NLTK 'punkt' / 'punkt_tab' packages (presumably the tokenizer
# data that word_tokenize relies on -- confirm against the NLTK version in use).
nltk.download('punkt')
nltk.download('punkt_tab')

# Song table; the columns read later in this file are 'lyrics', 'title',
# 'artist', 'album', 'year' and 'id'.
lyrics_df = pd.read_excel('/content/databersih.xlsx')
# Ground truth for the "complex" queries; the columns read later are
# 'query', 'song_id' and 'total'.
ground_truth_df = pd.read_excel('/content/GT UAS NLP_complex.xlsx')

# Shared stemmer used for both lyrics and queries.
stemmer = PorterStemmer()

# Tokens dropped (before stemming) from lyrics and queries alike.
remove_words = ["song", "with", "lyrics", "from", "the", "album", "released",
               "in", "before", "after", "since", "s"]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def clean_lyrics(text):
    """Normalise one lyrics string into a space-separated list of stems.

    Square-bracketed spans, parenthesised spans, punctuation and digits are
    removed (in that order), the text is lower-cased and tokenised, tokens
    listed in ``remove_words`` are dropped and the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Non-string input (for example a missing cell) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Order matters: bracketed spans must go before the punctuation pass
    # strips the brackets themselves.
    for pattern in (r'\[.*?\]', r'\(.*?\)', r'[^\w\s]', r'\d+'):
        text = re.sub(pattern, '', text)

    stems = []
    for token in word_tokenize(text.lower()):
        if token not in remove_words:
            stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Clean every song's lyrics once; this column is the retrieval corpus.
lyrics_df['Processed_Lyrics'] = lyrics_df['lyrics'].apply(clean_lyrics)

# Quick visual sanity check of the cleaned text.
print("\nCleaned Lyrics Sample:")
print(lyrics_df[['title', 'Processed_Lyrics']].head())

def parse_song_ids(song_id_entry):
    """Turn one ground-truth ``song_id`` cell into a list of integer ids.

    Accepted cell contents:

    * missing value (``None`` / NaN)      -> ``[]``
    * integer (Python or NumPy)           -> ``[id]``
    * float with an integral value        -> ``[id]``  (e.g. ``129.0``)
    * anything else is converted to ``str`` and split on commas; pieces that
      are not purely digits are ignored.

    Returns:
        list[int]: the parsed ids, in the order they appear in the cell.
    """
    if pd.isnull(song_id_entry):
        return []
    if isinstance(song_id_entry, (int, np.integer)):
        return [int(song_id_entry)]
    if isinstance(song_id_entry, (float, np.floating)):
        # A numeric column that contains blanks is read as float, so a single
        # id arrives as e.g. 129.0. Going through str() would give "129.0",
        # which fails isdigit() and silently loses the id.
        value = float(song_id_entry)
        return [int(value)] if value.is_integer() else []

    pieces = (piece.strip() for piece in str(song_id_entry).split(','))
    return [int(piece) for piece in pieces if piece.isdigit()]

# Parse the raw 'song_id' cells into lists of integer ids.
ground_truth_df['Relevant_Song_IDs'] = ground_truth_df['song_id'].apply(parse_song_ids)
# Drop queries that ended up with no relevant song at all; they cannot be evaluated.
ground_truth_df = ground_truth_df[ground_truth_df['Relevant_Song_IDs'].map(len) > 0].reset_index(drop=True)
print("\nParsed Ground Truth:")
print(ground_truth_df[['query', 'Relevant_Song_IDs', 'total']].head())


Cleaned Lyrics Sample:
                   title                                   Processed_Lyrics
0      Chasing Pavements  ive made up my mind dont need to think it over...
1          Cold Shoulder  you say it all my head and thing i think just ...
2         Hometown Glory  ive been walk same way as i did miss out crack...
3  Make You Feel My Love  when rain is blow your face and whole world is...
4                My Same  aye aye ayeay aye aye ayeay aye aye ayeay aye ...

Parsed Ground Truth:
                                               query Relevant_Song_IDs  total
0               Adele's song with lyrics love you in            [1433]      1
1    Billie Eilish's song with lyrics stick together             [227]      1
2              Bruno Mars' song with lyrics kiss you            [2129]      1
3    Jennifer Lopez's song with lyrics you feel left             [788]      1
4  Song released in 2024 with lyrics hopeless rom...            [3019]      1


### BM25 Implementation


In [None]:
class BM25:
    """Okapi BM25 scorer over a corpus of whitespace-tokenised documents.

    The inverse document frequency is
    ``log((N - df + 0.5) / (df + 0.5) + 1)`` and a term's contribution is
    ``idf * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))``.
    No additional lower-bound shift (as used by BM25L / BM25+) is applied.

    Parameters:
    - corpus: sequence of documents, each a string of space-separated tokens.
      Entries that are not strings are treated as empty documents.
    - k1: term-frequency saturation parameter.
    - b: document-length normalisation strength.

    Attributes:
    - corpus, k1, b: the constructor arguments.
    - doc_len: token count of every document.
    - avgdl: mean of ``doc_len`` (0.0 for an empty corpus).
    - term_freq: one ``{term: count}`` dict per document.
    - doc_freq: ``{term: number of documents containing it}``.
    - idf: ``{term: inverse document frequency}``.
    """

    def __init__(self, corpus, k1=1.5, b=0.75):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        # Tokenise each document exactly once. Non-string entries (e.g. NaN
        # from a missing lyrics cell) become empty documents instead of
        # crashing on ``.split()``.
        self._tokens = [doc.split() if isinstance(doc, str) else [] for doc in corpus]
        self.doc_len = [len(tokens) for tokens in self._tokens]
        # Guard the empty corpus, which used to raise ZeroDivisionError.
        self.avgdl = sum(self.doc_len) / len(self.doc_len) if self.doc_len else 0.0
        self.term_freq = []
        self.doc_freq = defaultdict(int)
        self.idf = {}
        self._initialize()

    def _initialize(self):
        """Fill ``term_freq``, ``doc_freq`` and ``idf`` from the tokenised corpus."""
        for words in self._tokens:
            tf = {}
            for w in words:
                tf[w] = tf.get(w, 0) + 1
            self.term_freq.append(tf)
            # Each distinct term counts once per document.
            for word in tf:
                self.doc_freq[word] += 1

        N = len(self._tokens)
        for word, df in self.doc_freq.items():
            self.idf[word] = math.log((N - df + 0.5) / (df + 0.5) + 1)

    def _score_terms(self, words, doc_index):
        """Score an already tokenised query against one document."""
        tf = self.term_freq[doc_index]
        dl = self.doc_len[doc_index]
        # avgdl is 0 only when every document is empty; nothing can match then.
        length_ratio = dl / self.avgdl if self.avgdl else 0.0
        # Length normalisation depends on the document only, so compute it once.
        norm = self.k1 * (1 - self.b + self.b * length_ratio)

        score = 0.0
        for w in words:
            f = tf.get(w, 0)
            # Absent terms contribute exactly 0; skipping them also avoids a
            # 0/0 when k1 == 0.
            if f and w in self.idf:
                score += self.idf[w] * ((f * (self.k1 + 1)) / (f + norm))
        return score

    def score(self, query, doc_index):
        """Return the BM25 score of ``query`` for the document at ``doc_index``.

        ``query`` is split on whitespace; repeated query terms are counted
        once per occurrence. Terms unknown to the corpus are ignored.
        """
        return self._score_terms(query.split(), doc_index)

    def get_scores(self, query):
        """Return a list with the score of ``query`` for every document, in corpus order."""
        words = query.split()  # split once, not once per document
        return [self._score_terms(words, i) for i in range(len(self.term_freq))]


### Preprocessing Queries


In [None]:
def preprocess_query(query, lyrics_df):
    """Reduce a free-text query to the stemmed keywords left after removing metadata.

    Every artist name, album name and year present in ``lyrics_df`` is removed
    from the query as a whole-word match (names case-insensitively). The rest
    is stripped of punctuation and digits, lower-cased, tokenised, filtered
    through ``remove_words`` and stemmed with the module-level ``stemmer``.

    Parameters:
    - query: the raw query; non-string values are returned unchanged.
    - lyrics_df: DataFrame providing the 'artist', 'album' and 'year' columns.

    Returns:
    - The processed query as a space-separated string of stems.
    """
    if not isinstance(query, str):
        return query

    def _known_names(column):
        # Missing cells would make re.escape raise TypeError, and blank names
        # would delete the spaces between words, so both are skipped.
        names = lyrics_df[column].dropna().astype(str).str.lower().unique()
        return [name for name in names if name.strip()]

    for artist in _known_names('artist'):
        query = re.sub(r'\b' + re.escape(artist) + r'\b', '', query, flags=re.IGNORECASE)
    for album in _known_names('album'):
        query = re.sub(r'\b' + re.escape(album) + r'\b', '', query, flags=re.IGNORECASE)
    # dropna: a missing year would otherwise become the pattern \bnan\b and
    # remove the literal word "nan" from queries.
    for year in lyrics_df['year'].dropna().unique():
        query = re.sub(r'\b' + re.escape(str(year)) + r'\b', '', query)

    query = re.sub(r'[^\w\s]', '', query)
    query = re.sub(r'\d+', '', query)
    query = query.lower()
    tokens = word_tokenize(query)
    tokens = [stemmer.stem(word) for word in tokens if word not in remove_words]
    return ' '.join(tokens)

# Store the cleaned form of every ground-truth query; the retrieval loop reads this column.
ground_truth_df['Processed_Query'] = ground_truth_df['query'].apply(lambda q: preprocess_query(q, lyrics_df))


### Helper Functions for N-grams and Evaluation


In [None]:
def generate_ngrams(query, n):
    """Return every run of ``n`` consecutive whitespace-separated words of ``query``.

    Each n-gram is returned as a single space-joined string, in order of
    appearance. The result is empty when the query has fewer than ``n`` words.
    """
    tokens = query.split()
    grams = []
    for start in range(len(tokens) - n + 1):
        grams.append(' '.join(tokens[start:start + n]))
    return grams

def precision_at_k(y_true, y_pred, k):
    """Fraction of relevant items among the ``k`` highest-scored ones.

    ``y_true`` holds 0/1 relevance labels and ``y_pred`` the matching scores,
    position by position. The hit count is always divided by ``k`` itself,
    even when fewer than ``k`` items exist.
    """
    labels = np.array(y_true)
    scores = np.array(y_pred)
    # Highest score first, then keep the first k positions.
    best_first = np.argsort(scores)[::-1]
    hits = labels[best_first[:k]]
    return np.sum(hits) / k

def average_precision_func(y_true, y_pred):
    """Average precision of the ranking obtained by sorting ``y_pred`` descending.

    ``y_true`` holds 0/1 relevance labels aligned with the scores in
    ``y_pred``. The precision at each relevant position is summed and divided
    by the total number of relevant labels; 0 is returned when there are none.
    """
    labels = np.array(y_true)
    scores = np.array(y_pred)

    ranked_labels = labels[np.argsort(scores)[::-1]]

    hits = 0
    running_total = 0
    for position, label in enumerate(ranked_labels, start=1):
        if label == 1:
            hits += 1
            running_total += hits / position

    total_relevant = np.sum(labels)
    if total_relevant > 0:
        return running_total / total_relevant
    return 0

### RRF Fusion Function


In [None]:
def rrf_fusion(ranked_lists, rrf_k=60):
    """
    Merge several rankings into one with Reciprocal Rank Fusion (RRF).

    A document at zero-based position ``rank`` in a list earns
    ``1 / (rrf_k + rank + 1)``; its contributions from all lists are summed.

    Parameters:
    - ranked_lists: List of lists, each holding document IDs ordered best first.
    - rrf_k: Constant added to every rank; larger values flatten the influence of rank.

    Returns:
    - List of document IDs sorted by decreasing fused score. Documents with
      equal scores keep the order in which they were first encountered.
    """
    totals = {}
    for ranking in ranked_lists:
        for rank, doc_id in enumerate(ranking):
            totals[doc_id] = totals.get(doc_id, 0.0) + 1.0 / (rrf_k + rank + 1)
    # sorted() is stable, so ties retain dict insertion order.
    return sorted(totals, key=totals.get, reverse=True)


### Main Retrieval and Evaluation


In [None]:
# Build the BM25 index over the cleaned lyrics (one document per song).
corpus = lyrics_df['Processed_Lyrics'].tolist()
bm25 = BM25(corpus)

# Length of the ranked list kept per query and n-gram size. The name was read
# below without ever being assigned (NameError on a fresh kernel); the RRF
# evaluation cell reads it as well.
top_k = 10

song_id_to_index = {song_id: idx for idx, song_id in enumerate(lyrics_df['id'])}
# Song ids in corpus order, so relevance labels come from one vectorised
# membership test instead of a per-row ``iloc`` lookup for every query.
corpus_song_ids = lyrics_df['id'].to_numpy()

metric_names = [
    "Precision",
    "Recall",
    "F1-Score",
    "Precision@3",
    "Precision@6",
    "Precision@10",
    "Average Precision",
]

all_ngrams_results = []

# query text -> n-gram size -> ids of the top_k songs, best first.
query_retrievals = defaultdict(lambda: defaultdict(list))

for n in range(1, 6):
    print(f"Evaluating {n}-gram approach...")
    evaluation_results = []

    for _, row in ground_truth_df.iterrows():
        query = row['Processed_Query']
        # This assignment was missing here: the labels below were built from
        # whatever ``relevant_ids`` happened to be left in the kernel (or the
        # loop failed with NameError on a fresh one).
        relevant_ids = row['Relevant_Song_IDs']

        ngrams = generate_ngrams(query, n)
        if not ngrams:
            # The query has fewer than n tokens: record an all-zero row and
            # store no ranking for this n.
            evaluation_results.append({
                'Query': row['query'],
                'N-gram': n,
                **dict.fromkeys(metric_names, 0.0),
            })
            continue

        # A document's score is the sum of its BM25 scores over all n-grams.
        doc_scores = np.zeros(len(corpus))
        for ng in ngrams:
            doc_scores += np.array(bm25.get_scores(ng))

        y_true = np.isin(corpus_song_ids, list(relevant_ids)).astype(int)
        y_pred = doc_scores

        # NOTE(review): thresholding at the median labels at least half of the
        # corpus as "retrieved", which drives Precision/F1 towards zero and
        # Recall towards one. Kept as-is so reported numbers stay comparable.
        threshold = np.median(y_pred)
        binary_pred = y_pred >= threshold

        evaluation_results.append({
            'Query': row['query'],
            'N-gram': n,
            'Precision': precision_score(y_true, binary_pred, zero_division=0),
            'Recall': recall_score(y_true, binary_pred, zero_division=0),
            'F1-Score': f1_score(y_true, binary_pred, zero_division=0),
            'Precision@3': precision_at_k(y_true, y_pred, k=3),
            'Precision@6': precision_at_k(y_true, y_pred, k=6),
            'Precision@10': precision_at_k(y_true, y_pred, k=10),
            'Average Precision': average_precision_func(y_true, y_pred)
        })

        # Remember the top_k songs (best first) for the later rank fusion.
        sorted_doc_indices = np.argsort(doc_scores)[-top_k:][::-1]
        retrieved_docs = lyrics_df.iloc[sorted_doc_indices]['id'].tolist()
        query_retrievals[row['query']][n] = retrieved_docs

    evaluation_df = pd.DataFrame(evaluation_results)
    print(evaluation_df.head())

    # Macro average: unweighted mean of each metric over all queries.
    macro_df = pd.DataFrame({
        "Metric": metric_names,
        "Macro Average": [evaluation_df[name].mean() for name in metric_names],
    })
    macro_df['N-gram'] = n
    print(f"\nMacro Averages for {n}-grams:")
    print(macro_df)

    evaluation_df.to_excel(f'evaluation_bm25_{n}gram.xlsx', index=False)
    macro_df.to_excel(f'macro_averages_{n}gram.xlsx', index=False)
    print(f"\nSaved evaluation_bm25_{n}gram.xlsx and macro_averages_{n}gram.xlsx")

    all_ngrams_results.append(macro_df)

combined_macro = pd.concat(all_ngrams_results, ignore_index=True)
combined_macro.to_excel('combined_macro_averages_all_ngrams.xlsx', index=False)
print("\nAll combined macro averages have been saved to 'combined_macro_averages_all_ngrams.xlsx'.")


Evaluating 1-gram approach...
                                               Query  N-gram  Precision  \
0               Adele's song with lyrics love you in       1   0.000407   
1    Billie Eilish's song with lyrics stick together       1   0.000204   
2              Bruno Mars' song with lyrics kiss you       1   0.000407   
3    Jennifer Lopez's song with lyrics you feel left       1   0.000407   
4  Song released in 2024 with lyrics hopeless rom...       1   0.000204   

   Recall  F1-Score  Precision@3  Precision@6  Precision@10  Average Precision  
0     1.0  0.000814     0.000000     0.000000           0.0           0.000602  
1     1.0  0.000407     0.000000     0.166667           0.1           0.250000  
2     1.0  0.000814     0.000000     0.000000           0.0           0.029412  
3     1.0  0.000814     0.000000     0.000000           0.0           0.006289  
4     1.0  0.000407     0.333333     0.166667           0.1           0.333333  

Macro Averages for 1-grams:
    

In [None]:
print("\nStarting Reciprocal Rank Fusion (RRF) Evaluation...")

# Number of fused documents treated as "retrieved" for Precision/Recall/F1.
# Must equal the list length used when the per-n-gram rankings were stored.
top_k = 10

# Song ids in corpus order, for vectorised label construction.
corpus_song_ids = lyrics_df['id'].to_numpy()

rrf_metric_names = [
    "Precision",
    "Recall",
    "F1",
    "Precision@3",
    "Precision@6",
    "Precision@10",
    "Average Precision",
]

rrf_evaluations = []

for _, row in ground_truth_df.iterrows():
    original_query = row['query']
    relevant_ids = set(row['Relevant_Song_IDs'])

    # One ranking per n-gram size; sizes skipped by the retrieval loop yield [].
    ranked_lists = [query_retrievals[original_query].get(n, []) for n in range(1, 6)]

    fused_docs = rrf_fusion(ranked_lists, rrf_k=60)
    retrieved = set(fused_docs[:top_k])

    y_true = np.isin(corpus_song_ids, list(relevant_ids)).astype(int)
    y_pred = np.isin(corpus_song_ids, list(retrieved)).astype(int)

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Rank-based metrics are computed directly on the fused ranking.
    # Previously the song ids themselves were handed to precision_at_k as if
    # they were scores (and with a different length than y_true), and the AP
    # scores grew with rank, i.e. the ranking was evaluated upside down.
    hits = [1 if doc_id in relevant_ids else 0 for doc_id in fused_docs]
    p_at_3 = sum(hits[:3]) / 3
    p_at_6 = sum(hits[:6]) / 6
    p_at_10 = sum(hits[:10]) / 10

    # Average precision over the fused list, normalised by the number of
    # relevant songs present in the corpus; relevant songs that were never
    # retrieved contribute zero.
    found = 0
    precision_sum = 0.0
    for rank, hit in enumerate(hits, start=1):
        if hit:
            found += 1
            precision_sum += found / rank
    n_relevant = int(y_true.sum())
    ap = precision_sum / n_relevant if n_relevant > 0 else 0.0

    rrf_evaluations.append({
        'Query': original_query,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'Precision@3': p_at_3,
        'Precision@6': p_at_6,
        'Precision@10': p_at_10,
        'Average Precision': ap
    })

rrf_df = pd.DataFrame(rrf_evaluations)
print("\nRRF Fused Evaluation Results:")
print(rrf_df.head())

# Macro average: unweighted mean of each metric over all queries.
rrf_macro_df = pd.DataFrame({
    "Metric": rrf_metric_names,
    "Macro Average": [rrf_df[name].mean() for name in rrf_metric_names],
})
rrf_macro_df['N-gram'] = 'RRF Fusion'
print("\nMacro Averages for RRF Fusion:")
print(rrf_macro_df)

rrf_df.to_excel('evaluation_bm25_rrf_fusion.xlsx', index=False)
rrf_macro_df.to_excel('macro_averages_rrf_fusion.xlsx', index=False)
print("\nRRF Fused evaluation results have been saved to 'evaluation_bm25_rrf_fusion.xlsx'.")
print("Macro averages for RRF Fusion have been saved to 'macro_averages_rrf_fusion.xlsx'.")



Starting Reciprocal Rank Fusion (RRF) Evaluation...

RRF Fused Evaluation Results:
                                               Query  Precision  Recall  \
0               Adele's song with lyrics love you in        0.0     0.0   
1    Billie Eilish's song with lyrics stick together        0.1     1.0   
2              Bruno Mars' song with lyrics kiss you        0.0     0.0   
3    Jennifer Lopez's song with lyrics you feel left        0.0     0.0   
4  Song released in 2024 with lyrics hopeless rom...        0.1     1.0   

         F1  Precision@3  Precision@6  Precision@10  Average Precision  
0  0.000000          0.0          0.0           0.0           0.001639  
1  0.181818          0.0          0.0           0.0           0.142857  
2  0.000000          0.0          0.0           0.0           0.000880  
3  0.000000          0.0          0.0           0.0           0.000419  
4  0.181818          0.0          0.0           0.0           0.125000  

Macro Averages for RRF Fus