In [None]:
import sys
import itertools
from pathlib import Path
from joblib import Parallel, delayed

# Make the directory two levels above the notebook importable so that the
# `src.*` imports below resolve. Forward slashes are used on purpose: they are
# accepted as path separators on every platform, whereas a backslash inside a
# normal string literal is parsed as the start of an escape sequence.
sys.path.append("../..")

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc

from src.data.datasets.overlaps import OverlapsDataset
from src.data.matchers.smith_waterman import SmithWatermanMelodyMatcher
from src.data.structures.melody import Melody

In [2]:
def calculate_levenshtein_distances(pairs: list[tuple[Path, Path]], params: dict) -> list[float]:
    """Compute one matcher distance per pair of MIDI files, in parallel.

    For every pair, both files are loaded with ``Melody.from_midi``, a
    ``SmithWatermanMelodyMatcher`` is built from the two melodies,
    ``find_patterns`` is run with ``params`` and the value of
    ``overlaps_levenshtein_distance()`` is collected.

    :param list[tuple[Path, Path]] pairs: Pairs of MIDI file paths to compare
    :param dict params: Keyword arguments forwarded to ``find_patterns``
    :return list[float]: One distance per input pair, in input order
    """

    def _pair_distance(pair: tuple[Path, Path]) -> float:
        # Runs inside a worker process: load, match, measure.
        first_midi, second_midi = pair
        first_melody = Melody.from_midi(first_midi)
        second_melody = Melody.from_midi(second_midi)

        pair_matcher = SmithWatermanMelodyMatcher(first_melody, second_melody)
        pair_matcher.find_patterns(**params)
        return pair_matcher.overlaps_levenshtein_distance()

    # n_jobs=-1 uses every available core; results come back in submission order.
    jobs = (delayed(_pair_distance)(pair) for pair in pairs)
    return Parallel(n_jobs=-1, backend='loky')(jobs)

In [3]:
def get_roc_curve(overlaping_distances: list[float], non_overlaping_distances: list[float]) -> tuple[np.ndarray, np.ndarray]:
    """Build the ROC curve separating overlapping from non-overlapping pairs.

    Overlapping pairs are labelled 1 and non-overlapping pairs 0. The
    distances are negated before being used as scores, so that a smaller
    distance ranks as "more positive" for ``roc_curve``.

    :param list[float] overlaping_distances: Distances of the positive (overlapping) pairs
    :param list[float] non_overlaping_distances: Distances of the negative (non-overlapping) pairs
    :return tuple[np.ndarray, np.ndarray]: False-positive rates and true-positive rates
    """
    positive_count = len(overlaping_distances)
    negative_count = len(non_overlaping_distances)

    # Labels follow the same order as the scores: positives first, then negatives.
    labels = np.concatenate([np.ones(positive_count), np.zeros(negative_count)])

    all_distances = np.concatenate([overlaping_distances, non_overlaping_distances])
    scores = -all_distances

    false_positive_rate, true_positive_rate, _thresholds = roc_curve(labels, scores)
    return false_positive_rate, true_positive_rate

In [4]:
def evaluate_params(params: dict, dataset: OverlapsDataset) -> dict:
    """Score a single parameter set by its ROC AUC on the dataset.

    Distances are computed for the dataset's overlapping and non-overlapping
    pairs, turned into a ROC curve and reduced to the area under it. Any
    exception raised along the way is reported on stdout and the AUC is
    recorded as NaN, so one failing combination does not abort a whole search.

    :param dict params: Parameters to evaluate
    :param OverlapsDataset dataset: Dataset providing the labelled pairs
    :return dict: The given parameters plus an ``'auc'`` entry
    """
    auc_val = float('nan')  # stays NaN if anything below fails

    try:
        positive_distances = calculate_levenshtein_distances(dataset.overlapping_pairs, params)
        negative_distances = calculate_levenshtein_distances(dataset.non_overlapping_pairs, params)

        fpr, tpr = get_roc_curve(positive_distances, negative_distances)
        auc_val = auc(fpr, tpr)
    except Exception as e:
        print(f"Error with params {params}: {str(e)}")

    return {**params, 'auc': auc_val}



In [5]:
def grid_search(
    dataset: "OverlapsDataset",
    param_combinations: list[dict],
    checkpoint_path: "str | Path | None" = 'grid_search_checkpoint1.csv',
) -> pd.DataFrame:
    """Evaluate every parameter combination and rank them by AUC.

    After each evaluation all results gathered so far are written to
    ``checkpoint_path`` as CSV, so partial results survive an interrupted run.

    :param OverlapsDataset dataset: Dataset passed on to ``evaluate_params``
    :param list[dict] param_combinations: Parameter sets to evaluate
    :param checkpoint_path: CSV file rewritten after every evaluation;
        pass ``None`` to disable checkpointing
    :return pd.DataFrame: One row per combination, sorted by ``'auc'`` in
        descending order (NaN scores are placed last by pandas). When there
        is nothing to evaluate, an empty frame with an ``'auc'`` column.
    """
    # Materialise once: gives tqdm a known total and lets us detect "nothing to do".
    combinations = list(param_combinations)
    if not combinations:
        # Sorting a column-less frame by 'auc' would raise KeyError.
        return pd.DataFrame(columns=['auc'])

    results = []

    for params in tqdm(combinations, desc="Grid Search"):
        results.append(evaluate_params(params, dataset))
        if checkpoint_path is not None:
            pd.DataFrame(results).to_csv(checkpoint_path, index=False)

    return pd.DataFrame(results).sort_values('auc', ascending=False)

In [None]:
# Search space: each key is a keyword argument name and each list holds the
# candidate values to try for it (4 * 3 * 4 * 4 * 3 * 3 = 1728 combinations).
param_grid = dict(
    threshold_ratio=[0.3, 0.5, 0.7, 0.9],
    min_length=[7, 9, 11],
    gap_penalty=[-3, -2, -1, 0],
    mismatch_score=[-3, -2, -1, 0],
    match_score=[2, 3, 1],
    tolerance=[0, 1, 2],
)


In [7]:
# Expand the grid into one dict per point of the Cartesian product, keeping
# the key order of `param_grid`.
param_combinations = [
    {name: value for name, value in zip(param_grid, combination)}
    for combination in itertools.product(*param_grid.values())
]

In [8]:
# Load the overlap dataset from the PlagiNet directory; the path is relative
# to the notebook's working directory.
# NOTE(review): preprocess_data=False presumably skips a preprocessing step and
# expects ready-to-use data on disk — confirm against OverlapsDataset.from_path.
dataset = OverlapsDataset.from_path("../../datasets/PlagiNet/", preprocess_data=False)

In [None]:
# Run the search over every combination. As the last expression of the cell,
# the returned DataFrame (sorted by AUC, best first) is shown as the cell output.
grid_search(dataset, param_combinations)