In [None]:
# Templates de Code MySpotify - Prêts à l'emploi

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import logging

# Configuration logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataLoader:
    """Chargeur de données optimisé pour MSD"""
    
    def __init__(self, data_path):
        self.data_path = data_path
    
    def load_triplets(self, sample_frac=None):
        """Charge les triplets avec option d'échantillonnage"""
        logger.info("Chargement des triplets...")
        
        # Chargement par chunks pour gros fichiers
        chunks = []
        chunksize = 100000
        
        for chunk in pd.read_csv(
            f"{self.data_path}/train_triplets.txt", 
            sep='\t', 
            names=['user_id', 'song_id', 'play_count'],
            chunksize=chunksize
        ):
            if sample_frac:
                chunk = chunk.sample(frac=sample_frac)
            chunks.append(chunk)
        
        df = pd.concat(chunks, ignore_index=True)
        logger.info(f"Triplets chargés: {len(df):,} interactions")
        return df
    
    def load_tracks(self):
        """Charge le mapping tracks"""
        logger.info("Chargement du mapping tracks...")
        
        df = pd.read_csv(
            f"{self.data_path}/p02_unique_tracks.txt",
            sep='<SEP>',
            names=['track_id', 'song_id', 'artist', 'title'],
            engine='python'
        )
        logger.info(f"Tracks chargés: {len(df):,}")
        return df
    
    def load_lyrics(self):
        """Charge les données de paroles musiXmatch"""
        logger.info("Chargement des paroles...")
        
        lyrics_data = []
        word_mapping = {}
        
        with open(f"{self.data_path}/mxm_dataset_train.txt", 'r') as f:
            for line_num, line in enumerate(f):
                line = line.strip()
                
                # Skip comments
                if line.startswith('#'):
                    continue
                
                # Parse word mapping
                if line.startswith('%'):
                    words = line[1:].split(',')
                    word_mapping = {i+1: word for i, word in enumerate(words)}
                    continue
                
                # Parse song data
                parts = line.split(',')
                if len(parts) < 2:
                    continue
                
                track_id = parts[0]
                mxm_track_id = parts[1]
                
                # Parse word counts (sparse format)
                word_counts = {}
                for part in parts[2:]:
                    if ':' in part:
                        try:
                            word_idx, count = part.split(':')
                            word_counts[int(word_idx)] = int(count)
                        except ValueError:
                            continue
                
                lyrics_data.append({
                    'track_id': track_id,
                    'mxm_track_id': mxm_track_id,
                    'word_counts': word_counts
                })
        
        df = pd.DataFrame(lyrics_data)
        logger.info(f"Paroles chargées: {len(df):,} tracks")
        return df, word_mapping
    
    def load_genres(self):
        """Charge les annotations de genres"""
        logger.info("Chargement des genres...")
        
        df = pd.read_csv(
            f"{self.data_path}/p02_msd_tagtraum_cd2.cls",
            sep='\t',
            names=['track_id', 'majority_genre', 'minority_genre'],
            comment='#'
        )
        logger.info(f"Genres chargés: {len(df):,}")
        return df

class QuickStartRecommender:
    """Version simplifiée pour démarrage rapide"""
    
    def __init__(self, data_path):
        self.data_loader = DataLoader(data_path)
        self.triplets_df = None
        self.tracks_df = None
        self.user_to_idx = {}
        self.song_to_idx = {}
        self.user_item_matrix = None
        
    def quick_setup(self, sample_frac=0.1):
        """Setup rapide avec échantillon de données"""
        logger.info(f"Setup rapide avec {sample_frac*100}% des données")
        
        # Chargement
        self.triplets_df = self.data_loader.load_triplets(sample_frac)
        self.tracks_df = self.data_loader.load_tracks()
        
        # Filtrage pour cohérence
        valid_songs = set(self.tracks_df['song_id'])
        self.triplets_df = self.triplets_df[
            self.triplets_df['song_id'].isin(valid_songs)
        ]
        
        # Mappings
        unique_users = self.triplets_df['user_id'].unique()
        unique_songs = self.triplets_df['song_id'].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        self.song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        
        # Matrice user-item
        user_indices = self.triplets_df['user_id'].map(self.user_to_idx)
        song_indices = self.triplets_df['song_id'].map(self.song_to_idx)
        play_counts = self.triplets_df['play_count']
        
        self.user_item_matrix = csr_matrix(
            (play_counts, (user_indices, song_indices)),
            shape=(len(self.user_to_idx), len(self.song_to_idx))
        )
        
        logger.info("Setup terminé!")
        logger.info(f"Utilisateurs: {len(self.user_to_idx):,}")
        logger.info(f"Chansons: {len(self.song_to_idx):,}")
        logger.info(f"Sparsité: {(1 - self.user_item_matrix.nnz / np.prod(self.user_item_matrix.shape))*100:.2f}%")
    
    def get_top_tracks(self, n=250):
        """TOP-250 TRACKS - Version simple"""
        popularity = self.triplets_df.groupby('song_id')['play_count'].sum()
        top_songs = popularity.nlargest(n)
        
        results = []
        for i, (song_id, play_count) in enumerate(top_songs.items(), 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                results.append({
                    'index': i,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'play_count': int(play_count)
                })
        
        return pd.DataFrame(results)
    
    def simple_user_recommendations(self, user_id, n=10):
        """Recommandations utilisateur simplifiées"""
        if user_id not in self.user_to_idx:
            logger.warning(f"Utilisateur {user_id} non trouvé")
            return pd.DataFrame()
        
        user_idx = self.user_to_idx[user_id]
        user_vector = self.user_item_matrix[user_idx]
        
        # Calcul simple de similarité avec quelques utilisateurs
        sample_size = min(1000, self.user_item_matrix.shape[0])
        sample_indices = np.random.choice(
            self.user_item_matrix.shape[0], 
            sample_size, 
            replace=False
        )
        sample_matrix = self.user_item_matrix[sample_indices]
        
        similarities = cosine_similarity(user_vector, sample_matrix).flatten()
        
        # Agréger les recommandations
        recommendations = np.zeros(self.user_item_matrix.shape[1])
        user_items = user_vector.toarray().flatten()
        
        for i, sim_idx in enumerate(sample_indices):
            if sim_idx == user_idx:
                continue
            
            similarity = similarities[i]
            similar_user_items = self.user_item_matrix[sim_idx].toarray().flatten()
            
            # Items non écoutés par l'utilisateur cible
            unseen_mask = (user_items == 0) & (similar_user_items > 0)
            recommendations[unseen_mask] += similarity * similar_user_items[unseen_mask]
        
        # Top recommandations
        top_indices = np.argsort(recommendations)[::-1][:n]
        
        # Formatage des résultats
        results = []
        idx_to_song = {v: k for k, v in self.song_to_idx.items()}
        
        for i, song_idx in enumerate(top_indices, 1):
            if recommendations[song_idx] > 0:
                song_id = idx_to_song[song_idx]
                track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                if not track_info.empty:
                    track_info = track_info.iloc[0]
                    results.append({
                        'index': i,
                        'artist': track_info['artist'],
                        'title': track_info['title'],
                        'score': recommendations[song_idx]
                    })
        
        return pd.DataFrame(results)

def evaluate_precision_at_k(true_items, recommended_items, k=10):
    """Calcule Precision@k simple"""
    if len(true_items) == 0:
        return 0.0
    
    recommended_k = set(recommended_items[:k])
    true_set = set(true_items)
    
    hits = len(recommended_k.intersection(true_set))
    return hits / k

# EXEMPLE D'UTILISATION
if __name__ == "__main__":
    # Setup rapide
    recommender = QuickStartRecommender("../data")
    recommender.quick_setup(sample_frac=0.05)  # 5% des données pour test
    
    # Test Top-250
    top_tracks = recommender.get_top_tracks(10)
    print("TOP-10 TRACKS:")
    print(top_tracks)
    print()
    
    # Test recommandations utilisateur
    sample_user = list(recommender.user_to_idx.keys())[0]
    user_recs = recommender.simple_user_recommendations(sample_user, n=5)
    print(f"RECOMMANDATIONS POUR {sample_user}:")
    print(user_recs)

# TEMPLATE POUR COLLECTIONS THÉMATIQUES
class ThematicCollectionTemplate:
    """Template pour collections thématiques"""
    
    def __init__(self, lyrics_df, word_mapping, tracks_df, triplets_df):
        self.lyrics_df = lyrics_df
        self.word_mapping = word_mapping
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
        
        # Index inversé mot -> index
        self.word_to_idx = {word: idx for idx, word in word_mapping.items()}
    
    def get_love_songs(self, n=50):
        """Collection de chansons d'amour - méthode baseline"""
        love_keywords = ['love', 'heart', 'kiss', 'babi', 'darlin', 'honey']
        
        song_scores = defaultdict(int)
        
        for _, row in self.lyrics_df.iterrows():
            track_id = row['track_id']
            word_counts = row['word_counts']
            
            # Score basé sur les mots-clés d'amour
            love_score = 0
            for keyword in love_keywords:
                if keyword in self.word_to_idx:
                    word_idx = self.word_to_idx[keyword]
                    if word_idx in word_counts:
                        love_score += word_counts[word_idx]
            
            if love_score > 0:
                song_scores[track_id] = love_score
        
        # Tri par score puis popularité
        sorted_tracks = sorted(
            song_scores.items(), 
            key=lambda x: x[1], 
            reverse=True
        )
        
        # Formatage avec info tracks
        results = []
        for i, (track_id, score) in enumerate(sorted_tracks[:n], 1):
            # Mapping track_id -> song_id
            track_match = self.tracks_df[self.tracks_df['track_id'] == track_id]
            if not track_match.empty:
                song_id = track_match.iloc[0]['song_id']
                
                # Popularité
                popularity = self.triplets_df[
                    self.triplets_df['song_id'] == song_id
                ]['play_count'].sum()
                
                results.append({
                    'index': i,
                    'artist': track_match.iloc[0]['artist'],
                    'title': track_match.iloc[0]['title'],
                    'love_score': score,
                    'play_count': popularity
                })
        
        # Tri final par play_count
        results.sort(key=lambda x: x['play_count'], reverse=True)
        return pd.DataFrame(results[:n])

# TEMPLATE TRAIN/TEST SPLIT
class SimpleEvaluator:
    """Evaluateur simple pour démarrer"""
    
    def __init__(self, user_item_matrix, user_to_idx, song_to_idx):
        self.user_item_matrix = user_item_matrix
        self.user_to_idx = user_to_idx
        self.song_to_idx = song_to_idx
    
    def simple_train_test_split(self, test_ratio=0.2):
        """Split simple - masque aléatoirement des interactions"""
        train_matrix = self.user_item_matrix.copy()
        test_interactions = {}
        
        np.random.seed(42)
        
        for user_idx in range(self.user_item_matrix.shape[0]):
            user_songs = self.user_item_matrix[user_idx].nonzero()[1]
            
            if len(user_songs) > 2:  # Au moins 3 chansons
                n_test = max(1, int(len(user_songs) * test_ratio))
                test_songs = np.random.choice(user_songs, n_test, replace=False)
                
                # Masquer dans train
                for song_idx in test_songs:
                    train_matrix[user_idx, song_idx] = 0
                
                # Sauvegarder pour test
                test_interactions[user_idx] = test_songs
        
        train_matrix.eliminate_zeros()
        return train_matrix, test_interactions
    
    def evaluate_recommendations(self, recommendations_dict, test_interactions, k=10):
        """Évalue les recommandations avec P@k"""
        precisions = []
        idx_to_song = {v: k for k, v in self.song_to_idx.items()}
        
        for user_id, recs in recommendations_dict.items():
            if user_id not in self.user_to_idx:
                continue
                
            user_idx = self.user_to_idx[user_id]
            if user_idx not in test_interactions:
                continue
            
            # Items de test pour cet utilisateur
            true_items = test_interactions[user_idx]
            
            # Items recommandés (convertir song_id en indices)
            rec_items = []
            for rec in recs[:k]:
                if 'song_id' in rec and rec['song_id'] in self.song_to_idx:
                    rec_items.append(self.song_to_idx[rec['song_id']])
            
            # Calculer P@k
            hits = len(set(rec_items).intersection(set(true_items)))
            precision = hits / k
            precisions.append(precision)
        
        return np.mean(precisions) if precisions else 0.0

INFO:__main__:Setup rapide avec 5.0% des données
INFO:__main__:Chargement des triplets...
INFO:__main__:Triplets chargés: 2,418,679 interactions
INFO:__main__:Chargement du mapping tracks...
INFO:__main__:Tracks chargés: 1,000,000
INFO:__main__:Setup terminé!
INFO:__main__:Utilisateurs: 753,334
INFO:__main__:Chansons: 205,653
INFO:__main__:Sparsité: 100.00%


TOP-10 TRACKS:
   index                                             artist  \
0      1                                      Dwight Yoakam   
1      2                                              Björk   
2      3                                      Kings Of Leon   
3      4                                           Harmonia   
4      5  Barry Tuckwell/Academy of St Martin-in-the-Fie...   
5      6                             Florence + The Machine   
6      7                                        OneRepublic   
7      8                                   Five Iron Frenzy   
8      9                                           Tub Ring   
9     10                                    Alliance Ethnik   

                                               title  play_count  
0                                     You're The One       36584  
1                                               Undo       32141  
2                                            Revelry       27803  
3                      

In [3]:
# Stratégie de chargement par chunks pour gros fichiers
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

def load_triplets_chunked(filepath, chunksize=100000):
    """Charge les triplets par chunks pour éviter les problèmes mémoire"""
    chunks = []
    for chunk in pd.read_csv(filepath, sep='\t', chunksize=chunksize, 
                             names=['user_id', 'song_id', 'play_count']):
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)

In [4]:
class DataPreprocessor:
    def __init__(self):
        self.user_to_idx = {}
        self.song_to_idx = {}
        self.idx_to_user = {}
        self.idx_to_song = {}
    
    def create_mappings(self, triplets_df):
        """Crée les mappings bidirectionnels user/song ↔ index"""
        unique_users = triplets_df['user_id'].unique()
        unique_songs = triplets_df['song_id'].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        self.song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        self.idx_to_user = {idx: user for user, idx in self.user_to_idx.items()}
        self.idx_to_song = {idx: song for song, idx in self.song_to_idx.items()}

In [5]:
def build_user_item_matrix(self, triplets_df):
    """Construit la matrice user-item sparse"""
    user_indices = triplets_df['user_id'].map(self.user_to_idx)
    song_indices = triplets_df['song_id'].map(self.song_to_idx)
    play_counts = triplets_df['play_count']
    
    matrix = csr_matrix((play_counts, (user_indices, song_indices)),
                       shape=(len(self.user_to_idx), len(self.song_to_idx)))
    return matrix

In [6]:
class PopularityRecommender:
    def __init__(self, tracks_df, triplets_df):
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
    
    def get_top_tracks(self, n=250):
        """Retourne les n tracks les plus populaires"""
        popularity = self.triplets_df.groupby('song_id')['play_count'].sum()
        top_songs = popularity.nlargest(n).index
        
        result = []
        for i, song_id in enumerate(top_songs, 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id].iloc[0]
            result.append({
                'index': i,
                'artist': track_info['artist'],
                'title': track_info['title'],
                'play_count': popularity[song_id]
            })
        
        return pd.DataFrame(result)

In [7]:
class GenreRecommender:
    def __init__(self, tracks_df, triplets_df, genre_df):
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
        self.genre_df = genre_df
    
    def get_top_by_genre(self, genre, n=100):
        """Retourne les n tracks les plus populaires pour un genre"""
        # Filtrer par genre
        genre_tracks = self.genre_df[self.genre_df['majority_genre'] == genre]['track_id']
        
        # Mapper track_id vers song_id
        genre_songs = self.tracks_df[self.tracks_df['track_id'].isin(genre_tracks)]['song_id']
        
        # Calculer popularité dans ce genre
        genre_triplets = self.triplets_df[self.triplets_df['song_id'].isin(genre_songs)]
        popularity = genre_triplets.groupby('song_id')['play_count'].sum()
        
        return self._format_results(popularity.nlargest(n))

In [8]:
class ThematicCollections:
    def __init__(self, lyrics_df, tracks_df, triplets_df):
        self.lyrics_df = lyrics_df
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
        self.word_mapping = self._load_word_mapping()
    
    def _load_word_mapping(self):
        """Charge le mapping des mots du dataset musiXmatch"""
        # Première ligne contient les top words séparés par virgules
        with open('mxm_dataset_train.txt', 'r') as f:
            for line in f:
                if line.startswith('%'):
                    words = line[1:].strip().split(',')
                    return {i+1: word for i, word in enumerate(words)}
    
    def get_thematic_collection(self, theme, n=50, method='baseline'):
        """Retourne une collection thématique"""
        if method == 'baseline':
            return self._baseline_method(theme, n)
        elif method == 'word2vec':
            return self._word2vec_method(theme, n)
        elif method == 'classification':
            return self._classification_method(theme, n)
    
    def _baseline_method(self, theme, n):
        """Méthode baseline : recherche directe du mot-clé"""
        theme_word_idx = None
        for idx, word in self.word_mapping.items():
            if theme.lower() in word.lower():
                theme_word_idx = idx
                break
        
        if not theme_word_idx:
            return pd.DataFrame()  # Mot non trouvé
        
        # Filtrer les chansons contenant ce mot
        theme_songs = []
        for _, row in self.lyrics_df.iterrows():
            word_counts = self._parse_sparse_format(row)
            if theme_word_idx in word_counts:
                theme_songs.append((row['track_id'], word_counts[theme_word_idx]))
        
        # Trier par fréquence du mot-thème puis par popularité
        return self._rank_and_format(theme_songs, n)

In [9]:
class UserBasedCF:
    def __init__(self, user_item_matrix, preprocessor):
        self.user_item_matrix = user_item_matrix
        self.preprocessor = preprocessor
        self.user_similarity_matrix = None
    
    def compute_user_similarity(self, metric='cosine'):
        """Calcule la matrice de similarité entre utilisateurs"""
        from sklearn.metrics.pairwise import cosine_similarity
        
        # Normalisation pour cosine similarity
        normalized_matrix = self.user_item_matrix.copy()
        normalized_matrix.data = normalized_matrix.data / np.sqrt(
            np.array(normalized_matrix.sum(axis=1)).flatten()
        )
        
        self.user_similarity_matrix = cosine_similarity(normalized_matrix)
        return self.user_similarity_matrix
    
    def recommend_for_user(self, user_id, n_recommendations=10, n_neighbors=50):
        """Génère des recommandations pour un utilisateur"""
        if user_id not in self.preprocessor.user_to_idx:
            return pd.DataFrame()  # Utilisateur non trouvé
        
        user_idx = self.preprocessor.user_to_idx[user_id]
        
        # Trouver les utilisateurs similaires
        user_similarities = self.user_similarity_matrix[user_idx]
        similar_users = np.argsort(user_similarities)[::-1][1:n_neighbors+1]
        
        # Calculer les scores de recommandation
        user_items = self.user_item_matrix[user_idx].toarray()[0]
        recommendations = np.zeros(self.user_item_matrix.shape[1])
        
        for similar_user_idx in similar_users:
            similarity = user_similarities[similar_user_idx]
            similar_user_items = self.user_item_matrix[similar_user_idx].toarray()[0]
            
            # Items que l'utilisateur cible n'a pas encore écoutés
            unseen_items = (user_items == 0) & (similar_user_items > 0)
            recommendations[unseen_items] += similarity * similar_user_items[unseen_items]
        
        # Sélectionner top-N recommandations
        top_items = np.argsort(recommendations)[::-1][:n_recommendations]
        
        return self._format_recommendations(top_items, recommendations)

In [10]:
class ItemBasedCF:
    def __init__(self, user_item_matrix, preprocessor):
        self.user_item_matrix = user_item_matrix
        self.preprocessor = preprocessor
        self.item_similarity_matrix = None
    
    def compute_item_similarity(self, metric='cosine'):
        """Calcule la matrice de similarité entre items"""
        from sklearn.metrics.pairwise import cosine_similarity
        
        # Transposer la matrice pour avoir items x users
        item_user_matrix = self.user_item_matrix.T
        self.item_similarity_matrix = cosine_similarity(item_user_matrix)
        return self.item_similarity_matrix
    
    def recommend_for_item(self, song_id, n_recommendations=10):
        """Génère des recommandations pour un item donné"""
        if song_id not in self.preprocessor.song_to_idx:
            return pd.DataFrame()
        
        item_idx = self.preprocessor.song_to_idx[song_id]
        item_similarities = self.item_similarity_matrix[item_idx]
        
        # Exclure l'item lui-même et sélectionner les plus similaires
        similar_items = np.argsort(item_similarities)[::-1][1:n_recommendations+1]
        
        return self._format_item_recommendations(similar_items, item_similarities)

In [None]:
class RecommenderEvaluator:
    def __init__(self, user_item_matrix, test_ratio=0.2):
        self.user_item_matrix = user_item_matrix
        self.test_ratio = test_ratio
        self.train_matrix = None
        self.test_matrix = None
    
    def create_train_test_split(self):
        """Crée un split train/test en masquant aléatoirement des interactions"""
        np.random.seed(42)
        
        # Copier la matrice originale
        train_matrix = self.user_item_matrix.copy()
        test_matrix = self.user_item_matrix.copy()
        test_matrix.data = np.zeros_like(test_matrix.data)
        
        # Pour chaque utilisateur, masquer un pourcentage d'interactions
        for user_idx in range(self.user_item_matrix.shape[0]):
            user_items = self.user_item_matrix[user_idx].nonzero()[1]
            
            if len(user_items) > 1:  # Au moins 2 interactions
                n_test = max(1, int(len(user_items) * self.test_ratio))
                test_items = np.random.choice(user_items, n_test, replace=False)
                
                # Masquer dans train, révéler dans test
                for item_idx in test_items:
                    test_matrix[user_idx, item_idx] = train_matrix[user_idx, item_idx]
                    train_matrix[user_idx, item_idx] = 0
        
        self.train_matrix = train_matrix.eliminate_zeros()
        self.test_matrix = test_matrix.eliminate_zeros()
        
        return self.train_matrix, self.test_matrix
    
    def evaluate_precision_at_k(self, recommendations_dict, k=10):
        """Calcule la Precision@k moyenne"""
        precisions = []
        
        for user_id, recommendations in recommendations_dict.items():
            if user_id not in self.preprocessor.user_to_idx:
                continue
            
            user_idx = self.preprocessor.user_to_idx[user_id]
            actual_items = set(self.test_matrix[user_idx].nonzero()[1])
            
            if len(actual_items) == 0:
                continue  # Pas d'items de test pour cet utilisateur
            
            # Extraire les k premiers items recommandés
            recommended_items = set([
                self.preprocessor.song_to_idx[rec['song_id']] 
                for rec in recommendations[:k]
                if rec['song_id'] in self.preprocessor.song_to_idx
            ])
            
            # Calculer precision
            hits = len(recommended_items.intersection(actual_items))
            precision = hits / k
            precisions.append(precision)
        
        return np.mean(precisions) if precisions else 0.0

In [12]:
# Structure modulaire recommandée

class MySpotifyRecommender:
    """Classe principale orchestrant tous les types de recommandations"""
    
    def __init__(self, data_path):
        self.data_loader = DataLoader(data_path)
        self.preprocessor = DataPreprocessor()
        self.popularity_recommender = None
        self.genre_recommender = None
        self.thematic_recommender = None
        self.user_cf = None
        self.item_cf = None
    
    def initialize(self):
        """Initialise tous les composants"""
        # Chargement des données
        triplets_df = self.data_loader.load_triplets()
        tracks_df = self.data_loader.load_tracks()
        lyrics_df = self.data_loader.load_lyrics()
        genre_df = self.data_loader.load_genres()
        
        # Preprocessing
        self.preprocessor.create_mappings(triplets_df)
        user_item_matrix = self.preprocessor.build_user_item_matrix(triplets_df)
        
        # Initialisation des recommandeurs
        self.popularity_recommender = PopularityRecommender(tracks_df, triplets_df)
        self.genre_recommender = GenreRecommender(tracks_df, triplets_df, genre_df)
        self.thematic_recommender = ThematicCollections(lyrics_df, tracks_df, triplets_df)
        self.user_cf = UserBasedCF(user_item_matrix, self.preprocessor)
        self.item_cf = ItemBasedCF(user_item_matrix, self.preprocessor)
    
    def get_top_tracks(self, n=250):
        """Interface pour les top tracks"""
        return self.popularity_recommender.get_top_tracks(n)
    
    def get_top_by_genre(self, genre, n=100):
        """Interface pour les top par genre"""
        return self.genre_recommender.get_top_by_genre(genre, n)
    
    def get_thematic_collection(self, theme, n=50, method='baseline'):
        """Interface pour les collections thématiques"""
        if not self.thematic_recommender:
            logger.error("Recommandeur thématique non initialisé")
            return pd.DataFrame()
            
        themes = {
            'love': ['love', 'heart', 'kiss', 'romance'],
            'war': ['war', 'fight', 'battle', 'soldier'],
            'happiness': ['happy', 'joy', 'smile', 'laugh'],
            'loneliness': ['alone', 'lonely', 'silence', 'empty'],
            'money': ['money', 'rich', 'gold', 'dollar']
        }
        
        if theme not in themes:
            logger.error(f"Thème {theme} non supporté")
            return pd.DataFrame()
            
        return self.thematic_recommender.get_thematic_collection(
            themes[theme], 
            n=n, 
            method=method
        )
    
    def get_similar_users_recommendations(self, user_id, n=10):
        """Interface pour les recommandations basées sur les utilisateurs similaires"""
        if not self.user_cf:
            logger.error("Recommandeur collaboratif utilisateurs non initialisé")
            return pd.DataFrame()
            
        if not self.user_cf.user_similarity_matrix:
            self.user_cf.compute_user_similarity()
            
        return self.user_cf.recommend_for_user(user_id, n_recommendations=n)
    
    def get_similar_tracks_recommendations(self, song_id, n=10):
        """Interface pour les recommandations basées sur les morceaux similaires"""
        if not self.item_cf:
            logger.error("Recommandeur collaboratif items non initialisé")
            return pd.DataFrame()
            
        if not self.item_cf.item_similarity_matrix:
            self.item_cf.compute_item_similarity()
            
        return self.item_cf.recommend_for_item(song_id, n_recommendations=n)
    
    def evaluate_recommendations(self, test_ratio=0.2, k=10):
        """Interface pour l'évaluation des recommandations"""
        if not self.user_cf or not self.item_cf:
            logger.error("Recommandeurs non initialisés")
            return {}
            
        evaluator = RecommenderEvaluator(
            self.preprocessor.user_item_matrix, 
            test_ratio=test_ratio
        )
        
        # Split train/test
        train_matrix, test_matrix = evaluator.create_train_test_split()
        
        # Évaluer les recommandations utilisateurs
        user_recommendations = {}
        for user_id in list(self.preprocessor.user_to_idx.keys())[:100]:  # Échantillon pour test
            recs = self.get_similar_users_recommendations(user_id, k)
            if not recs.empty:
                user_recommendations[user_id] = recs
        
        # Évaluer les recommandations items
        item_recommendations = {}
        for song_id in list(self.preprocessor.song_to_idx.keys())[:100]:  # Échantillon pour test
            recs = self.get_similar_tracks_recommendations(song_id, k)
            if not recs.empty:
                item_recommendations[song_id] = recs
                
        results = {
            'user_precision@k': evaluator.evaluate_precision_at_k(user_recommendations, k),
            'item_precision@k': evaluator.evaluate_precision_at_k(item_recommendations, k)
        }
        
        logger.info(f"Résultats d'évaluation: {results}")
        return results

    def get_all_recommendations(self, user_id):
        """Interface pour obtenir toutes les recommandations pour un utilisateur"""
        results = {
            'top_tracks': self.get_top_tracks(n=10),
            'user_based': self.get_similar_users_recommendations(user_id, n=10),
            'thematic': {
                'love': self.get_thematic_collection('love', n=5),
                'happiness': self.get_thematic_collection('happiness', n=5)
            }
        }
        
        # Ajouter les recommandations par genre
        genres = ['Rock', 'Rap', 'Jazz', 'Electronic', 'Pop', 
                 'Blues', 'Country', 'Reggae', 'New Age']
        results['by_genre'] = {
            genre: self.get_top_by_genre(genre, n=5) 
            for genre in genres
        }
        
        return results
    # ... autres méthodes d'interface

In [13]:
class RecommendationException(Exception):
    """Exception personnalisée pour les erreurs de recommandation"""
    pass

def safe_recommend(func):
    """Décorateur pour gérer les erreurs de recommandation"""
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except KeyError as e:
            logger.warning(f"Clé non trouvée: {e}")
            return pd.DataFrame()  # Retour par défaut
        except Exception as e:
            logger.error(f"Erreur dans {func.__name__}: {e}")
            raise RecommendationException(f"Erreur de recommandation: {e}")
    return wrapper

In [14]:
# Exemple d'optimisation mémoire pour gros datasets
def compute_similarity_chunked(matrix, chunk_size=1000):
    """Calcule la similarité par chunks pour éviter OOM"""
    n_items = matrix.shape[0]
    similarity_matrix = np.zeros((n_items, n_items))
    
    for i in range(0, n_items, chunk_size):
        end_i = min(i + chunk_size, n_items)
        chunk_i = matrix[i:end_i]
        
        for j in range(0, n_items, chunk_size):
            end_j = min(j + chunk_size, n_items)
            chunk_j = matrix[j:end_j]
            
            similarity_matrix[i:end_i, j:end_j] = cosine_similarity(chunk_i, chunk_j)
    
    return similarity_matrix

In [15]:
class DiscoverWeekly:
    def __init__(self, user_cf, content_cf, popularity_cf):
        self.user_cf = user_cf
        self.content_cf = content_cf  
        self.popularity_cf = popularity_cf
    
    def generate_playlist(self, user_id, n_tracks=30):
        """Génère une playlist découverte personnalisée"""
        # 40% collaborative filtering
        cf_recs = self.user_cf.recommend_for_user(user_id, n_tracks//2)
        
        # 30% content-based sur artistes similaires
        user_artists = self._get_user_top_artists(user_id)
        content_recs = self.content_cf.recommend_similar_artists(user_artists)
        
        # 30% découverte (popularité récente + diversité)
        discovery_recs = self.popularity_cf.get_trending_tracks(
            exclude_user_history=user_id
        )
        
        # Mélange pondéré avec diversification
        return self._blend_and_diversify([cf_recs, content_recs, discovery_recs])

In [17]:
class ArtistRadio:
    def __init__(self, content_analyzer, collaborative_filter):
        self.content_analyzer = content_analyzer
        self.collaborative_filter = collaborative_filter
    
    def create_artist_radio(self, seed_artist, n_tracks=50):
        """Crée une radio basée sur un artiste seed"""
        # Analyser le profil musical de l'artiste
        artist_profile = self.content_analyzer.get_artist_profile(seed_artist)
        
        # Expansion par genres/sous-genres
        similar_genre_artists = self._find_genre_expansion(artist_profile)
        
        # Expansion collaborative (artistes co-écoutés)
        collab_artists = self.collaborative_filter.find_colistened_artists(seed_artist)
        
        # Diversification temporelle et énergétique
        return self._create_radio_flow(seed_artist, similar_genre_artists, collab_artists)

In [18]:
class PersonalizedPlaylists:
    def __init__(self):
        self.user_profiler = UserMoodProfiler()
        self.temporal_analyzer = TemporalAnalyzer()
    
    def generate_mood_playlist(self, user_id, time_of_day, context="workout"):
        """Génère une playlist adaptée au contexte"""
        # Analyser l'historique temporel de l'utilisateur
        temporal_prefs = self.temporal_analyzer.get_time_preferences(user_id, time_of_day)
        
        # Profil émotionnel basé sur les paroles
        mood_profile = self.user_profiler.extract_mood_from_lyrics(user_id)
        
        # Adaptation contextuelle
        context_tracks = self._get_context_appropriate_tracks(
            context, temporal_prefs, mood_profile
        )
        
        return self._optimize_playlist_flow(context_tracks)

In [19]:
class AdvancedMetrics:
    @staticmethod
    def calculate_diversity(recommendations):
        """Mesure la diversité intra-liste des recommandations"""
        # Diversité de genres, d'artistes, d'années, etc.
        pass
    
    @staticmethod
    def calculate_novelty(recommendations, user_history):
        """Mesure la nouveauté par rapport à l'historique utilisateur"""
        pass
    
    @staticmethod
    def calculate_serendipity(recommendations, expected_items):
        """Mesure l'effet de surprise positif"""
        pass

In [1]:
# data_check.py - Script pour vérifier la qualité des données
import pandas as pd
from pathlib import Path
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def check_data_files():
    """Vérifie que tous les fichiers de données sont présents et lisibles"""
    
    data_files = {
        'train_triplets.txt': 'Triplets (user_id, song_id, play_count)',
        'p02_unique_tracks.txt': 'Informations des tracks',
        'mxm_dataset_train.txt': 'Données de paroles',
        'p02_msd_tagtraum_cd2.cls': 'Genres musicaux'
    }
    
    print("=== VÉRIFICATION DES FICHIERS ===")
    for filename, description in data_files.items():
        filepath = Path(filename)
        if filepath.exists():
            size_mb = filepath.stat().st_size / (1024 * 1024)
            print(f"✅ {filename} ({description}) - {size_mb:.1f} MB")
        else:
            print(f"❌ {filename} - MANQUANT")
    print()

def quick_data_exploration():
    """Exploration rapide des données"""
    
    print("=== EXPLORATION RAPIDE ===")
    
    # Triplets
    try:
        triplets = pd.read_csv('train_triplets.txt', sep='\t', 
                              names=['user_id', 'song_id', 'play_count'], 
                              nrows=1000)  # Lire seulement 1000 lignes pour test
        print(f"Triplets - Exemple:")
        print(triplets.head(3))
        print(f"Shape: {triplets.shape}")
        print(f"Users uniques: {triplets['user_id'].nunique()}")
        print(f"Songs uniques: {triplets['song_id'].nunique()}")
        print()
        
    except Exception as e:
        print(f"❌ Erreur triplets: {e}")
    
    # Tracks
    try:
        tracks = pd.read_csv('p02_unique_tracks.txt', sep='<SEP>', 
                            names=['track_id', 'song_id', 'artist', 'title'],
                            engine='python', nrows=100)
        print(f"Tracks - Exemple:")
        print(tracks.head(3))
        print(f"Shape: {tracks.shape}")
        print()
        
    except Exception as e:
        print(f"❌ Erreur tracks: {e}")
    
    # Genres
    try:
        with open('p02_msd_tagtraum_cd2.cls', 'r') as f:
            lines = [line.strip() for line in f.readlines()[:10] if line.strip() and not line.startswith('#')]
        
        print("Genres - Exemple:")
        for line in lines[:5]:
            print(line)
        print()
        
    except Exception as e:
        print(f"❌ Erreur genres: {e}")
    
    # Paroles
    try:
        with open('mxm_dataset_train.txt', 'r') as f:
            lines = [line.strip() for line in f.readlines()[:20]]
        
        print("Paroles - Exemple:")
        for line in lines[:5]:
            if not line.startswith('#'):
                print(line[:100] + "..." if len(line) > 100 else line)
        print()
        
    except Exception as e:
        print(f"❌ Erreur paroles: {e}")

def estimate_memory_usage():
    """Estime l'usage mémoire"""
    
    print("=== ESTIMATION MÉMOIRE ===")
    
    try:
        # Compter les lignes des triplets
        with open('train_triplets.txt', 'r') as f:
            line_count = sum(1 for line in f)
        
        # Estimation: ~50-100 bytes par ligne en mémoire
        estimated_mb = (line_count * 80) / (1024 * 1024)
        print(f"Triplets: ~{line_count:,} lignes")
        print(f"Mémoire estimée: ~{estimated_mb:.1f} MB")
        
        if estimated_mb > 4000:  # Plus de 4GB
            print("⚠️  ATTENTION: Dataset très volumineux!")
            print("   Considérez utiliser des chunks pour le chargement")
        elif estimated_mb > 1000:  # Plus de 1GB
            print("⚠️  Dataset moyen-volumineux, monitoring mémoire recommandé")
        else:
            print("✅ Taille de dataset gérable")
            
    except Exception as e:
        print(f"❌ Erreur estimation: {e}")

if __name__ == "__main__":
    check_data_files()
    quick_data_exploration()
    estimate_memory_usage()

=== VÉRIFICATION DES FICHIERS ===
✅ train_triplets.txt (Triplets (user_id, song_id, play_count)) - 2862.6 MB
✅ p02_unique_tracks.txt (Informations des tracks) - 80.2 MB
✅ mxm_dataset_train.txt (Données de paroles) - 98.0 MB
✅ p02_msd_tagtraum_cd2.cls (Genres musicaux) - 7.1 MB

=== EXPLORATION RAPIDE ===
Triplets - Exemple:
                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
Shape: (1000, 3)
Users uniques: 13
Songs uniques: 951

Tracks - Exemple:
             track_id             song_id            artist              title
0  TRMMMYQ128F932D901  SOQMMHC12AB0180CB8  Faster Pussy cat       Silent Night
1  TRMMMKD128F425225D  SOVFVAK12A8C1350D9  Karkkiautomaatti        Tanssi vaan
2  TRMMMRX128F93187D9  SOGTUKN12AB017F4F1    Hudson Mohawke 

In [3]:
# main.py - Script principal pour tester MySpotify
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import logging
from pathlib import Path

# Configuration du logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DataLoader:
    """Classe pour charger tous les datasets"""
    
    def __init__(self, data_path):
        self.data_path = Path(data_path)
    
    def load_triplets(self, filename="train_triplets.txt"):
        """Charge les triplets (user_id, song_id, play_count)"""
        logger.info(f"Chargement des triplets depuis {filename}...")
        filepath = self.data_path / filename
        
        df = pd.read_csv(filepath, sep='\t', 
                        names=['user_id', 'song_id', 'play_count'])
        logger.info(f"Triplets chargés: {len(df)} interactions")
        return df
    
    def load_tracks(self, filename="p02_unique_tracks.txt"):
        """Charge les informations des tracks"""
        logger.info(f"Chargement des tracks depuis {filename}...")
        filepath = self.data_path / filename
        
        df = pd.read_csv(filepath, sep='<SEP>', 
                        names=['track_id', 'song_id', 'artist', 'title'],
                        engine='python')
        logger.info(f"Tracks chargés: {len(df)} pistes")
        return df
    
    def load_lyrics(self, filename="mxm_dataset_train.txt"):
        """Charge les données de paroles"""
        logger.info(f"Chargement des paroles depuis {filename}...")
        filepath = self.data_path / filename
        
        lyrics_data = []
        word_mapping = {}
        
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                
                # Skip comments
                if line.startswith('#'):
                    continue
                
                # Get word mapping
                if line.startswith('%'):
                    words = line[1:].split(',')
                    word_mapping = {i+1: word for i, word in enumerate(words)}
                    continue
                
                # Parse lyrics data
                if line:
                    parts = line.split(',')
                    track_id = parts[0]
                    mxm_track_id = parts[1]
                    
                    word_counts = {}
                    for part in parts[2:]:
                        if ':' in part:
                            word_idx, count = part.split(':')
                            word_counts[int(word_idx)] = int(count)
                    
                    lyrics_data.append({
                        'track_id': track_id,
                        'mxm_track_id': mxm_track_id,
                        'word_counts': word_counts
                    })
        
        df = pd.DataFrame(lyrics_data)
        logger.info(f"Paroles chargées: {len(df)} pistes avec paroles")
        return df, word_mapping
    
    def load_genres(self, filename="p02_msd_tagtraum_cd2.cls"):
        """Charge les genres"""
        logger.info(f"Chargement des genres depuis {filename}...")
        filepath = self.data_path / filename
        
        genres_data = []
        with open(filepath, 'r') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#'):
                    parts = line.split('\t')
                    track_id = parts[0]
                    majority_genre = parts[1]
                    minority_genre = parts[2] if len(parts) > 2 else None
                    
                    genres_data.append({
                        'track_id': track_id,
                        'majority_genre': majority_genre,
                        'minority_genre': minority_genre
                    })
        
        df = pd.DataFrame(genres_data)
        logger.info(f"Genres chargés: {len(df)} pistes avec genres")
        return df

class DataPreprocessor:
    """Classe pour préprocesser les données"""
    
    def __init__(self):
        self.user_to_idx = {}
        self.song_to_idx = {}
        self.idx_to_user = {}
        self.idx_to_song = {}
    
    def create_mappings(self, triplets_df):
        """Crée les mappings bidirectionnels"""
        logger.info("Création des mappings user/song...")
        
        unique_users = triplets_df['user_id'].unique()
        unique_songs = triplets_df['song_id'].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        self.song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        self.idx_to_user = {idx: user for user, idx in self.user_to_idx.items()}
        self.idx_to_song = {idx: song for song, idx in self.song_to_idx.items()}
        
        logger.info(f"Mappings créés: {len(self.user_to_idx)} users, {len(self.song_to_idx)} songs")
    
    def build_user_item_matrix(self, triplets_df):
        """Construit la matrice user-item sparse"""
        logger.info("Construction de la matrice user-item...")
        
        user_indices = triplets_df['user_id'].map(self.user_to_idx)
        song_indices = triplets_df['song_id'].map(self.song_to_idx)
        play_counts = triplets_df['play_count']
        
        matrix = csr_matrix(
            (play_counts, (user_indices, song_indices)),
            shape=(len(self.user_to_idx), len(self.song_to_idx))
        )
        
        logger.info(f"Matrice construite: {matrix.shape}, sparsité: {1 - matrix.nnz / np.prod(matrix.shape):.4f}")
        return matrix

class PopularityRecommender:
    """Recommandeur basé sur la popularité"""
    
    def __init__(self, tracks_df, triplets_df):
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
    
    def get_top_tracks(self, n=250):
        """Retourne les n tracks les plus populaires"""
        logger.info(f"Génération du top-{n} tracks...")
        
        # Calculer la popularité globale
        popularity = self.triplets_df.groupby('song_id')['play_count'].sum().sort_values(ascending=False)
        top_songs = popularity.head(n)
        
        results = []
        for i, (song_id, play_count) in enumerate(top_songs.items(), 1):
            # Trouver les infos du track
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                results.append({
                    'index': i,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'song_id': song_id,
                    'play_count': int(play_count)
                })
        
        df_result = pd.DataFrame(results)
        logger.info(f"Top-{n} généré avec {len(df_result)} tracks")
        return df_result

class GenreRecommender:
    """Recommandeur basé sur les genres"""
    
    def __init__(self, tracks_df, triplets_df, genre_df):
        self.tracks_df = tracks_df
        self.triplets_df = triplets_df
        self.genre_df = genre_df
    
    def get_top_by_genre(self, genre, n=100):
        """Retourne les n tracks les plus populaires pour un genre"""
        logger.info(f"Génération du top-{n} pour le genre {genre}...")
        
        # Filtrer par genre
        genre_tracks = self.genre_df[
            self.genre_df['majority_genre'] == genre
        ]['track_id'].values
        
        # Mapper track_id vers song_id
        genre_songs = self.tracks_df[
            self.tracks_df['track_id'].isin(genre_tracks)
        ]['song_id'].values
        
        # Calculer popularité dans ce genre
        genre_triplets = self.triplets_df[
            self.triplets_df['song_id'].isin(genre_songs)
        ]
        
        if genre_triplets.empty:
            logger.warning(f"Aucune chanson trouvée pour le genre {genre}")
            return pd.DataFrame()
        
        popularity = genre_triplets.groupby('song_id')['play_count'].sum().sort_values(ascending=False)
        top_songs = popularity.head(n)
        
        results = []
        for i, (song_id, play_count) in enumerate(top_songs.items(), 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                results.append({
                    'index': i,
                    'genre': genre,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'song_id': song_id,
                    'play_count': int(play_count)
                })
        
        df_result = pd.DataFrame(results)
        logger.info(f"Top-{n} {genre} généré avec {len(df_result)} tracks")
        return df_result

def main():
    """Fonction principale pour tester le système"""
    
    # Configuration
    DATA_PATH = "."  # Ajustez selon votre structure de dossiers
    
    try:
        # 1. Chargement des données
        logger.info("=== PHASE 1: CHARGEMENT DES DONNÉES ===")
        data_loader = DataLoader(DATA_PATH)
        
        triplets_df = data_loader.load_triplets()
        tracks_df = data_loader.load_tracks()
        lyrics_df, word_mapping = data_loader.load_lyrics()
        genre_df = data_loader.load_genres()
        
        # 2. Preprocessing
        logger.info("=== PHASE 2: PREPROCESSING ===")
        preprocessor = DataPreprocessor()
        preprocessor.create_mappings(triplets_df)
        user_item_matrix = preprocessor.build_user_item_matrix(triplets_df)
        
        # 3. Test des recommandeurs non-personnalisés
        logger.info("=== PHASE 3: RECOMMANDATIONS NON-PERSONNALISÉES ===")
        
        # Top-250 tracks
        popularity_rec = PopularityRecommender(tracks_df, triplets_df)
        top_250 = popularity_rec.get_top_tracks(250)
        print("Top 10 des tracks les plus populaires:")
        print(top_250.head(10)[['index', 'artist', 'title', 'play_count']])
        print()
        
        # Top-100 par genre
        genre_rec = GenreRecommender(tracks_df, triplets_df, genre_df)
        available_genres = genre_df['majority_genre'].unique()
        print(f"Genres disponibles: {available_genres}")
        
        # Test avec le genre "Rock"
        if 'Rock' in available_genres:
            top_rock = genre_rec.get_top_by_genre('Rock', 10)
            print("Top 10 Rock:")
            print(top_rock[['index', 'artist', 'title', 'play_count']])
        
        # 4. Statistiques générales
        logger.info("=== STATISTIQUES GÉNÉRALES ===")
        print(f"Nombre total d'utilisateurs: {len(preprocessor.user_to_idx)}")
        print(f"Nombre total de chansons: {len(preprocessor.song_to_idx)}")
        print(f"Nombre total d'interactions: {len(triplets_df)}")
        print(f"Sparsité de la matrice: {1 - user_item_matrix.nnz / np.prod(user_item_matrix.shape):.6f}")
        
        # 5. Sauvegarde des résultats
        logger.info("=== SAUVEGARDE ===")
        top_250.to_csv('top_250_tracks.csv', index=False)
        logger.info("Résultats sauvegardés dans top_250_tracks.csv")
        
    except Exception as e:
        logger.error(f"Erreur lors de l'exécution: {e}")
        raise

if __name__ == "__main__":
    main()

2025-08-23 21:02:42,117 - INFO - === PHASE 1: CHARGEMENT DES DONNÉES ===
2025-08-23 21:02:42,120 - INFO - Chargement des triplets depuis train_triplets.txt...
2025-08-23 21:03:01,580 - INFO - Triplets chargés: 48373586 interactions
2025-08-23 21:03:01,588 - INFO - Chargement des tracks depuis p02_unique_tracks.txt...
2025-08-23 21:03:03,315 - INFO - Tracks chargés: 1000000 pistes
2025-08-23 21:03:03,315 - INFO - Chargement des paroles depuis mxm_dataset_train.txt...
2025-08-23 21:03:07,239 - INFO - Paroles chargées: 210519 pistes avec paroles
2025-08-23 21:03:07,256 - INFO - Chargement des genres depuis p02_msd_tagtraum_cd2.cls...
2025-08-23 21:03:07,480 - INFO - Genres chargés: 280831 pistes avec genres
2025-08-23 21:03:07,494 - INFO - === PHASE 2: PREPROCESSING ===
2025-08-23 21:03:07,495 - INFO - Création des mappings user/song...
2025-08-23 21:03:17,164 - INFO - Mappings créés: 1019318 users, 384546 songs
2025-08-23 21:03:17,177 - INFO - Construction de la matrice user-item...
2025

Top 10 des tracks les plus populaires:
   index                                             artist  \
0      1                                      Dwight Yoakam   
1      2                                              Björk   
2      3                                      Kings Of Leon   
3      4                                           Harmonia   
4      5  Barry Tuckwell/Academy of St Martin-in-the-Fie...   
5      6                             Florence + The Machine   
6      7                                        OneRepublic   
7      8                                   Five Iron Frenzy   
8      9                                           Tub Ring   
9     10                                          Sam Cooke   

                                               title  play_count  
0                                     You're The One      726885  
1                                               Undo      648239  
2                                            Revelry      527893  

2025-08-23 21:03:46,035 - INFO - Top-10 Rock généré avec 10 tracks
2025-08-23 21:03:46,637 - INFO - === STATISTIQUES GÉNÉRALES ===
2025-08-23 21:03:46,637 - INFO - === SAUVEGARDE ===
2025-08-23 21:03:46,649 - INFO - Résultats sauvegardés dans top_250_tracks.csv


Top 10 Rock:
   index         artist                               title  play_count
0      1          Björk                                Undo      648239
1      2  Kings Of Leon                             Revelry      527893
2      3       Harmonia                       Sehr kosmisch      425463
3      4    OneRepublic                             Secrets      292642
4      5       Tub Ring                             Invalid      268353
5      6  Kings Of Leon                        Use Somebody      145725
6      7     The Crests                          16 Candles      129069
7      8       Coldplay                              Clocks      114362
8      9       Coldplay                              Yellow      109566
9     10       Paramore  The Only Exception (Album Version)      103653
Nombre total d'utilisateurs: 1019318
Nombre total de chansons: 384546
Nombre total d'interactions: 48373586
Sparsité de la matrice: 0.999877


In [5]:
# test_collaborative_filtering.py - Test du Collaborative Filtering
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import pandas as pd

class UserBasedCF:
    """Collaborative Filtering basé utilisateur simplifié"""
    
    def __init__(self, user_item_matrix, preprocessor):
        self.user_item_matrix = user_item_matrix
        self.preprocessor = preprocessor
        self.user_similarity_matrix = None
    
    def compute_user_similarity(self, n_users=1000):
        """Calcule similarité pour un sous-ensemble d'utilisateurs"""
        print(f"Calcul de similarité pour {n_users} premiers utilisateurs...")
        
        # Utiliser seulement un sous-ensemble pour éviter les problèmes mémoire
        subset_matrix = self.user_item_matrix[:n_users, :]
        
        # Normalisation L2 pour cosine similarity
        from sklearn.preprocessing import normalize
        normalized_matrix = normalize(subset_matrix, norm='l2', axis=1)
        
        # Calcul de similarité
        self.user_similarity_matrix = cosine_similarity(normalized_matrix)
        print(f"Matrice de similarité calculée: {self.user_similarity_matrix.shape}")
        
        return self.user_similarity_matrix
    
    def recommend_for_user(self, user_idx, n_recommendations=10, n_neighbors=20):
        """Recommandations pour un utilisateur donné"""
        
        if self.user_similarity_matrix is None:
            print("Calcul de similarité requis...")
            self.compute_user_similarity()
        
        if user_idx >= self.user_similarity_matrix.shape[0]:
            print(f"User index {user_idx} hors limite")
            return pd.DataFrame()
        
        # Trouver utilisateurs similaires
        user_similarities = self.user_similarity_matrix[user_idx]
        # Exclure l'utilisateur lui-même
        user_similarities[user_idx] = -1
        
        # Top utilisateurs similaires
        similar_users = np.argsort(user_similarities)[::-1][:n_neighbors]
        
        # Items de l'utilisateur cible
        user_items = self.user_item_matrix[user_idx].toarray()[0]
        recommendations = np.zeros(self.user_item_matrix.shape[1])
        
        # Calculer scores de recommandation
        for similar_user_idx in similar_users:
            if user_similarities[similar_user_idx] > 0:  # Similarité positive
                similarity = user_similarities[similar_user_idx]
                similar_user_items = self.user_item_matrix[similar_user_idx].toarray()[0]
                
                # Items non vus par l'utilisateur cible
                unseen_items = (user_items == 0) & (similar_user_items > 0)
                recommendations[unseen_items] += similarity * similar_user_items[unseen_items]
        
        # Sélectionner top recommendations
        if recommendations.sum() == 0:
            print("Aucune recommandation trouvée")
            return pd.DataFrame()
        
        top_items_idx = np.argsort(recommendations)[::-1][:n_recommendations]
        top_scores = recommendations[top_items_idx]
        
        # Convertir en song_ids
        results = []
        for i, (item_idx, score) in enumerate(zip(top_items_idx, top_scores)):
            if score > 0:
                song_id = self.preprocessor.idx_to_song.get(item_idx, 'Unknown')
                results.append({
                    'rank': i + 1,
                    'song_id': song_id,
                    'score': float(score),
                    'item_idx': int(item_idx)
                })
        
        return pd.DataFrame(results)

def test_collaborative_filtering():
    """Test du système de collaborative filtering"""
    
    print("=== TEST COLLABORATIVE FILTERING ===")
    
    # Charger un petit échantillon de données pour test
    print("Chargement des données...")
    try:
        # Lire seulement les premières lignes pour test rapide
        triplets_df = pd.read_csv('train_triplets.txt', sep='\t', 
                                 names=['user_id', 'song_id', 'play_count'],
                                 nrows=10000)  # Limiter pour test
        
        tracks_df = pd.read_csv('p02_unique_tracks.txt', sep='<SEP>', 
                               names=['track_id', 'song_id', 'artist', 'title'],
                               engine='python', nrows=5000)
        
        print(f"Données chargées: {len(triplets_df)} interactions")
        
        # Créer mappings
        unique_users = triplets_df['user_id'].unique()
        unique_songs = triplets_df['song_id'].unique()
        
        user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        idx_to_user = {idx: user for user, idx in user_to_idx.items()}
        idx_to_song = {idx: song for song, idx in song_to_idx.items()}
        
        class SimplePreprocessor:
            def __init__(self):
                self.user_to_idx = user_to_idx
                self.song_to_idx = song_to_idx
                self.idx_to_user = idx_to_user
                self.idx_to_song = idx_to_song
        
        preprocessor = SimplePreprocessor()
        
        # Construire matrice user-item
        user_indices = triplets_df['user_id'].map(user_to_idx)
        song_indices = triplets_df['song_id'].map(song_to_idx)
        play_counts = triplets_df['play_count']
        
        user_item_matrix = csr_matrix(
            (play_counts, (user_indices, song_indices)),
            shape=(len(user_to_idx), len(song_to_idx))
        )
        
        print(f"Matrice construite: {user_item_matrix.shape}")
        print(f"Sparsité: {1 - user_item_matrix.nnz / np.prod(user_item_matrix.shape):.4f}")
        
        # Tester collaborative filtering
        cf_recommender = UserBasedCF(user_item_matrix, preprocessor)
        
        # Test pour le premier utilisateur
        user_idx = 0
        user_id = idx_to_user[user_idx]
        print(f"\nTest pour l'utilisateur: {user_id}")
        
        # Voir ce que l'utilisateur a écouté
        user_songs = user_item_matrix[user_idx].nonzero()[1]
        print(f"L'utilisateur a écouté {len(user_songs)} chansons différentes")
        
        # Générer recommandations
        recommendations = cf_recommender.recommend_for_user(user_idx, n_recommendations=5)
        
        if not recommendations.empty:
            print("\nRecommandations générées:")
            for _, rec in recommendations.iterrows():
                song_id = rec['song_id']
                # Essayer de trouver les infos du track
                track_info = tracks_df[tracks_df['song_id'] == song_id]
                if not track_info.empty:
                    artist = track_info.iloc[0]['artist']
                    title = track_info.iloc[0]['title']
                    print(f"{rec['rank']}. {artist} - {title} (score: {rec['score']:.3f})")
                else:
                    print(f"{rec['rank']}. {song_id} (score: {rec['score']:.3f})")
        else:
            print("Aucune recommandation générée")
        
        print("\n✅ Test collaborative filtering terminé avec succès!")
        
    except Exception as e:
        print(f"❌ Erreur lors du test: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_collaborative_filtering()

=== TEST COLLABORATIVE FILTERING ===
Chargement des données...
Données chargées: 10000 interactions
Matrice construite: (173, 7849)
Sparsité: 0.9926

Test pour l'utilisateur: b80344d063b5ccb3212f76538f3d9e43d87dca9e
L'utilisateur a écouté 104 chansons différentes
Calcul de similarité requis...
Calcul de similarité pour 1000 premiers utilisateurs...
Matrice de similarité calculée: (173, 173)

Recommandations générées:
1. SOAUWYT12A81C206F1 (score: 2.456)
2. SOPUCYA12A8C13A694 (score: 1.214)
3. SOQZYQH12A8AE468E5 (score: 0.903)
4. SOTLEJN12A8C13E8EF (score: 0.815)
5. SOPATZX12A8AE46295 (score: 0.815)

✅ Test collaborative filtering terminé avec succès!


In [8]:
# complete_myspotify.py - Système complet de recommandations
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import logging
from pathlib import Path
import pickle
import os

# Configuration du logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class MySpotifyComplete:
    """Système complet de recommandation musicale MySpotify"""
    
    def __init__(self, data_path="."):
        self.data_path = Path(data_path)
        
        # Données
        self.triplets_df = None
        self.tracks_df = None
        self.lyrics_df = None
        self.word_mapping = None
        self.genre_df = None
        
        # Preprocessor
        self.user_to_idx = {}
        self.song_to_idx = {}
        self.idx_to_user = {}
        self.idx_to_song = {}
        self.user_item_matrix = None
        
        # Résultats
        self.results = {}
    
    def load_all_data(self):
        """Charge toutes les données nécessaires"""
        logger.info("=== CHARGEMENT DE TOUTES LES DONNÉES ===")
        
        # Triplets
        logger.info("Chargement des triplets...")
        self.triplets_df = pd.read_csv(
            self.data_path / "train_triplets.txt", 
            sep='\t', names=['user_id', 'song_id', 'play_count']
        )
        logger.info(f"✅ Triplets: {len(self.triplets_df)} interactions")
        
        # Tracks
        logger.info("Chargement des tracks...")
        self.tracks_df = pd.read_csv(
            self.data_path / "p02_unique_tracks.txt", 
            sep='<SEP>', names=['track_id', 'song_id', 'artist', 'title'],
            engine='python'
        )
        logger.info(f"✅ Tracks: {len(self.tracks_df)} pistes")
        
        # Paroles
        logger.info("Chargement des paroles...")
        self.lyrics_df, self.word_mapping = self._load_lyrics()
        logger.info(f"✅ Paroles: {len(self.lyrics_df)} pistes avec paroles")
        
        # Genres
        logger.info("Chargement des genres...")
        self.genre_df = self._load_genres()
        logger.info(f"✅ Genres: {len(self.genre_df)} pistes avec genres")
        
        # Preprocessing
        self._create_mappings()
        self._build_user_item_matrix()
    
    def _load_lyrics(self):
        """Charge les données de paroles"""
        lyrics_data = []
        word_mapping = {}
        
        with open(self.data_path / "mxm_dataset_train.txt", 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                
                if line.startswith('#'):
                    continue
                
                if line.startswith('%'):
                    words = line[1:].split(',')
                    word_mapping = {i+1: word for i, word in enumerate(words)}
                    continue
                
                if line:
                    parts = line.split(',')
                    if len(parts) >= 2:
                        track_id = parts[0]
                        mxm_track_id = parts[1]
                        
                        word_counts = {}
                        for part in parts[2:]:
                            if ':' in part:
                                try:
                                    word_idx, count = part.split(':')
                                    word_counts[int(word_idx)] = int(count)
                                except ValueError:
                                    continue
                        
                        lyrics_data.append({
                            'track_id': track_id,
                            'mxm_track_id': mxm_track_id,
                            'word_counts': word_counts
                        })
        
        return pd.DataFrame(lyrics_data), word_mapping
    
    def _load_genres(self):
        """Charge les données de genres"""
        genres_data = []
        with open(self.data_path / "p02_msd_tagtraum_cd2.cls", 'r') as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#'):
                    parts = line.split('\t')
                    if len(parts) >= 2:
                        track_id = parts[0]
                        majority_genre = parts[1]
                        minority_genre = parts[2] if len(parts) > 2 else None
                        
                        genres_data.append({
                            'track_id': track_id,
                            'majority_genre': majority_genre,
                            'minority_genre': minority_genre
                        })
        
        return pd.DataFrame(genres_data)
    
    def _create_mappings(self):
        """Crée les mappings bidirectionnels"""
        logger.info("Création des mappings...")
        
        unique_users = self.triplets_df['user_id'].unique()
        unique_songs = self.triplets_df['song_id'].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        self.song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        self.idx_to_user = {idx: user for user, idx in self.user_to_idx.items()}
        self.idx_to_song = {idx: song for song, idx in self.song_to_idx.items()}
        
        logger.info(f"✅ Mappings: {len(self.user_to_idx)} users, {len(self.song_to_idx)} songs")
    
    def _build_user_item_matrix(self):
        """Construit la matrice user-item"""
        logger.info("Construction de la matrice user-item...")
        
        user_indices = self.triplets_df['user_id'].map(self.user_to_idx)
        song_indices = self.triplets_df['song_id'].map(self.song_to_idx)
        play_counts = self.triplets_df['play_count']
        
        self.user_item_matrix = csr_matrix(
            (play_counts, (user_indices, song_indices)),
            shape=(len(self.user_to_idx), len(self.song_to_idx))
        )
        
        sparsity = 1 - self.user_item_matrix.nnz / np.prod(self.user_item_matrix.shape)
        logger.info(f"✅ Matrice: {self.user_item_matrix.shape}, sparsité: {sparsity:.6f}")
    
    def generate_top_250_tracks(self):
        """1. Top-250 tracks (Non-personnalisé)"""
        logger.info("=== 1. TOP-250 TRACKS ===")
        
        popularity = self.triplets_df.groupby('song_id')['play_count'].sum().sort_values(ascending=False)
        top_songs = popularity.head(250)
        
        results = []
        for i, (song_id, play_count) in enumerate(top_songs.items(), 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                results.append({
                    'rank': i,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'song_id': song_id,
                    'play_count': int(play_count)
                })
        
        df_result = pd.DataFrame(results)
        df_result.to_csv('results/top_250_tracks.csv', index=False)
        self.results['top_250'] = df_result
        logger.info(f"✅ Top-250 généré: {len(df_result)} tracks")
        return df_result
    
    def generate_top_by_genre(self):
        """2. Top-100 par genre (Non-personnalisé)"""
        logger.info("=== 2. TOP-100 PAR GENRE ===")
        
        available_genres = self.genre_df['majority_genre'].unique()
        target_genres = ['Rock', 'Rap', 'Jazz', 'Electronic', 'Pop', 'Blues', 'Country', 'Reggae']
        
        all_genre_results = {}
        
        for genre in target_genres:
            if genre in available_genres:
                logger.info(f"Génération top-100 {genre}...")
                
                # Filtrer par genre
                genre_tracks = self.genre_df[
                    self.genre_df['majority_genre'] == genre
                ]['track_id'].values
                
                genre_songs = self.tracks_df[
                    self.tracks_df['track_id'].isin(genre_tracks)
                ]['song_id'].values
                
                genre_triplets = self.triplets_df[
                    self.triplets_df['song_id'].isin(genre_songs)
                ]
                
                if not genre_triplets.empty:
                    popularity = genre_triplets.groupby('song_id')['play_count'].sum().sort_values(ascending=False)
                    top_songs = popularity.head(100)
                    
                    results = []
                    for i, (song_id, play_count) in enumerate(top_songs.items(), 1):
                        track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                        if not track_info.empty:
                            track_info = track_info.iloc[0]
                            results.append({
                                'rank': i,
                                'genre': genre,
                                'artist': track_info['artist'],
                                'title': track_info['title'],
                                'song_id': song_id,
                                'play_count': int(play_count)
                            })
                    
                    df_result = pd.DataFrame(results)
                    df_result.to_csv(f'results/top_100_{genre.lower()}.csv', index=False)
                    all_genre_results[genre] = df_result
                    logger.info(f"✅ {genre}: {len(df_result)} tracks")
        
        self.results['genres'] = all_genre_results
        return all_genre_results
    
    def generate_thematic_collections(self):
        """Collections thématiques - Version optimisée pour mémoire limitée"""
        logger.info("=== COLLECTIONS THÉMATIQUES ===")
        
        # 1. Configuration des thèmes (garder minimal)
        themes = {
            'love': ['love', 'heart'],  # Réduire le nombre de mots-clés
            'war': ['war', 'fight'],
            'happiness': ['happy', 'joy']
        }
        
        # 2. Lecture du fichier par chunks pour économiser la mémoire
        chunk_size = 10000  # Ajuster selon la RAM disponible
        collections = {}
        
        for theme, keywords in themes.items():
            logger.info(f"Génération collection '{theme}'...")
            theme_scores = []
            
            # Créer index inversé pour les mots-clés
            keyword_indices = set()
            for keyword in keywords:
                for idx, word in self.word_mapping.items():
                    if keyword in word.lower():
                        keyword_indices.add(idx)
            
            # Traiter les paroles par chunks
            for chunk_start in range(0, len(self.lyrics_df), chunk_size):
                chunk_end = min(chunk_start + chunk_size, len(self.lyrics_df))
                chunk = self.lyrics_df.iloc[chunk_start:chunk_end]
                
                for _, row in chunk.iterrows():
                    word_counts = row['word_counts']
                    
                    # Calcul score simplifié
                    theme_score = sum(word_counts.get(idx, 0) for idx in keyword_indices)
                    
                    if theme_score > 0:
                        theme_scores.append({
                            'track_id': row['track_id'],
                            'score': theme_score
                        })
                
                # Libérer la mémoire
                del chunk
            
            # Prendre uniquement les 50 meilleurs scores
            theme_scores.sort(key=lambda x: x['score'], reverse=True)
            top_50 = theme_scores[:50]
            
            # Formatage des résultats
            results = []
            for rank, item in enumerate(top_50, 1):
                track_info = self.tracks_df[self.tracks_df['track_id'] == item['track_id']]
                if not track_info.empty:
                    track_info = track_info.iloc[0]
                    results.append({
                        'rank': rank,
                        'theme': theme,
                        'artist': track_info['artist'],
                        'title': track_info['title'],
                        'score': item['score']
                    })
            
            if results:
                df_result = pd.DataFrame(results)
                df_result.to_csv(f'results/collection_{theme}.csv', index=False)
                collections[theme] = df_result
                logger.info(f"✅ Collection '{theme}': {len(df_result)} tracks")
                
            # Libérer la mémoire
            del theme_scores
        
        return collections
    
    def generate_user_based_recommendations(self, n_users=100):
        """4. People similar to you listen (User-based CF)"""
        logger.info("=== 4. USER-BASED COLLABORATIVE FILTERING ===")
        
        # Utiliser un sous-ensemble d'utilisateurs pour éviter les problèmes mémoire
        logger.info(f"Calcul pour les {n_users} premiers utilisateurs...")
        
        subset_matrix = self.user_item_matrix[:n_users, :]
        normalized_matrix = normalize(subset_matrix, norm='l2', axis=1)
        
        # Calculer similarité
        user_similarity = cosine_similarity(normalized_matrix)
        
        user_recommendations = []
        
        for user_idx in range(min(20, n_users)):  # Test sur 20 premiers utilisateurs
            user_id = self.idx_to_user[user_idx]
            
            # Trouver utilisateurs similaires
            similarities = user_similarity[user_idx]
            similarities[user_idx] = -1  # Exclure l'utilisateur lui-même
            
            similar_users = np.argsort(similarities)[::-1][:20]  # Top 20 similaires
            
            # Générer recommandations
            user_items = subset_matrix[user_idx].toarray()[0]
            recommendations = np.zeros(subset_matrix.shape[1])
            
            for similar_user_idx in similar_users:
                if similarities[similar_user_idx] > 0:
                    sim_score = similarities[similar_user_idx]
                    similar_items = subset_matrix[similar_user_idx].toarray()[0]
                    
                    unseen_items = (user_items == 0) & (similar_items > 0)
                    recommendations[unseen_items] += sim_score * similar_items[unseen_items]
            
            # Sélectionner top 10
            if recommendations.sum() > 0:
                top_items = np.argsort(recommendations)[::-1][:10]
                
                for rank, item_idx in enumerate(top_items, 1):
                    if recommendations[item_idx] > 0:
                        song_id = self.idx_to_song.get(item_idx, 'Unknown')
                        track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                        
                        if not track_info.empty:
                            track_info = track_info.iloc[0]
                            user_recommendations.append({
                                'user_id': user_id,
                                'rank': rank,
                                'artist': track_info['artist'],
                                'title': track_info['title'],
                                'song_id': song_id,
                                'score': float(recommendations[item_idx])
                            })
        
        if user_recommendations:
            df_result = pd.DataFrame(user_recommendations)
            df_result.to_csv('results/user_based_recommendations.csv', index=False)
            self.results['user_cf'] = df_result
            logger.info(f"✅ Recommandations user-based: {len(df_result)} recs pour {df_result['user_id'].nunique()} users")
        
        return user_recommendations
    
    def generate_item_based_recommendations(self, n_items=100):
        """5. People who listen to this track usually listen (Item-based CF)"""
        logger.info("=== 5. ITEM-BASED COLLABORATIVE FILTERING ===")
        
        # Utiliser un sous-ensemble d'items
        subset_matrix = self.user_item_matrix[:, :n_items]
        
        # Transposer pour avoir items x users
        item_user_matrix = subset_matrix.T
        normalized_matrix = normalize(item_user_matrix, norm='l2', axis=1)
        
        # Calculer similarité entre items
        item_similarity = cosine_similarity(normalized_matrix)
        
        item_recommendations = []
        
        # Test sur quelques items populaires
        popular_items = np.array(subset_matrix.sum(axis=0)).flatten()
        top_items_idx = np.argsort(popular_items)[::-1][:20]  # 20 items les plus populaires
        
        for item_idx in top_items_idx:
            if item_idx < item_similarity.shape[0]:
                song_id = self.idx_to_song.get(item_idx, 'Unknown')
                
                # Trouver items similaires
                similarities = item_similarity[item_idx]
                similarities[item_idx] = -1  # Exclure l'item lui-même
                
                similar_items = np.argsort(similarities)[::-1][:10]
                
                for rank, similar_idx in enumerate(similar_items, 1):
                    if similarities[similar_idx] > 0.1:  # Seuil de similarité
                        similar_song_id = self.idx_to_song.get(similar_idx, 'Unknown')
                        track_info = self.tracks_df[self.tracks_df['song_id'] == similar_song_id]
                        
                        if not track_info.empty:
                            track_info = track_info.iloc[0]
                            item_recommendations.append({
                                'seed_song_id': song_id,
                                'rank': rank,
                                'artist': track_info['artist'],
                                'title': track_info['title'],
                                'recommended_song_id': similar_song_id,
                                'similarity': float(similarities[similar_idx])
                            })
        
        if item_recommendations:
            df_result = pd.DataFrame(item_recommendations)
            df_result.to_csv('results/item_based_recommendations.csv', index=False)
            self.results['item_cf'] = df_result
            logger.info(f"✅ Recommandations item-based: {len(df_result)} recs")
        
        return item_recommendations
    
    def run_complete_system(self):
        """Exécute le système complet de recommandations"""
        
        # Créer dossier results
        os.makedirs('results', exist_ok=True)
        
        # Charger toutes les données
        self.load_all_data()
        
        # Générer toutes les recommandations
        logger.info("\n🎵 GÉNÉRATION DE TOUTES LES RECOMMANDATIONS 🎵")
        
        self.generate_top_250_tracks()
        self.generate_top_by_genre()
        self.generate_thematic_collections()
        self.generate_user_based_recommendations()
        self.generate_item_based_recommendations()
        
        # Résumé final
        logger.info("=== RÉSUMÉ FINAL ===")
        logger.info("✅ Fichiers générés dans le dossier 'results/':")
        
        for file in Path('results').glob('*.csv'):
            size_kb = file.stat().st_size / 1024
            logger.info(f"   📄 {file.name} ({size_kb:.1f} KB)")
        
        logger.info("\n🎉 SYSTÈME MYSPOTIFY COMPLET EXÉCUTÉ AVEC SUCCÈS! 🎉")
        
        return self.results

def main():
    """Fonction principale"""
    try:
        myspotify = MySpotifyComplete()
        results = myspotify.run_complete_system()
        
        print("\n" + "="*50)
        print("🎵 MYSPOTIFY - SYSTÈME DE RECOMMANDATION MUSICAL 🎵")
        print("="*50)
        print(f"✅ Top-250 tracks générés")
        print(f"✅ {len(results.get('genres', {}))} genres traités")
        print(f"✅ {len(results.get('collections', {}))} collections thématiques")
        print(f"✅ Collaborative filtering utilisateur et item")
        print("\n📁 Tous les résultats sont dans le dossier 'results/'")
        
    except Exception as e:
        logger.error(f"❌ Erreur lors de l'exécution: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

2025-08-23 21:56:54,439 - INFO - === CHARGEMENT DE TOUTES LES DONNÉES ===
2025-08-23 21:56:54,440 - INFO - Chargement des triplets...
2025-08-23 21:57:12,455 - INFO - ✅ Triplets: 48373586 interactions
2025-08-23 21:57:12,463 - INFO - Chargement des tracks...
2025-08-23 21:57:23,155 - INFO - ✅ Tracks: 1000000 pistes
2025-08-23 21:57:23,156 - INFO - Chargement des paroles...
2025-08-23 21:57:27,033 - INFO - ✅ Paroles: 210519 pistes avec paroles
2025-08-23 21:57:27,034 - INFO - Chargement des genres...
2025-08-23 21:57:27,236 - INFO - ✅ Genres: 280831 pistes avec genres
2025-08-23 21:57:27,237 - INFO - Création des mappings...
2025-08-23 21:57:33,008 - INFO - ✅ Mappings: 1019318 users, 384546 songs
2025-08-23 21:57:33,016 - INFO - Construction de la matrice user-item...
2025-08-23 21:57:39,903 - INFO - ✅ Matrice: (1019318, 384546), sparsité: 0.999877
2025-08-23 21:57:39,915 - INFO - 
🎵 GÉNÉRATION DE TOUTES LES RECOMMANDATIONS 🎵
2025-08-23 21:57:39,915 - INFO - === 1. TOP-250 TRACKS ===
20


🎵 MYSPOTIFY - SYSTÈME DE RECOMMANDATION MUSICAL 🎵
✅ Top-250 tracks générés
✅ 8 genres traités
✅ 0 collections thématiques
✅ Collaborative filtering utilisateur et item

📁 Tous les résultats sont dans le dossier 'results/'


In [15]:
# simple_analyzer.py - Version simplifiée de l'analyseur MySpotify
import pandas as pd
import os
from pathlib import Path

def analyze_myspotify_results():
    """Analyse simple des résultats MySpotify"""
    
    results_path = Path("results")
    
    if not results_path.exists():
        print("❌ Dossier 'results' non trouvé.")
        print("   Exécutez d'abord: python complete_myspotify.py")
        return
    
    print("="*60)
    print("🎵 ANALYSE DES RÉSULTATS MYSPOTIFY 🎵")
    print("="*60)
    
    # Lister tous les fichiers CSV
    csv_files = list(results_path.glob("*.csv"))
    
    if not csv_files:
        print("❌ Aucun fichier CSV trouvé dans le dossier results/")
        return
    
    print(f"📁 Fichiers trouvés: {len(csv_files)}")
    print()
    
    total_recommendations = 0
    components_found = []
    
    # Analyser chaque fichier
    for file in sorted(csv_files):
        try:
            df = pd.read_csv(file)
            file_size_kb = file.stat().st_size / 1024
            
            print(f"📄 {file.name}")
            print(f"   • Taille: {file_size_kb:.1f} KB")
            print(f"   • Lignes: {len(df)}")
            print(f"   • Colonnes: {list(df.columns)}")
            
            # Échantillon des données
            if len(df) > 0:
                if 'artist' in df.columns and 'title' in df.columns:
                    print(f"   • Exemple: {df.iloc[0]['artist']} - {df.iloc[0]['title']}")
                total_recommendations += len(df)
            
            # Identifier le type de composant
            if file.name == "top_250_tracks.csv":
                components_found.append("✅ Top-250 tracks")
            elif file.name.startswith("top_100_"):
                genre = file.name.replace("top_100_", "").replace(".csv", "").title()
                components_found.append(f"✅ Top-100 {genre}")
            elif file.name.startswith("collection_"):
                theme = file.name.replace("collection_", "").replace(".csv", "").title()
                components_found.append(f"✅ Collection {theme}")
            elif file.name == "user_based_recommendations.csv":
                components_found.append("✅ User-based Collaborative Filtering")
            elif file.name == "item_based_recommendations.csv":
                components_found.append("✅ Item-based Collaborative Filtering")
            
            print()
            
        except Exception as e:
            print(f"❌ Erreur lecture {file.name}: {e}")
            print()
    
    # Résumé final
    print("="*60)
    print("📊 RÉSUMÉ FINAL")
    print("="*60)
    print(f"🎯 Total des recommandations: {total_recommendations:,}")
    print(f"📋 Composants implémentés: {len(components_found)}")
    print()
    
    print("🎵 Composants trouvés:")
    for component in components_found:
        print(f"   {component}")
    
    # Vérification des exigences projet
    print()
    print("📋 VÉRIFICATION EXIGENCES PROJET:")
    
    required_files = {
        "top_250_tracks.csv": "Top-250 tracks",
        "user_based_recommendations.csv": "User-based CF", 
        "item_based_recommendations.csv": "Item-based CF"
    }
    
    genre_files = [f for f in csv_files if f.name.startswith("top_100_")]
    collection_files = [f for f in csv_files if f.name.startswith("collection_")]
    
    # Vérifications
    all_good = True
    
    for req_file, description in required_files.items():
        if any(f.name == req_file for f in csv_files):
            print(f"   ✅ {description}")
        else:
            print(f"   ❌ {description} - MANQUANT")
            all_good = False
    
    if len(genre_files) > 0:
        print(f"   ✅ Recommandations par genre ({len(genre_files)} genres)")
    else:
        print(f"   ❌ Recommandations par genre - MANQUANT")
        all_good = False
    
    if len(collection_files) > 0:
        print(f"   ✅ Collections thématiques ({len(collection_files)} thèmes)")
    else:
        print(f"   ❌ Collections thématiques - MANQUANT")
        all_good = False
    
    print()
    if all_good:
        print("🎉 FÉLICITATIONS! Tous les composants requis sont présents!")
        print("✅ Votre projet MySpotify est COMPLET!")
    else:
        print("⚠️  Certains composants manquent.")
        print("   Exécutez: python complete_myspotify.py")
    
    print("="*60)

def show_sample_recommendations():
    """Affiche des échantillons de recommandations"""
    
    print("\n🎵 ÉCHANTILLONS DE RECOMMANDATIONS")
    print("="*50)
    
    results_path = Path("results")
    
    # Top-250
    top_250_file = results_path / "top_250_tracks.csv"
    if top_250_file.exists():
        df = pd.read_csv(top_250_file)
        print("\n🏆 TOP 5 TRACKS LES PLUS POPULAIRES:")
        print("-" * 40)
        for i, row in df.head(5).iterrows():
            print(f"{row['rank']:2d}. {row['artist']} - {row['title']}")
            print(f"    Play count: {row['play_count']:,}")
    
    # Collection amour
    love_file = results_path / "collection_love.csv"
    if love_file.exists():
        df = pd.read_csv(love_file)
        print(f"\n💖 TOP 5 CHANSONS D'AMOUR:")
        print("-" * 40)
        for i, row in df.head(5).iterrows():
            print(f"{row['rank']:2d}. {row['artist']} - {row['title']}")
            if 'theme_score' in row:
                print(f"    Score thématique: {row['theme_score']}")
    
    # Rock
    rock_file = results_path / "top_100_rock.csv"
    if rock_file.exists():
        df = pd.read_csv(rock_file)
        print(f"\n🎸 TOP 5 ROCK:")
        print("-" * 40)
        for i, row in df.head(5).iterrows():
            print(f"{row['rank']:2d}. {row['artist']} - {row['title']}")
            print(f"    Play count: {row['play_count']:,}")

def main():
    """Fonction principale"""
    try:
        analyze_myspotify_results()
        show_sample_recommendations()
        
        print(f"\n📁 Pour voir tous les détails, consultez le dossier 'results/'")
        
    except Exception as e:
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

🎵 ANALYSE DES RÉSULTATS MYSPOTIFY 🎵
📁 Fichiers trouvés: 14

📄 collection_happiness.csv
   • Taille: 2.5 KB
   • Lignes: 50
   • Colonnes: ['rank', 'theme', 'artist', 'title', 'score']
   • Exemple: The Sundays - Joy

📄 collection_love.csv
   • Taille: 2.1 KB
   • Lignes: 50
   • Colonnes: ['rank', 'theme', 'artist', 'title', 'score']
   • Exemple: Jessica Simpson - I Think I'm In Love With You

📄 collection_war.csv
   • Taille: 1.9 KB
   • Lignes: 50
   • Colonnes: ['rank', 'theme', 'artist', 'title', 'score']
   • Exemple: Culture Club - The War Song (2003 Digital Remaster)

📄 item_based_recommendations.csv
   • Taille: 2.8 KB
   • Lignes: 32
   • Colonnes: ['seed_song_id', 'rank', 'artist', 'title', 'recommended_song_id', 'similarity']
   • Exemple: The String Cheese Incident - Bigger Isn't Better

📄 top_100_blues.csv
   • Taille: 6.4 KB
   • Lignes: 100
   • Colonnes: ['rank', 'genre', 'artist', 'title', 'song_id', 'play_count']
   • Exemple: Sonny Boy Williamson - Don't Start Me Ta

In [17]:
# myspotify_bonus.py - Fonctionnalités bonus inspirées des services de streaming
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from collections import Counter, defaultdict
import random
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SpotifyInspiredFeatures:
    """Fonctionnalités bonus inspirées de Spotify, Apple Music, etc."""
    
    def __init__(self, data_path="."):
        self.data_path = Path(data_path)
        self.load_base_data()
    
    def load_base_data(self):
        """Charge les données de base"""
        logger.info("Chargement des données de base...")
        
        # Charger données existantes
        self.triplets_df = pd.read_csv(
            self.data_path / "train_triplets.txt",
            sep='\t', names=['user_id', 'song_id', 'play_count']
        )
        
        self.tracks_df = pd.read_csv(
            self.data_path / "p02_unique_tracks.txt",
            sep='<SEP>', names=['track_id', 'song_id', 'artist', 'title'],
            engine='python'
        )
        
        # Charger genres
        self.genre_df = self._load_genres()
        
        # Créer mappings
        self._create_mappings()
        
        logger.info("✅ Données de base chargées")
    
    def _load_genres(self):
        """Charge les genres"""
        genres_data = []
        try:
            with open(self.data_path / "p02_msd_tagtraum_cd2.cls", 'r') as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        parts = line.split('\t')
                        if len(parts) >= 2:
                            genres_data.append({
                                'track_id': parts[0],
                                'majority_genre': parts[1],
                                'minority_genre': parts[2] if len(parts) > 2 else None
                            })
        except:
            logger.warning("Impossible de charger les genres")
        
        return pd.DataFrame(genres_data)
    
    def _create_mappings(self):
        """Crée les mappings"""
        unique_users = self.triplets_df['user_id'].unique()
        unique_songs = self.triplets_df['song_id'].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
        self.song_to_idx = {song: idx for idx, song in enumerate(unique_songs)}
        self.idx_to_user = {idx: user for user, idx in self.user_to_idx.items()}
        self.idx_to_song = {idx: song for song, idx in self.song_to_idx.items()}
    
    def discover_weekly(self, user_id=None, n_tracks=30):
        """
        BONUS 1: "Discover Weekly" - Playlist de découverte personnalisée
        Inspiré de la fonctionnalité Spotify Discover Weekly
        """
        logger.info("=== BONUS 1: DISCOVER WEEKLY ===")
        
        if user_id is None:
            # Prendre un utilisateur au hasard
            user_id = random.choice(list(self.user_to_idx.keys()))
        
        if user_id not in self.user_to_idx:
            logger.warning(f"Utilisateur {user_id} non trouvé")
            return pd.DataFrame()
        
        logger.info(f"Génération Discover Weekly pour {user_id}...")
        
        # 1. Analyser le profil de l'utilisateur
        user_profile = self._analyze_user_profile(user_id)
        
        # 2. Recommandations hybrides
        recommendations = []
        
        # 40% - Collaborative Filtering (artistes similaires non écoutés)
        cf_recs = self._get_cf_discoveries(user_id, int(n_tracks * 0.4))
        recommendations.extend(cf_recs)
        
        # 30% - Expansion par genre (genres aimés, artistes non écoutés)
        genre_recs = self._get_genre_expansion(user_id, user_profile, int(n_tracks * 0.3))
        recommendations.extend(genre_recs)
        
        # 30% - Découverte pure (tendances récentes, diversité)
        discovery_recs = self._get_pure_discovery(user_id, int(n_tracks * 0.3))
        recommendations.extend(discovery_recs)
        
        # 3. Diversification et déduplication
        final_playlist = self._diversify_playlist(recommendations, n_tracks)
        
        # 4. Formatage final
        playlist = []
        for i, (song_id, score, reason) in enumerate(final_playlist, 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                playlist.append({
                    'rank': i,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'song_id': song_id,
                    'discovery_score': float(score),
                    'reason': reason
                })
        
        df_result = pd.DataFrame(playlist)
        df_result.to_csv(f'results/bonus_discover_weekly_{user_id[:8]}.csv', index=False)
        
        logger.info(f"✅ Discover Weekly généré: {len(df_result)} tracks")
        return df_result
    
    def artist_radio(self, seed_artist, n_tracks=50):
        """
        BONUS 2: "Artist Radio" - Radio basée sur un artiste
        Inspiré des radios Spotify/Apple Music
        """
        logger.info("=== BONUS 2: ARTIST RADIO ===")
        logger.info(f"Création radio pour l'artiste: {seed_artist}")
        
        # 1. Vérifier que l'artiste existe
        artist_songs = self.tracks_df[self.tracks_df['artist'] == seed_artist]
        if artist_songs.empty:
            logger.warning(f"Artiste '{seed_artist}' non trouvé")
            return pd.DataFrame()
        
        # 2. Analyser le profil de l'artiste
        artist_profile = self._analyze_artist_profile(seed_artist)
        
        # 3. Construire la radio par expansion
        radio_tracks = []
        
        # 20% - Tracks de l'artiste original (hits principaux)
        original_tracks = self._get_artist_hits(seed_artist, int(n_tracks * 0.2))
        radio_tracks.extend([(song_id, score, "Original Artist") for song_id, score in original_tracks])
        
        # 40% - Artistes similaires par genre
        genre_similar = self._get_genre_similar_artists(seed_artist, artist_profile, int(n_tracks * 0.4))
        radio_tracks.extend([(song_id, score, "Similar Genre") for song_id, score in genre_similar])
        
        # 30% - Artistes co-écoutés (collaborative)
        collab_similar = self._get_collaborative_similar_artists(seed_artist, int(n_tracks * 0.3))
        radio_tracks.extend([(song_id, score, "Fans Also Like") for song_id, score in collab_similar])
        
        # 10% - Découverte (nouveaux artistes du même genre)
        discovery = self._get_radio_discovery(artist_profile, int(n_tracks * 0.1))
        radio_tracks.extend([(song_id, score, "Discovery") for song_id, score in discovery])
        
        # 4. Créer un flow naturel (variation d'énergie)
        final_radio = self._create_radio_flow(radio_tracks, n_tracks)
        
        # 5. Formatage
        radio = []
        for i, (song_id, score, reason) in enumerate(final_radio, 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track_info = track_info.iloc[0]
                radio.append({
                    'position': i,
                    'artist': track_info['artist'],
                    'title': track_info['title'],
                    'song_id': song_id,
                    'radio_score': float(score),
                    'category': reason
                })
        
        df_result = pd.DataFrame(radio)
        safe_artist = seed_artist.replace('/', '_').replace(' ', '_')[:20]
        df_result.to_csv(f'results/bonus_artist_radio_{safe_artist}.csv', index=False)
        
        logger.info(f"✅ Artist Radio créée: {len(df_result)} tracks")
        return df_result
    
    def made_for_you_playlists(self, user_id=None):
        """
        BONUS 3: "Made for You" - Playlists personnalisées contextuelles
        Inspiré des playlists "Made for You" de Spotify
        """
        logger.info("=== BONUS 3: MADE FOR YOU PLAYLISTS ===")
        
        if user_id is None:
            user_id = random.choice(list(self.user_to_idx.keys()))
        
        if user_id not in self.user_to_idx:
            logger.warning(f"Utilisateur {user_id} non trouvé")
            return {}
        
        logger.info(f"Génération playlists personnalisées pour {user_id}...")
        
        # Analyser le profil utilisateur
        user_profile = self._analyze_user_profile(user_id)
        playlists = {}
        
        # Playlist 1: "Your Time Capsule" - Hits de son époque préférée
        time_capsule = self._create_time_capsule_playlist(user_id, user_profile)
        playlists['time_capsule'] = time_capsule
        
        # Playlist 2: "Chill Mix" - Tracks relaxantes basées sur ses goûts
        chill_mix = self._create_chill_mix_playlist(user_id, user_profile)
        playlists['chill_mix'] = chill_mix
        
        # Playlist 3: "Workout Mix" - Tracks énergiques
        workout_mix = self._create_workout_mix_playlist(user_id, user_profile)
        playlists['workout_mix'] = workout_mix
        
        # Sauvegarder chaque playlist
        for playlist_name, playlist_data in playlists.items():
            if playlist_data:
                df = pd.DataFrame(playlist_data)
                df.to_csv(f'results/bonus_made_for_you_{playlist_name}_{user_id[:8]}.csv', index=False)
                logger.info(f"✅ {playlist_name.replace('_', ' ').title()}: {len(df)} tracks")
        
        return playlists
    
    # === MÉTHODES UTILITAIRES ===
    
    def _analyze_user_profile(self, user_id):
        """Analyse le profil d'un utilisateur"""
        user_songs = self.triplets_df[self.triplets_df['user_id'] == user_id]
        
        # Genres préférés
        user_tracks = user_songs.merge(self.tracks_df, on='song_id')
        if not self.genre_df.empty:
            user_genres = user_tracks.merge(self.genre_df, on='track_id', how='left')
            top_genres = user_genres['majority_genre'].value_counts().head(5).index.tolist()
        else:
            top_genres = ['Rock', 'Pop']  # Défaut
        
        # Artistes préférés
        top_artists = user_tracks.groupby('artist')['play_count'].sum().nlargest(10).index.tolist()
        
        return {
            'top_genres': top_genres,
            'top_artists': top_artists,
            'total_plays': user_songs['play_count'].sum(),
            'unique_songs': len(user_songs),
            'avg_plays_per_song': user_songs['play_count'].mean()
        }
    
    def _analyze_artist_profile(self, artist):
        """Analyse le profil d'un artiste"""
        artist_tracks = self.tracks_df[self.tracks_df['artist'] == artist]
        
        # Genre principal
        if not self.genre_df.empty:
            artist_genres = artist_tracks.merge(self.genre_df, on='track_id', how='left')
            main_genre = artist_genres['majority_genre'].mode()
            main_genre = main_genre.iloc[0] if len(main_genre) > 0 else 'Unknown'
        else:
            main_genre = 'Rock'  # Défaut
        
        # Popularité
        artist_plays = self.triplets_df[
            self.triplets_df['song_id'].isin(artist_tracks['song_id'])
        ]['play_count'].sum()
        
        return {
            'main_genre': main_genre,
            'total_tracks': len(artist_tracks),
            'total_plays': artist_plays
        }
    
    def _get_cf_discoveries(self, user_id, n_tracks):
        """Recommandations collaborative filtering pour découverte"""
        # Simplification: prendre des artistes populaires non écoutés
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        
        # Tracks populaires non écoutées
        all_popularity = self.triplets_df.groupby('song_id')['play_count'].sum()
        unheard_songs = [song for song in all_popularity.index if song not in user_songs]
        
        # Prendre les plus populaires parmi les non-écoutées
        recommendations = []
        for song_id in unheard_songs[:n_tracks*2]:  # Buffer pour diversité
            score = all_popularity[song_id]
            recommendations.append((song_id, score, "Collaborative Discovery"))
        
        return sorted(recommendations, key=lambda x: x[1], reverse=True)[:n_tracks]
    
    def _get_genre_expansion(self, user_id, user_profile, n_tracks):
        """Expansion par genres préférés"""
        recommendations = []
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        
        for genre in user_profile['top_genres'][:3]:  # Top 3 genres
            if not self.genre_df.empty:
                genre_tracks = self.genre_df[self.genre_df['majority_genre'] == genre]['track_id']
                genre_songs = self.tracks_df[self.tracks_df['track_id'].isin(genre_tracks)]['song_id']
                
                # Exclure les déjà écoutées
                new_genre_songs = [s for s in genre_songs if s not in user_songs]
                
                # Prendre quelques tracks populaires de ce genre
                for song_id in new_genre_songs[:n_tracks//3]:
                    popularity = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                    if popularity > 0:
                        recommendations.append((song_id, popularity, f"Genre Expansion ({genre})"))
        
        return recommendations[:n_tracks]
    
    def _get_pure_discovery(self, user_id, n_tracks):
        """Découverte pure - trends et diversité"""
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        user_artists = set(self.triplets_df[
            self.triplets_df['user_id'] == user_id
        ].merge(self.tracks_df, on='song_id')['artist'])
        
        # Nouveaux artistes populaires
        all_artists_popularity = self.triplets_df.merge(
            self.tracks_df, on='song_id'
        ).groupby('artist')['play_count'].sum().sort_values(ascending=False)
        
        recommendations = []
        for artist in all_artists_popularity.index:
            if artist not in user_artists:
                artist_songs = self.tracks_df[self.tracks_df['artist'] == artist]['song_id']
                unheard_songs = [s for s in artist_songs if s not in user_songs]
                
                if unheard_songs:
                    # Prendre la chanson la plus populaire de cet artiste
                    best_song = None
                    best_popularity = 0
                    for song_id in unheard_songs:
                        pop = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                        if pop > best_popularity:
                            best_popularity = pop
                            best_song = song_id
                    
                    if best_song and best_popularity > 0:
                        recommendations.append((best_song, best_popularity, "New Artist Discovery"))
                        
                        if len(recommendations) >= n_tracks:
                            break
        
        return recommendations[:n_tracks]
    
    def _diversify_playlist(self, recommendations, n_tracks):
        """Diversifie une playlist pour éviter la répétition"""
        # Déduplication par song_id
        seen_songs = set()
        seen_artists = set()
        diversified = []
        
        # Trier par score
        recommendations.sort(key=lambda x: x[1], reverse=True)
        
        for song_id, score, reason in recommendations:
            if song_id not in seen_songs:
                track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                if not track_info.empty:
                    artist = track_info.iloc[0]['artist']
                    
                    # Limiter à 2 chansons par artiste
                    artist_count = sum(1 for _, _, _, a in [(s, sc, r, 
                        self.tracks_df[self.tracks_df['song_id'] == s].iloc[0]['artist'] 
                        if not self.tracks_df[self.tracks_df['song_id'] == s].empty else 'Unknown') 
                        for s, sc, r in diversified] if a == artist)
                    
                    if artist_count < 2:
                        diversified.append((song_id, score, reason))
                        seen_songs.add(song_id)
                        
                        if len(diversified) >= n_tracks:
                            break
        
        return diversified
    
    def _get_artist_hits(self, artist, n_tracks):
        """Récupère les hits d'un artiste"""
        artist_songs = self.tracks_df[self.tracks_df['artist'] == artist]['song_id']
        
        hits = []
        for song_id in artist_songs:
            popularity = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
            if popularity > 0:
                hits.append((song_id, popularity))
        
        return sorted(hits, key=lambda x: x[1], reverse=True)[:n_tracks]
    
    def _get_genre_similar_artists(self, seed_artist, artist_profile, n_tracks):
        """Trouve des artistes similaires par genre"""
        main_genre = artist_profile['main_genre']
        
        if self.genre_df.empty:
            return []
        
        # Artistes du même genre
        genre_tracks = self.genre_df[self.genre_df['majority_genre'] == main_genre]['track_id']
        genre_artists = self.tracks_df[
            (self.tracks_df['track_id'].isin(genre_tracks)) &
            (self.tracks_df['artist'] != seed_artist)
        ]['artist'].unique()
        
        similar = []
        for artist in genre_artists[:n_tracks*2]:  # Buffer
            artist_songs = self.tracks_df[self.tracks_df['artist'] == artist]['song_id']
            if len(artist_songs) > 0:
                # Prendre la chanson la plus populaire
                best_song = None
                best_pop = 0
                for song_id in artist_songs:
                    pop = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                    if pop > best_pop:
                        best_pop = pop
                        best_song = song_id
                
                if best_song and best_pop > 0:
                    similar.append((best_song, best_pop))
        
        return sorted(similar, key=lambda x: x[1], reverse=True)[:n_tracks]
    
    def _get_collaborative_similar_artists(self, seed_artist, n_tracks):
        """Trouve des artistes via collaborative filtering"""
        # Simplification: artistes populaires différents
        seed_songs = self.tracks_df[self.tracks_df['artist'] == seed_artist]['song_id']
        
        # Utilisateurs qui écoutent cet artiste
        seed_users = self.triplets_df[
            self.triplets_df['song_id'].isin(seed_songs)
        ]['user_id'].unique()
        
        # Autres artistes écoutés par ces utilisateurs
        other_artists_plays = self.triplets_df[
            (self.triplets_df['user_id'].isin(seed_users)) &
            (~self.triplets_df['song_id'].isin(seed_songs))
        ].merge(self.tracks_df, on='song_id').groupby('artist')['play_count'].sum()
        
        similar = []
        for artist, total_plays in other_artists_plays.nlargest(n_tracks*2).items():
            if artist != seed_artist:
                # Meilleure chanson de cet artiste
                artist_songs = self.tracks_df[self.tracks_df['artist'] == artist]['song_id']
                best_song = None
                best_pop = 0
                for song_id in artist_songs:
                    pop = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                    if pop > best_pop:
                        best_pop = pop
                        best_song = song_id
                
                if best_song:
                    similar.append((best_song, total_plays))
        
        return similar[:n_tracks]
    
    def _get_radio_discovery(self, artist_profile, n_tracks):
        """Découverte pour la radio"""
        main_genre = artist_profile['main_genre']
        
        if self.genre_df.empty:
            return []
        
        # Nouveaux artistes du même genre
        genre_tracks = self.genre_df[self.genre_df['majority_genre'] == main_genre]['track_id']
        genre_songs = self.tracks_df[self.tracks_df['track_id'].isin(genre_tracks)]
        
        discovery = []
        artists_seen = set()
        
        for _, track in genre_songs.sample(min(100, len(genre_songs))).iterrows():
            artist = track['artist']
            song_id = track['song_id']
            
            if artist not in artists_seen:
                popularity = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                if popularity > 0:
                    discovery.append((song_id, popularity))
                    artists_seen.add(artist)
                    
                    if len(discovery) >= n_tracks:
                        break
        
        return discovery
    
    def _create_radio_flow(self, tracks, n_tracks):
        """Crée un flow naturel pour la radio"""
        # Pour simplifier, on mélange et on prend les meilleurs
        random.shuffle(tracks)
        return tracks[:n_tracks]
    
    def _create_time_capsule_playlist(self, user_id, user_profile):
        """Crée une playlist Time Capsule"""
        # Prendre les hits des artistes préférés
        playlist = []
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        
        for artist in user_profile['top_artists'][:5]:
            artist_songs = self.tracks_df[self.tracks_df['artist'] == artist]['song_id']
            unheard = [s for s in artist_songs if s not in user_songs]
            
            for song_id in unheard[:2]:  # 2 chansons par artiste
                track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                if not track_info.empty:
                    track_info = track_info.iloc[0]
                    playlist.append({
                        'rank': len(playlist) + 1,
                        'artist': track_info['artist'],
                        'title': track_info['title'],
                        'song_id': song_id,
                        'playlist_type': 'Time Capsule',
                        'reason': f"More from {artist}"
                    })
        
        return playlist[:20]  # Limite à 20
    
    def _create_chill_mix_playlist(self, user_id, user_profile):
        """Crée un Chill Mix"""
        # Version simplifiée: prendre des tracks populaires des genres calmes
        chill_genres = ['Jazz', 'Blues', 'Pop']  # Genres "chill"
        playlist = []
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        
        for genre in chill_genres:
            if not self.genre_df.empty:
                genre_tracks = self.genre_df[self.genre_df['majority_genre'] == genre]['track_id']
                genre_songs = self.tracks_df[self.tracks_df['track_id'].isin(genre_tracks)]['song_id']
                unheard = [s for s in genre_songs if s not in user_songs]
                
                # Prendre quelques populaires
                for song_id in unheard[:5]:
                    track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                    if not track_info.empty:
                        popularity = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                        if popularity > 0:
                            track_info = track_info.iloc[0]
                            playlist.append({
                                'rank': len(playlist) + 1,
                                'artist': track_info['artist'],
                                'title': track_info['title'],
                                'song_id': song_id,
                                'playlist_type': 'Chill Mix',
                                'reason': f"Chill {genre}"
                            })
                            
                            if len(playlist) >= 15:
                                return playlist
        
        return playlist
    
    def _create_workout_mix_playlist(self, user_id, user_profile):
        """Crée un Workout Mix"""
        # Version simplifiée: genres énergiques
        energy_genres = ['Rock', 'Rap', 'Electronic']
        playlist = []
        user_songs = set(self.triplets_df[self.triplets_df['user_id'] == user_id]['song_id'])
        
        for genre in energy_genres:
            if not self.genre_df.empty:
                genre_tracks = self.genre_df[self.genre_df['majority_genre'] == genre]['track_id']
                genre_songs = self.tracks_df[self.tracks_df['track_id'].isin(genre_tracks)]['song_id']
                unheard = [s for s in genre_songs if s not in user_songs]
                
                for song_id in unheard[:5]:
                    track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                    if not track_info.empty:
                        popularity = self.triplets_df[self.triplets_df['song_id'] == song_id]['play_count'].sum()
                        if popularity > 0:
                            track_info = track_info.iloc[0]
                            playlist.append({
                                'rank': len(playlist) + 1,
                                'artist': track_info['artist'],
                                'title': track_info['title'],
                                'song_id': song_id,
                                'playlist_type': 'Workout Mix',
                                'reason': f"High Energy {genre}"
                            })
                            
                            if len(playlist) >= 15:
                                return playlist
        
        return playlist

def main():
    """Exécute toutes les fonctionnalités bonus"""
    import os
    os.makedirs('results', exist_ok=True)
    
    try:
        spotify_features = SpotifyInspiredFeatures()
        
        logger.info("\n🎵 EXÉCUTION DES FONCTIONNALITÉS BONUS 🎵")
        logger.info("Inspirées de Spotify, Apple Music, etc.")
        print("="*60)
        
        # BONUS 1: Discover Weekly
        print("\n🔍 1. Discover Weekly...")
        discover_weekly = spotify_features.discover_weekly()
        if not discover_weekly.empty:
            print(f"✅ Discover Weekly généré: {len(discover_weekly)} tracks")
            print("   Top 3:")
            for _, row in discover_weekly.head(3).iterrows():
                print(f"   {row['rank']}. {row['artist']} - {row['title']}")
                print(f"      Raison: {row['reason']}")
        
        # BONUS 2: Artist Radio
        print("\n📻 2. Artist Radio...")
        # Prendre un artiste populaire
        popular_artists = spotify_features.tracks_df.merge(
            spotify_features.triplets_df, on='song_id'
        ).groupby('artist')['play_count'].sum().nlargest(10)
        
        if len(popular_artists) > 0:
            seed_artist = popular_artists.index[0]
            artist_radio = spotify_features.artist_radio(seed_artist)
            if not artist_radio.empty:
                print(f"✅ Artist Radio '{seed_artist}' généré: {len(artist_radio)} tracks")
                print("   Aperçu:")
                for _, row in artist_radio.head(3).iterrows():
                    print(f"   {row['position']}. {row['artist']} - {row['title']}")
                    print(f"      Catégorie: {row['category']}")
        
        # BONUS 3: Made for You Playlists
        print("\n🎯 3. Made for You Playlists...")
        made_for_you = spotify_features.made_for_you_playlists()
        
        total_playlists = len(made_for_you)
        total_tracks = sum(len(playlist) for playlist in made_for_you.values())
        print(f"✅ Made for You généré: {total_playlists} playlists, {total_tracks} tracks total")
        
        for playlist_name, playlist_data in made_for_you.items():
            if playlist_data:
                print(f"   📋 {playlist_name.replace('_', ' ').title()}: {len(playlist_data)} tracks")
        
        # Résumé final
        print("\n" + "="*60)
        print("🎉 TOUTES LES FONCTIONNALITÉS BONUS EXÉCUTÉES!")
        print("="*60)
        print("📁 Fichiers générés dans 'results/':")
        
        bonus_files = [f for f in os.listdir('results') if f.startswith('bonus_')]
        for file in bonus_files:
            file_path = os.path.join('results', file)
            size_kb = os.path.getsize(file_path) / 1024
            print(f"   📄 {file} ({size_kb:.1f} KB)")
        
        print(f"\n✨ {len(bonus_files)} fichiers bonus créés!")
        print("\n🎵 Fonctionnalités implémentées:")
        print("   ✅ 1. Discover Weekly (personnalisée)")
        print("   ✅ 2. Artist Radio (expansion contextuelle)")
        print("   ✅ 3. Made for You (playlists contextuelles)")
        
    except Exception as e:
        logger.error(f"❌ Erreur lors de l'exécution du bonus: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

2025-08-24 01:48:41,365 - INFO - Chargement des données de base...
2025-08-24 01:49:11,006 - INFO - ✅ Données de base chargées
2025-08-24 01:49:11,014 - INFO - 
🎵 EXÉCUTION DES FONCTIONNALITÉS BONUS 🎵
2025-08-24 01:49:11,015 - INFO - Inspirées de Spotify, Apple Music, etc.
2025-08-24 01:49:11,015 - INFO - === BONUS 1: DISCOVER WEEKLY ===
2025-08-24 01:49:11,026 - INFO - Génération Discover Weekly pour 9bd1a6b1d802d781b494172bdeffba7f32883e53...



🔍 1. Discover Weekly...


KeyboardInterrupt: 

In [42]:
# quick_bonus_features.py - Version simplifiée et rapide des fonctionnalités bonus
import pandas as pd
import numpy as np
import random
import os
from pathlib import Path

class QuickSpotifyFeatures:
    """Version simplifiée des fonctionnalités Spotify pour exécution rapide"""
    
    def __init__(self, sample_size=10000):
        """
        sample_size: Nombre de lignes à charger pour accélérer l'exécution
        """
        self.sample_size = sample_size
        print(f"🚀 Mode rapide: échantillon de {sample_size} interactions")
        self.load_sample_data()
    
    def load_sample_data(self):
        """Charge un échantillon des données pour exécution rapide"""
        print("📊 Chargement échantillon des données...")
        
        # Charger plus de données pour avoir plus de variété
        self.triplets_df = pd.read_csv(
            "train_triplets.txt", sep='\t', 
            names=['user_id', 'song_id', 'play_count'],
            nrows=self.sample_size
        )
        
        self.tracks_df = pd.read_csv(
            "p02_unique_tracks.txt", sep='<SEP>',
            names=['track_id', 'song_id', 'artist', 'title'],
            engine='python', nrows=5000  # Plus de tracks pour plus de variété
        )
        
        # Créer mappings rapides
        self.users = self.triplets_df['user_id'].unique()
        self.songs = self.triplets_df['song_id'].unique()
        
        print(f"✅ Données chargées: {len(self.triplets_df)} interactions, {len(self.users)} users, {len(self.songs)} songs")
    
    def quick_discover_weekly(self, user_id=None, n_tracks=15):
        """Version rapide de Discover Weekly"""
        print("\n🔍 BONUS 1: QUICK DISCOVER WEEKLY")
        print("-" * 40)
        
        if user_id is None:
            user_id = random.choice(self.users)
        
        print(f"👤 Génération pour utilisateur: {user_id[:10]}...")
        
        # 1. Ce que l'utilisateur a déjà écouté
        user_songs = set(self.triplets_df[
            self.triplets_df['user_id'] == user_id
        ]['song_id'])
        
        print(f"   🎵 Utilisateur a écouté: {len(user_songs)} chansons")
        
        # 2. Collaborative simple: utilisateurs similaires
        similar_users = self._find_similar_users_quick(user_id, top_k=10)
        
        # 3. Recommandations des utilisateurs similaires
        cf_recs = []
        for similar_user in similar_users:
            similar_songs = self.triplets_df[
                self.triplets_df['user_id'] == similar_user
            ]['song_id'].values
            
            # Songs pas encore écoutées
            new_songs = [s for s in similar_songs if s not in user_songs]
            cf_recs.extend(new_songs[:3])  # 3 par utilisateur similaire
        
        # 4. Popularité générale (songs trending)
        popular_songs = self.triplets_df.groupby('song_id')['play_count'].sum().nlargest(50)
        trending_recs = [s for s in popular_songs.index if s not in user_songs][:n_tracks//2]
        
        # 5. Mélange final
        all_recommendations = list(set(cf_recs + trending_recs))
        random.shuffle(all_recommendations)
        final_recs = all_recommendations[:n_tracks]
        
        # 6. Formatage
        playlist = []
        for i, song_id in enumerate(final_recs, 1):
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            
            if not track_info.empty:
                track = track_info.iloc[0]
                reason = "Similar Users" if song_id in cf_recs else "Trending"
                
                playlist.append({
                    'rank': i,
                    'artist': track['artist'],
                    'title': track['title'],
                    'song_id': song_id,
                    'reason': reason
                })
        
        # Sauvegarder
        if playlist:
            df = pd.DataFrame(playlist)
            filename = f'quick_discover_weekly_{user_id[:8]}.csv'
            filepath = f'results/{filename}'
            df.to_csv(filepath, index=False)
            
            print(f"✅ Discover Weekly créé: {len(playlist)} tracks")
            print(f"   💾 Sauvegardé: {filename}")
            print("   Top 3 découvertes:")
            for _, row in df.head(3).iterrows():
                print(f"   {row['rank']}. {row['artist']} - {row['title']} ({row['reason']})")
            
            return df
        else:
            print("❌ Aucune recommandation Discover Weekly générée")
            return pd.DataFrame()
    
    def quick_artist_radio(self, seed_artist=None, n_tracks=20):
        """Version rapide d'Artist Radio"""
        print("\n📻 BONUS 2: QUICK ARTIST RADIO")
        print("-" * 40)
        
        if seed_artist is None:
            # Prendre un artiste populaire au hasard
            popular_artists = self.triplets_df.merge(
                self.tracks_df, on='song_id'
            ).groupby('artist')['play_count'].sum().nlargest(20)
            seed_artist = random.choice(popular_artists.index.tolist())
        
        print(f"🎤 Radio pour: {seed_artist}")
        
        # 1. Songs de l'artiste original
        artist_songs = self.tracks_df[
            self.tracks_df['artist'] == seed_artist
        ]['song_id'].values
        
        original_hits = []
        for song_id in artist_songs[:n_tracks//4]:  # 25% de l'artiste original
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track = track_info.iloc[0]
                original_hits.append((song_id, track['artist'], track['title'], "Original Artist"))
        
        # 2. Utilisateurs qui écoutent cet artiste
        artist_fans = self.triplets_df[
            self.triplets_df['song_id'].isin(artist_songs)
        ]['user_id'].unique()
        
        print(f"   👥 Fans trouvés: {len(artist_fans)}")
        
        # 3. Autres artistes écoutés par ces fans
        fan_music = self.triplets_df[
            self.triplets_df['user_id'].isin(artist_fans)
        ].merge(self.tracks_df, on='song_id')
        
        similar_artists = fan_music[
            fan_music['artist'] != seed_artist
        ].groupby('artist')['play_count'].sum().nlargest(10)
        
        # 4. Prendre hits des artistes similaires
        similar_hits = []
        for artist in similar_artists.index[:5]:
            artist_tracks = self.tracks_df[self.tracks_df['artist'] == artist]
            for _, track in artist_tracks.head(2).iterrows():  # 2 par artiste
                similar_hits.append((track['song_id'], track['artist'], track['title'], "Similar Artist"))
        
        # 4. Quelques tracks populaires générales pour remplir
        general_popular = self.triplets_df.groupby('song_id')['play_count'].sum().nlargest(50)
        general_hits = []
        for song_id in general_popular.index:
            if len(general_hits) >= n_tracks//2:  # Remplir jusqu'à la moitié
                break
            track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
            if not track_info.empty:
                track = track_info.iloc[0]
                if track['artist'] != seed_artist:  # Éviter l'artiste seed
                    general_hits.append((song_id, track['artist'], track['title'], "Popular"))
        
        # 5. Combiner et garantir un minimum de tracks
        all_radio = original_hits + similar_hits + general_hits
        
        # Si pas assez, ajouter plus de tracks populaires
        if len(all_radio) < n_tracks:
            additional_popular = self.triplets_df.merge(
                self.tracks_df, on='song_id'
            ).groupby(['artist', 'title', 'song_id'])['play_count'].sum().nlargest(n_tracks*2)
            
            existing_songs = set([song_id for song_id, _, _, _ in all_radio])
            
            for (artist, title, song_id), _ in additional_popular.items():
                if song_id not in existing_songs and len(all_radio) < n_tracks:
                    all_radio.append((song_id, artist, title, "Filler"))
        
        # Mélanger et prendre exactement n_tracks
        random.shuffle(all_radio)
        final_radio = all_radio[:n_tracks]
        
        # 7. Formatage
        radio = []
        for i, (song_id, artist, title, category) in enumerate(final_radio, 1):
            radio.append({
                'position': i,
                'artist': artist,
                'title': title,
                'song_id': song_id,
                'category': category
            })
        
        # Sauvegarder
        if radio:
            df = pd.DataFrame(radio)
            safe_artist = seed_artist.replace('/', '_').replace(' ', '_').replace(';', '_')[:15]
            filename = f'quick_artist_radio_{safe_artist}.csv'
            filepath = f'results/{filename}'
            df.to_csv(filepath, index=False)
            
            print(f"✅ Artist Radio créé: {len(radio)} tracks")
            print(f"   💾 Sauvegardé: {filename}")
            print("   Composition:")
            categories = pd.Series([r['category'] for r in radio]).value_counts()
            for cat, count in categories.items():
                print(f"   • {cat}: {count} tracks")
            
            print("   Aperçu:")
            for _, row in df.head(3).iterrows():
                print(f"   {row['position']}. {row['artist']} - {row['title']} ({row['category']})")
            
            return df
        else:
            print("❌ Aucune radio générée")
            return pd.DataFrame()
    
    def quick_made_for_you(self, user_id=None):
        """Version rapide des playlists Made for You"""
        print("\n🎯 BONUS 3: QUICK MADE FOR YOU")
        print("-" * 40)
        
        if user_id is None:
            # Choisir un utilisateur avec plus d'historique
            user_song_counts = self.triplets_df.groupby('user_id').size()
            active_users = user_song_counts[user_song_counts >= 5].index  # Au moins 5 chansons
            if len(active_users) > 0:
                user_id = random.choice(active_users)
            else:
                user_id = random.choice(self.users)
        
        print(f"👤 Playlists pour utilisateur: {user_id[:10]}...")
        
        # Analyser l'utilisateur rapidement
        user_songs = self.triplets_df[self.triplets_df['user_id'] == user_id]
        print(f"   🎵 Historique: {len(user_songs)} chansons")
        
        user_artists = user_songs.merge(self.tracks_df, on='song_id')['artist'].value_counts()
        
        playlists = {}
        
        # 1. Time Capsule - Plus de vos artistes préférés
        time_capsule = []
        if len(user_artists) > 0:
            print(f"   🎤 Top artistes: {user_artists.head(3).index.tolist()}")
            
            for artist in user_artists.head(5).index:  # Top 5 artistes au lieu de 3
                artist_tracks = self.tracks_df[self.tracks_df['artist'] == artist]
                user_artist_songs = set(user_songs['song_id'])
                
                # Chansons de cet artiste pas encore écoutées
                new_songs = artist_tracks[~artist_tracks['song_id'].isin(user_artist_songs)]
                
                for _, track in new_songs.head(2).iterrows():  # 2 par artiste
                    time_capsule.append({
                        'rank': len(time_capsule) + 1,
                        'artist': track['artist'],
                        'title': track['title'],
                        'song_id': track['song_id'],
                        'reason': f"More from {artist}"
                    })
                    
                    if len(time_capsule) >= 10:  # Limite
                        break
        
        # Si Time Capsule toujours vide, remplir avec tracks populaires des mêmes genres
        if len(time_capsule) == 0:
            print("   🔄 Remplissage Time Capsule avec recommandations génériques...")
            popular_tracks = self.triplets_df.merge(
                self.tracks_df, on='song_id'
            ).groupby(['artist', 'title', 'song_id'])['play_count'].sum().nlargest(20)
            
            user_song_set = set(user_songs['song_id'])
            for (artist, title, song_id), _ in popular_tracks.items():
                if song_id not in user_song_set and len(time_capsule) < 8:
                    time_capsule.append({
                        'rank': len(time_capsule) + 1,
                        'artist': artist,
                        'title': title,
                        'song_id': song_id,
                        'reason': "Popular Pick"
                    })
        
        playlists['time_capsule'] = time_capsule
        
        # 2. Discovery Mix - Nouvelles découvertes
        all_user_songs = set(user_songs['song_id'])
        popular_unheard = self.triplets_df.groupby('song_id')['play_count'].sum().nlargest(100)  # Plus large pool
        
        discovery_mix = []
        for song_id in popular_unheard.index:
            if song_id not in all_user_songs:
                track_info = self.tracks_df[self.tracks_df['song_id'] == song_id]
                if not track_info.empty:
                    track = track_info.iloc[0]
                    discovery_mix.append({
                        'rank': len(discovery_mix) + 1,
                        'artist': track['artist'],
                        'title': track['title'],
                        'song_id': track['song_id'],
                        'reason': "New Discovery"
                    })
                    
                    if len(discovery_mix) >= 8:  # Limite à 8
                        break
        
        playlists['discovery_mix'] = discovery_mix
        
        # Sauvegarder les playlists
        saved_playlists = {}
        total_tracks = 0
        
        for playlist_name, playlist_data in playlists.items():
            if playlist_data:
                df = pd.DataFrame(playlist_data)
                filename = f'quick_made_for_you_{playlist_name}_{user_id[:8]}.csv'
                filepath = f'results/{filename}'
                df.to_csv(filepath, index=False)
                saved_playlists[playlist_name] = df
                total_tracks += len(playlist_data)
                
                print(f"✅ {playlist_name.replace('_', ' ').title()}: {len(playlist_data)} tracks")
                print(f"   💾 Sauvegardé: {filename}")
                if len(playlist_data) > 0:
                    print(f"   Exemple: {playlist_data[0]['artist']} - {playlist_data[0]['title']}")
            else:
                print(f"❌ {playlist_name.replace('_', ' ').title()}: Aucune track générée")
        
        if total_tracks > 0:
            print(f"🎯 Total Made for You: {total_tracks} tracks dans {len(saved_playlists)} playlists")
        
        return saved_playlists
    
    def _find_similar_users_quick(self, target_user, top_k=10):
        """Trouve rapidement des utilisateurs similaires"""
        target_songs = set(self.triplets_df[
            self.triplets_df['user_id'] == target_user
        ]['song_id'])
        
        if len(target_songs) == 0:
            return []
        
        similarities = []
        for user in self.users[:100]:  # Limite à 100 users pour la rapidité
            if user != target_user:
                user_songs = set(self.triplets_df[
                    self.triplets_df['user_id'] == user
                ]['song_id'])
                
                # Similarité Jaccard simple
                intersection = len(target_songs.intersection(user_songs))
                union = len(target_songs.union(user_songs))
                
                if union > 0:
                    similarity = intersection / union
                    similarities.append((user, similarity))
        
        # Trier et retourner top-k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return [user for user, _ in similarities[:top_k]]

def main():
    """Exécution rapide des 3 fonctionnalités bonus"""
    
    # Créer dossier results
    os.makedirs('results', exist_ok=True)
    
    print("⚡ MYSPOTIFY BONUS - VERSION RAPIDE ⚡")
    print("="*50)
    print("🚀 Optimisé pour exécution rapide mais avec plus de données!")
    print()
    
    try:
        # Initialiser avec échantillon plus grand pour plus de variété
        spotify = QuickSpotifyFeatures(sample_size=10000)  # Plus d'interactions
        
        # Exécuter les 3 bonus
        print("\n🎵 EXÉCUTION DES 3 FONCTIONNALITÉS BONUS...")
        
        discover_weekly = spotify.quick_discover_weekly()
        artist_radio = spotify.quick_artist_radio()
        made_for_you = spotify.quick_made_for_you()
        
        print("\n" + "="*50)
        print("🎉 BONUS RAPIDE TERMINÉ!")
        print("="*50)
        
        # Lister les fichiers créés
        all_files = os.listdir('results') if os.path.exists('results') else []
        bonus_files = [f for f in all_files if f.startswith('quick_')]
        
        print(f"📁 Fichiers créés: {len(bonus_files)}")
        
        total_size = 0
        for file in bonus_files:
            file_path = os.path.join('results', file)
            if os.path.exists(file_path):
                size_kb = os.path.getsize(file_path) / 1024
                total_size += size_kb
                print(f"   📄 {file} ({size_kb:.1f} KB)")
        
        print(f"\n📊 RÉSUMÉ:")
        print(f"   ✅ 3 fonctionnalités bonus implémentées!")
        print(f"   📄 {len(bonus_files)} fichiers CSV générés")
        print(f"   💾 Taille totale: {total_size:.1f} KB")
        print(f"   ⏱️  Temps d'exécution: ~1 minute")
        print(f"   🎵 Inspiré de: Spotify, Apple Music")
        
        if len(bonus_files) >= 3:
            print(f"\n🎉 BONUS COMPLET - Toutes les fonctionnalités marchent!")
        else:
            print(f"\n⚠️  Seulement {len(bonus_files)} fichiers générés sur 3+ attendus")
        
    except Exception as e:
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

⚡ MYSPOTIFY BONUS - VERSION RAPIDE ⚡
🚀 Optimisé pour exécution rapide mais avec plus de données!

🚀 Mode rapide: échantillon de 10000 interactions
📊 Chargement échantillon des données...
✅ Données chargées: 10000 interactions, 173 users, 7849 songs

🎵 EXÉCUTION DES 3 FONCTIONNALITÉS BONUS...

🔍 BONUS 1: QUICK DISCOVER WEEKLY
----------------------------------------
👤 Génération pour utilisateur: 2c42e65513...
   🎵 Utilisateur a écouté: 76 chansons
❌ Aucune recommandation Discover Weekly générée

📻 BONUS 2: QUICK ARTIST RADIO
----------------------------------------
🎤 Radio pour: Sammy Kershaw
   👥 Fans trouvés: 2
✅ Artist Radio créé: 20 tracks
   💾 Sauvegardé: quick_artist_radio_Sammy_Kershaw.csv
   Composition:
   • Filler: 17 tracks
   • Similar Artist: 2 tracks
   • Original Artist: 1 tracks
   Aperçu:
   1. The Knife - Silent Shout (Filler)
   2. Guns N' Roses - Street Of Dreams (Similar Artist)
   3. Paco De Lucia - Entre Dos Aguas (Filler)

🎯 BONUS 3: QUICK MADE FOR YOU
---------

In [43]:
# fix_discover_weekly.py - Correctif pour garantir 15 tracks dans Discover Weekly
import pandas as pd
import numpy as np
import random
import os

def generate_full_discover_weekly():
    """Génère une Discover Weekly complète avec 15 tracks garanties"""
    
    print("🔧 CORRECTIF DISCOVER WEEKLY")
    print("="*40)
    
    # Charger données
    print("📊 Chargement des données...")
    triplets_df = pd.read_csv(
        "train_triplets.txt", sep='\t', 
        names=['user_id', 'song_id', 'play_count'],
        nrows=10000
    )
    
    tracks_df = pd.read_csv(
        "p02_unique_tracks.txt", sep='<SEP>',
        names=['track_id', 'song_id', 'artist', 'title'],
        engine='python', nrows=5000
    )
    
    users = triplets_df['user_id'].unique()
    
    # Choisir un utilisateur avec un bon historique
    user_song_counts = triplets_df.groupby('user_id').size()
    active_users = user_song_counts[user_song_counts >= 10].index  # Au moins 10 chansons
    
    if len(active_users) > 0:
        target_user = random.choice(active_users)
    else:
        target_user = random.choice(users)
    
    print(f"👤 Utilisateur sélectionné: {target_user[:12]}...")
    
    # Analyser l'utilisateur
    user_songs = triplets_df[triplets_df['user_id'] == target_user]
    user_song_ids = set(user_songs['song_id'])
    
    print(f"🎵 Historique utilisateur: {len(user_song_ids)} chansons")
    
    # 1. Recommendations collaborative (utilisateurs similaires)
    print("🔍 Recherche utilisateurs similaires...")
    
    similar_users = find_similar_users(triplets_df, target_user, user_song_ids, top_k=20)
    print(f"👥 Utilisateurs similaires trouvés: {len(similar_users)}")
    
    cf_recommendations = []
    for similar_user in similar_users[:10]:  # Top 10 similaires
        similar_songs = triplets_df[triplets_df['user_id'] == similar_user]['song_id'].values
        new_songs = [s for s in similar_songs if s not in user_song_ids]
        cf_recommendations.extend(new_songs[:2])  # 2 par utilisateur similaire
    
    print(f"🤝 Recommandations collaborative: {len(set(cf_recommendations))}")
    
    # 2. Tracks populaires globales non écoutées
    print("📈 Recherche tracks populaires...")
    
    all_popularity = triplets_df.groupby('song_id')['play_count'].sum().sort_values(ascending=False)
    popular_unheard = [song for song in all_popularity.index if song not in user_song_ids]
    
    print(f"🔥 Tracks populaires non écoutées: {len(popular_unheard)}")
    
    # 3. Découverte par artistes (artistes populaires non écoutés)
    print("🎤 Découverte nouveaux artistes...")
    
    user_artists = set(user_songs.merge(tracks_df, on='song_id')['artist'])
    all_tracks_with_artists = triplets_df.merge(tracks_df, on='song_id')
    
    new_artist_songs = []
    artist_popularity = all_tracks_with_artists.groupby('artist')['play_count'].sum().sort_values(ascending=False)
    
    for artist in artist_popularity.index:
        if artist not in user_artists:
            artist_tracks = tracks_df[tracks_df['artist'] == artist]['song_id'].values
            unheard_from_artist = [s for s in artist_tracks if s not in user_song_ids]
            if unheard_from_artist:
                # Prendre la plus populaire de cet artiste
                best_song = None
                best_pop = 0
                for song in unheard_from_artist:
                    pop = triplets_df[triplets_df['song_id'] == song]['play_count'].sum()
                    if pop > best_pop:
                        best_pop = pop
                        best_song = song
                if best_song:
                    new_artist_songs.append(best_song)
                    if len(new_artist_songs) >= 10:  # Limite
                        break
    
    print(f"🆕 Nouveaux artistes: {len(new_artist_songs)}")
    
    # 4. Combiner toutes les recommandations
    all_recommendations = []
    
    # Ajouter collaborative (40%)
    cf_unique = list(set(cf_recommendations))
    random.shuffle(cf_unique)
    for song in cf_unique[:6]:  # 6 tracks collaborative
        all_recommendations.append((song, "Similar Users"))
    
    # Ajouter populaires (35%)
    for song in popular_unheard[:5]:  # 5 tracks populaires
        if song not in [r[0] for r in all_recommendations]:
            all_recommendations.append((song, "Trending"))
    
    # Ajouter nouveaux artistes (25%)
    for song in new_artist_songs[:4]:  # 4 nouveaux artistes
        if song not in [r[0] for r in all_recommendations]:
            all_recommendations.append((song, "New Artist"))
    
    # 5. Compléter jusqu'à 15 si nécessaire
    existing_songs = set([r[0] for r in all_recommendations])
    
    if len(all_recommendations) < 15:
        print(f"🔄 Complément nécessaire: {15 - len(all_recommendations)} tracks")
        
        # Ajouter plus de tracks populaires
        for song in all_popularity.index:
            if song not in existing_songs and song not in user_song_ids:
                all_recommendations.append((song, "Popular Filler"))
                if len(all_recommendations) >= 15:
                    break
    
    # 6. Formatage final
    print("💫 Formatage de la playlist...")
    
    playlist = []
    for i, (song_id, reason) in enumerate(all_recommendations[:15], 1):
        track_info = tracks_df[tracks_df['song_id'] == song_id]
        
        if not track_info.empty:
            track = track_info.iloc[0]
            playlist.append({
                'rank': i,
                'artist': track['artist'],
                'title': track['title'],
                'song_id': song_id,
                'reason': reason
            })
    
    # 7. Sauvegarder
    if playlist:
        df = pd.DataFrame(playlist)
        filename = f'fixed_discover_weekly_{target_user[:8]}.csv'
        filepath = f'results/{filename}'
        df.to_csv(filepath, index=False)
        
        print(f"\n✅ DISCOVER WEEKLY CORRIGÉ!")
        print(f"💾 Sauvegardé: {filename}")
        print(f"🎵 Tracks générées: {len(playlist)}/15")
        
        # Répartition des sources
        reason_counts = df['reason'].value_counts()
        print(f"\n📊 Répartition des sources:")
        for reason, count in reason_counts.items():
            percentage = (count / len(df)) * 100
            print(f"   • {reason}: {count} tracks ({percentage:.1f}%)")
        
        print(f"\n🎧 Aperçu de votre Discover Weekly:")
        for _, row in df.head(5).iterrows():
            print(f"   {row['rank']:2d}. {row['artist']} - {row['title']}")
            print(f"       Source: {row['reason']}")
        
        if len(df) > 5:
            print(f"   ... et {len(df)-5} autres découvertes!")
        
        return df
    
    return None

def find_similar_users(triplets_df, target_user, target_songs, top_k=10):
    """Trouve des utilisateurs similaires plus efficacement"""
    
    similarities = []
    target_songs_set = set(target_songs)
    
    # Échantillonner les utilisateurs pour aller plus vite
    all_users = triplets_df['user_id'].unique()
    sample_users = random.sample(list(all_users), min(200, len(all_users)))
    
    for user in sample_users:
        if user != target_user:
            user_songs = set(triplets_df[triplets_df['user_id'] == user]['song_id'])
            
            # Similarité Jaccard
            intersection = len(target_songs_set.intersection(user_songs))
            union = len(target_songs_set.union(user_songs))
            
            if union > 0 and intersection > 0:  # Au moins une chanson en commun
                similarity = intersection / union
                similarities.append((user, similarity))
    
    # Trier et retourner top-k
    similarities.sort(key=lambda x: x[1], reverse=True)
    return [user for user, _ in similarities[:top_k]]

def main():
    """Exécute le correctif"""
    os.makedirs('results', exist_ok=True)
    
    try:
        result = generate_full_discover_weekly()
        
        if result is not None:
            print(f"\n🎉 CORRECTIF RÉUSSI!")
            print(f"📁 Fichier corrigé disponible dans results/")
            print(f"🔄 Remplace le fichier discover_weekly précédent")
        else:
            print(f"\n❌ Échec du correctif")
            
    except Exception as e:
        print(f"❌ Erreur: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()

🔧 CORRECTIF DISCOVER WEEKLY
📊 Chargement des données...
👤 Utilisateur sélectionné: 2c42e6551311...
🎵 Historique utilisateur: 76 chansons
🔍 Recherche utilisateurs similaires...
👥 Utilisateurs similaires trouvés: 11
🤝 Recommandations collaborative: 19
📈 Recherche tracks populaires...
🔥 Tracks populaires non écoutées: 7773
🎤 Découverte nouveaux artistes...
🆕 Nouveaux artistes: 10
💫 Formatage de la playlist...

✅ DISCOVER WEEKLY CORRIGÉ!
💾 Sauvegardé: fixed_discover_weekly_2c42e655.csv
🎵 Tracks générées: 4/15

📊 Répartition des sources:
   • New Artist: 4 tracks (100.0%)

🎧 Aperçu de votre Discover Weekly:
   12. Nick Lowe - All Men Are Liars
       Source: New Artist
   13. Foo Fighters - Still
       Source: New Artist
   14. Operation Ivy - Knowledge
       Source: New Artist
   15. Michael Cera & Ellen Page - Anyone Else But You
       Source: New Artist

🎉 CORRECTIF RÉUSSI!
📁 Fichier corrigé disponible dans results/
🔄 Remplace le fichier discover_weekly précédent
