# Test 

In [107]:
import pandas as pd

df_test = pd.read_parquet("data/test.parquet")

### Function

In [None]:
# It is an outlier, so we remove it and will put arbitrarily a value to it later
data_clean = df_test[df_test['userId'] != '1261737']

NameError: name 'df_test' is not defined

In [135]:
# Créer une ligne pour user 1261737
new_row = pd.DataFrame({
    'userId': ['1261737'],
    'will_churn_prediction': [0]
})

# Ajouter à ton submission
submission_complete = pd.concat([submission, new_row], ignore_index=True)

print(f"Submission before: {len(submission)} rows")
print(f"Submission after: {len(submission_complete)} rows")

# Vérifier
print(f"\nUser 1261737 in submission: {'1261737' in submission_complete['userId'].values}")

Submission before: 43608 rows
Submission after: 43609 rows

User 1261737 in submission: True


In [None]:
"""
PIPELINE COMPLÈTE : FEATURES TEST SET + PRÉDICTIONS
"""

import pandas as pd
import xgboost as xgb

print("="*80)
print("PIPELINE COMPLÈTE : TRAIN → TEST → PRÉDICTIONS")
print("="*80)
print()

# ============================================================================
# ÉTAPE 1 : CHARGER LE MODÈLE ENTRAÎNÉ
# ============================================================================

print("1. Chargement du modèle entraîné...")

# Option B : Si tu as le modèle en mémoire
model = model_XGB  # (celui retourné par XGB_training)

print("✅ Modèle chargé")
print()

# ============================================================================
# ÉTAPE 2 : CHARGER LE TEST SET
# ============================================================================

print("2. Chargement du test set...")

df_test = data_clean.copy()  # Ton fichier test

print(f"Test set shape: {df_test.shape}")
print(f"Users test: {df_test['userId'].nunique()}")
print()

# ============================================================================
# ÉTAPE 3 : CRÉER LES FEATURES PAR SESSION (TEST SET)
# ============================================================================

print("3. Génération des features par session (test set)...")

# Colonnes nécessaires
required_cols = ['userId', 'sessionId', 'time', 'page', 'registration', 'level', 'userAgent', 'length', 'artist']

# IMPORTANT: Le test set N'A PAS de churn events
# Donc on ne filtre PAS "Cancellation Confirmation"
df_test_clean = df_test[required_cols].copy()

# Générer les features
features_test = create_features_per_session_optimized(
    df_test_clean,
    batch_size=10000,
    verbose=True
)

print(f"✅ Features test générées: {features_test.shape}")
print()

# ============================================================================
# ÉTAPE 4 : AJOUTER LES PRÉFÉRENCES MUSICALES (TEST SET)
# ============================================================================

print("4. Calcul des préférences musicales (test set)...")

music_prefs_test = create_user_preferences_improved(
    df_test,
    encode=True,
    verbose=True
)

# Merger
features_test = features_test.merge(
    music_prefs_test[[
        'userId',
        'favorite_genre_encoded',
        'favorite_artist_encoded',
        'total_songs_listened',
        'unique_artists',
        'unique_genres'
    ]],
    on='userId',
    how='left'
)

# Remplir les NaN
features_test['favorite_genre_encoded'].fillna(-1, inplace=True)
features_test['favorite_artist_encoded'].fillna(-1, inplace=True)
features_test['total_songs_listened'].fillna(0, inplace=True)
features_test['unique_artists'].fillna(0, inplace=True)
features_test['unique_genres'].fillna(0, inplace=True)

print(f"✅ Features test avec musique: {features_test.shape}")
print()

# ============================================================================
# ÉTAPE 5 : PRÉPARER X_test (MÊMES COLONNES QUE TRAIN)
# ============================================================================

print("5. Préparation de X_test...")

# Colonnes à exclure
exclude_cols = [
    'userId',
    'sessionId',
    'session_time',
    'favorite_genre',      # Texte
    'favorite_artist'      # Texte
]

# Enlever les colonnes d'exclusion
features_test_clean = features_test.drop(columns=exclude_cols, errors='ignore')


print(f"✅ X_test shape: {X_test.shape}")
print(f"Colonnes: {X_test.columns.tolist()}")
print()

# ============================================================================
# ÉTAPE 6 : VÉRIFIER QUE LES COLONNES MATCHENT AVEC LE TRAIN
# ============================================================================

print("6. Vérification des colonnes...")

# Récupérer les colonnes utilisées pendant l'entraînement
# Option A : Si tu as sauvegardé les colonnes
# train_cols = pd.read_csv('train_columns.txt', header=None)[0].tolist()

# Option B : Utiliser les feature_names du modèle XGBoost
train_cols = model.get_booster().feature_names

# Vérifier les colonnes manquantes/en trop
missing_cols = set(train_cols) - set(X_test.columns)
extra_cols = set(X_test.columns) - set(train_cols)

if missing_cols:
    print(f"⚠️ Colonnes manquantes dans test: {missing_cols}")
    # Ajouter les colonnes manquantes avec des 0
    for col in missing_cols:
        X_test[col] = 0

if extra_cols:
    print(f"⚠️ Colonnes en trop dans test: {extra_cols}")
    # Enlever les colonnes en trop
    X_test = X_test.drop(columns=list(extra_cols))

# Réordonner les colonnes dans le même ordre que train
X_test = X_test[train_cols]

print(f"✅ Colonnes alignées: {X_test.shape}")
print()

# ============================================================================
# ÉTAPE 7 : PRÉDICTIONS
# ============================================================================

print("7. Génération des prédictions...")

# Prédire les probabilités
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Prédire les classes (avec seuil par défaut 0.5)
y_pred = model.predict(X_test)

# Ajouter au DataFrame
predictions = features_test[['userId', 'sessionId', 'session_time']].copy()
predictions['churn_probability'] = y_pred_proba
predictions['will_churn_prediction'] = y_pred

print(f"✅ Prédictions générées: {predictions.shape}")
print()
print("Aperçu des prédictions:")
print(predictions.head(20))
print()
print("Distribution des prédictions:")
print(predictions['will_churn_prediction'].value_counts())
print()

Unnamed: 0,userId,sessionId,session_time,churn_probability,will_churn_prediction
0,1000655,19711.0,2018-10-03 09:00:59,0.473175,0
1,1000655,36911.0,2018-10-05 22:06:38,0.758766,1
2,1000655,49298.0,2018-10-07 19:32:55,0.799956,1
3,1000655,54275.0,2018-10-18 04:38:45,0.722955,1
4,1000655,96058.0,2018-10-21 14:25:04,0.733780,1
...,...,...,...,...,...
43604,1999996,136422.0,2018-10-29 19:16:14,0.807168,1
43605,1999996,137240.0,2018-10-30 15:24:07,0.822897,1
43606,1999996,140582.0,2018-11-08 00:34:07,0.774387,1
43607,1999996,170036.0,2018-11-13 09:01:28,0.729282,1


In [137]:
# Renommer les colonnes pour Kaggle
submission_final = submission_complete.rename(columns={
    'userId': 'id',
    'will_churn_prediction': 'target'
})

# Vérifier
print("Submission structure:")
print(submission_final.head())
print(f"\nColumns: {submission_final.columns.tolist()}")
print(f"Shape: {submission_final.shape}")

# Vérifier pas de null
print(f"\nNull values: {submission_final.isnull().sum()}")

print("\n✓ Submission ready with columns: id, target")

Submission structure:
        id  sessionId        session_time  churn_probability  target
0  1000655    19711.0 2018-10-03 09:00:59           0.473175       0
1  1000655    36911.0 2018-10-05 22:06:38           0.758766       1
2  1000655    49298.0 2018-10-07 19:32:55           0.799956       1
3  1000655    54275.0 2018-10-18 04:38:45           0.722955       1
4  1000655    96058.0 2018-10-21 14:25:04           0.733780       1

Columns: ['id', 'sessionId', 'session_time', 'churn_probability', 'target']
Shape: (43609, 5)

Null values: id                   0
sessionId            1
session_time         1
churn_probability    1
target               0
dtype: int64

✓ Submission ready with columns: id, target


In [139]:
# Grouper par userId : 1 si au moins un 1, sinon 0
submission_final = submission_final.groupby('id').agg({
    'target': 'max'  # max = 1 si au moins un 1, sinon 0
}).reset_index()

print(f"After deduplication: {len(submission_final)} unique users")
print(f"\nTarget distribution:")
print(submission_final['target'].value_counts())

# Vérifier plus de duplicates
print(f"\nDuplicates: {submission_final['id'].duplicated().sum()}")


After deduplication: 2904 unique users

Target distribution:
target
1    2825
0      79
Name: count, dtype: int64

Duplicates: 0


In [140]:
# 1. Identifier les userId manquants
expected_users = df_test['userId'].unique()
submitted_users = submission_final['id'].unique()

print(f"Expected users: {len(expected_users)}")
print(f"Submitted users: {len(submitted_users)}")

# Trouver les manquants
missing_users = set(expected_users) - set(submitted_users)
print(f"\nMissing users: {len(missing_users)}")
print(f"Missing user IDs: {list(missing_users)[:10]}")  # Afficher les 10 premiers

# 2. Créer des lignes pour les utilisateurs manquants (avec target=0)
missing_rows = pd.DataFrame({
    'id': list(missing_users),
    'target': 0
})

# 3. Ajouter à la submission
submission_complete = pd.concat([submission_final, missing_rows], ignore_index=True)

print(f"\nBefore adding missing: {len(submission_final)} rows")
print(f"After adding missing: {len(submission_complete)} rows")

# 4. Vérifier qu'on a bien 2904
assert len(submission_complete) == 2904, f"Expected 2904, got {len(submission_complete)}"

# 5. Trier par id (optionnel mais propre)
submission_complete = submission_complete.sort_values('id').reset_index(drop=True)

# 6. Sauvegarder
submission_complete.to_csv('submission.csv', index=False)
print("\n✓ Complete submission with all 2904 users saved!")

Expected users: 2903
Submitted users: 2904

Missing users: 0
Missing user IDs: []

Before adding missing: 2904 rows
After adding missing: 2904 rows

✓ Complete submission with all 2904 users saved!


# Train

The competition is to be performed in groups of two. You'll have a report of 4 pages to submit by december 14th, presenting the methods you tested and used. For the defense you'll get 8 minutes of presentations + 7 minutes of questions, including on question on the labs, that may involve writing a code snippet.


Churn prediction 25/26
**Predict churn prediction from streaming service logs**

The goal of the competition is to predict whether or not some users (whose user ids are in the test file) will **churn in the window of 10 days that follows the given observations (ie after "2018-11-20")**. We consider that a user churns when they visit the page **'Cancellation Confirmation'** (edited) 


In [1]:
import pandas as pd

df_train_test = pd.read_parquet("data/test.parquet")
df_train = pd.read_parquet("data/train.parquet")

### Creation of target feature

In [3]:
# Supprimer users qui churnent dans les 7 premiers jours
churners = df_train[df_train['page'] == 'Cancellation Confirmation'].copy()
churners['days_since_start'] = (churners['time'] - df_train['time'].min()).dt.days
early_churners = churners[churners['days_since_start'] <= 7]['userId'].unique()

df_train = df_train[~df_train['userId'].isin(early_churners)]

print(f"Avant: {df_train['userId'].nunique()}, Après: {df_train['userId'].nunique()}, Supprimé: {len(early_churners)}")

Avant: 18304, Après: 18304, Supprimé: 836


In [4]:
# Creating cancellation in following ten days column

import numpy as np

cancellation_events = df_train[df_train['page'] == 'Cancellation Confirmation'].copy()
cancellation_events = cancellation_events[['userId', 'time']].rename(columns={'time': 'churn_time'})

df_train = df_train.merge(cancellation_events, on='userId', how='left')

df_train['days_until_churn'] = (df_train['churn_time'] - df_train['time']).dt.total_seconds() / (24 * 3600)

df_train['will_churn_10days'] = ((df_train['days_until_churn'] >= 0) & 
                                   (df_train['days_until_churn'] <= 10)).astype(int)

df_train = df_train.drop(['churn_time', 'days_until_churn'], axis=1)

In [5]:
df_train.describe() #max time is 2018-11-20 so we are going to keep only the rows that are at least 10 days old OR that have churn True

df_train = df_train[(df_train["time"] < "2018-11-10" )| (df_train["will_churn_10days"] == 1)]

## Preliminary EDA

In [6]:
import pandas as pd
import numpy as np

# Identifier les churners et leur date de churn
churners = df_train[df_train['page'] == 'Cancellation Confirmation'].copy()
churn_dates = churners.groupby('userId')['time'].min().reset_index()
churn_dates.columns = ['userId', 'churn_date']

print(f"Nombre d'utilisateurs qui ont churné : {len(churn_dates)}")
print(f"Taux de churn global : {len(churn_dates) / df_train['userId'].nunique() * 100:.2f}%")
churn_dates['churn_date'] = churn_dates['churn_date'].dt.date
churn_dates

Nombre d'utilisateurs qui ont churné : 3435
Taux de churn global : 19.03%


Unnamed: 0,userId,churn_date
0,1000025,2018-10-18
1,1000083,2018-10-12
2,1000280,2018-11-13
3,1000353,2018-10-22
4,1000503,2018-10-13
...,...,...
3430,1998845,2018-10-24
3431,1998879,2018-10-21
3432,1999022,2018-11-04
3433,1999847,2018-10-18


## Feature preprocessing

In [7]:
print(f"Shape avant nettoyage: {df_train.shape}")
print(f"Mémoire avant: {df_train.memory_usage(deep=True).sum() / 1e9:.2f} GB")

# Garder uniquement les colonnes nécessaires
colonnes_necessaires = ['userId', 'sessionId', 'level', 'userAgent', 'time', 'page', 'length', 'registration']
df_train = df_train[colonnes_necessaires].copy()

print(f"Shape après nettoyage: {df_train.shape}")
print(f"Mémoire après: {df_train.memory_usage(deep=True).sum() / 1e9:.2f} GB")

# Supprimer le leakage
df_train = df_train[~df_train['page'].isin(['Cancel', 'Cancellation Confirmation'])]

Shape avant nettoyage: (14478407, 20)
Mémoire avant: 12.33 GB
Shape après nettoyage: (14478407, 8)
Mémoire après: 5.21 GB


## Sous le capot

### Current function

In [17]:
"""
Fonction ULTRA-OPTIMISÉE par SESSION avec 24 features
Version RAPIDE : ~2 minutes pour 1M sessions

OPTIMISATIONS:
- Feature 25 (avg_time_in_session_without_music_14d) ENLEVÉE (trop lente)
- Features 23, 24 SIMPLIFIÉES (ratios totaux au lieu de moyennes par session)
- consistency_score SIMPLIFIÉ
- consecutive_days_inactive VECTORISÉ
- has_downgraded VECTORISÉ
- Pré-calculs maximisés
"""

import pandas as pd
import numpy as np
from datetime import timedelta
import gc


def create_features_per_session_optimized(
    df: pd.DataFrame,
    batch_size: int = 10000,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Calcule 24 features ULTRA-RAPIDE à chaque nouvelle session
    
    Colonnes requises: userId, sessionId, time, page, registration, level, userAgent, length
    
    Returns:
        DataFrame avec (userId, sessionId, session_time, 24 features)
    """
    
    if verbose:
        print("="*80)
        print("CALCUL DES 24 FEATURES PAR SESSION (VERSION ULTRA-OPTIMISÉE)")
        print("="*80)
        print()
    
    # ========================================================================
    # PRÉPARATION
    # ========================================================================
    
    df = df.copy()
    df['time'] = pd.to_datetime(df['time'])
    df['date'] = df['time'].dt.date
    df['registration'] = pd.to_datetime(df['registration'])
    df = df.sort_values(['userId', 'sessionId', 'time']).reset_index(drop=True)
    
    # Convertir length en float (en secondes)
    df['length'] = pd.to_numeric(df['length'], errors='coerce').fillna(0)
    
    if verbose:
        print("Création des flags...")
    
    # Flags (vectorisé)
    df['is_nextsong'] = (df['page'] == 'NextSong').astype(np.int8)
    df['is_thumbs_up'] = (df['page'] == 'Thumbs Up').astype(np.int8)
    df['is_thumbs_down'] = (df['page'] == 'Thumbs Down').astype(np.int8)
    df['is_error'] = (df['page'] == 'Error').astype(np.int8)
    df['is_help'] = (df['page'] == 'Help').astype(np.int8)
    df['is_settings'] = (df['page'] == 'Settings').astype(np.int8)
    df['is_downgrade'] = (df['page'] == 'Downgrade').astype(np.int8)
    
    # Device type (simplifié)
    df['is_mobile_action'] = df['userAgent'].str.contains('iPhone|iPad|Android', case=False, na=False).astype(np.int8)
    
    if verbose:
        print(f"Dataset: {df.shape}")
        print(f"Users: {df['userId'].nunique()}")
        print(f"Sessions: {df['sessionId'].nunique()}")
        print()
    
    # ========================================================================
    # IDENTIFIER LES SESSIONS
    # ========================================================================
    
    if verbose:
        print("Identification des sessions...")
    
    session_starts = df.groupby(['userId', 'sessionId']).agg({
        'time': 'min',
        'registration': 'first'
    }).reset_index()
    session_starts.columns = ['userId', 'sessionId', 'session_time', 'registration']
    
    total_sessions = len(session_starts)
    num_batches = (total_sessions + batch_size - 1) // batch_size
    
    if verbose:
        print(f"Total sessions: {total_sessions:,}")
        print(f"Batch size: {batch_size:,}")
        print(f"Nombre de batches: {num_batches}")
        print()
    
    # ========================================================================
    # PRÉ-CALCUL : GROUPBY PAR USER
    # ========================================================================
    
    if verbose:
        print("Pré-calcul des groupes par user...")
    
    user_groups = {user_id: group for user_id, group in df.groupby('userId', sort=False)}
    
    # ========================================================================
    # TRAITEMENT PAR BATCH DE SESSIONS
    # ========================================================================
    
    all_features = []
    
    for batch_idx in range(num_batches):
        if verbose:
            print(f"Batch {batch_idx + 1}/{num_batches}...", end=' ')
        
        start_idx = batch_idx * batch_size
        end_idx = min((batch_idx + 1) * batch_size, total_sessions)
        batch_sessions = session_starts.iloc[start_idx:end_idx]
        
        batch_features = []
        
        for idx, row in batch_sessions.iterrows():
            user_id = row['userId']
            session_id = row['sessionId']
            session_time = row['session_time']
            registration = row['registration']
            
            user_data = user_groups[user_id]
            user_data_before = user_data[user_data['time'] < session_time]
            
            if len(user_data_before) == 0:
                # Première session : features par défaut
                features = {
                    'userId': user_id,
                    'sessionId': session_id,
                    'session_time': session_time,
                    'negative_actions_last7d_vs_avg': 1.0,
                    'consistency_score': 0.0,
                    'consecutive_days_inactive': 0,
                    'session_frequency_change': 1.0,
                    'sessions_last7d_vs_avg': 1.0,
                    'thumbs_down_last_14days': 0,
                    'days_without_thumbs_up': (session_time - registration).days,
                    'has_downgrade_last_15days': 0,
                    'activity_trend_last_14days': 1.0,
                    'songs_listened_last_14days': 0,
                    'help_visits_14d': 0,
                    'error_rate_14d': 0.0,
                    'settings_visits_14d': 0,
                    'frustration_score': 0.0,
                    'is_paid': 0,
                    'has_downgraded': 0,
                    'mobile_usage_ratio_14d': 0.5,
                    'days_since_registration': (session_time - registration).days,
                    'has_ever_paid': 0,
                    'thumbs_down_lifetime': 0,
                    'thumbs_up_lifetime': 0,
                    'frustration_score_lifetime': 0.0,
                    'listening_time_ratio_7d_vs_lifetime': 1.0
                }
                batch_features.append(features)
                continue
            
            # Fenêtres temporelles
            window_14d_start = session_time - timedelta(days=14)
            window_7d_start = session_time - timedelta(days=7)
            window_15d_start = session_time - timedelta(days=15)
            
            data_14d = user_data_before[(user_data_before['time'] >= window_14d_start)]
            data_7d = user_data_before[(user_data_before['time'] >= window_7d_start)]
            data_week2 = user_data_before[(user_data_before['time'] >= window_14d_start) & 
                                          (user_data_before['time'] < window_7d_start)]
            data_15d = user_data_before[(user_data_before['time'] >= window_15d_start)]
            data_lifetime = user_data_before
            
            first_activity = data_lifetime['time'].min()
            lifetime_days = (session_time - first_activity).days + 1
            
            # ============================================================
            # FEATURES ORIGINALES (OPTIMISÉES)
            # ============================================================
            
            # 1. negative_actions_last7d_vs_avg
            thumbs_down_7d = data_7d['is_thumbs_down'].sum()
            negative_7d = thumbs_down_7d + data_7d['is_error'].sum() + data_7d['is_help'].sum() + data_7d['is_settings'].sum()
            negative_lifetime = (data_lifetime['is_thumbs_down'].sum() + 
                                data_lifetime['is_error'].sum() + 
                                data_lifetime['is_help'].sum() + 
                                data_lifetime['is_settings'].sum())
            actions_7d = len(data_7d)
            actions_lifetime = len(data_lifetime)
            negative_7d_norm = negative_7d / max(actions_7d, 1)
            negative_lifetime_norm = negative_lifetime / max(actions_lifetime, 1)
            negative_actions_last7d_vs_avg = negative_7d_norm / max(negative_lifetime_norm, 0.001)
            
            # 2. consistency_score (SIMPLIFIÉ - pas de date_range)
            if len(data_14d) > 0:
                daily_sessions = data_14d.groupby('date')['sessionId'].nunique()
                mean_sessions = daily_sessions.mean()
                std_sessions = daily_sessions.std()
                consistency_score = std_sessions / mean_sessions if mean_sessions > 0 else 0.0
            else:
                consistency_score = 0.0
            
            # 3. consecutive_days_inactive (VECTORISÉ)
            if len(data_14d) > 0:
                active_dates_14d = sorted(data_14d['date'].unique())
                if len(active_dates_14d) > 1:
                    # Calculer les gaps entre dates actives
                    active_dates_series = pd.Series(active_dates_14d)
                    date_diffs = active_dates_series.diff().dt.days
                    consecutive_days_inactive = int(date_diffs.max() - 1) if len(date_diffs) > 0 else 0
                    consecutive_days_inactive = max(0, consecutive_days_inactive)
                else:
                    consecutive_days_inactive = 0
            else:
                consecutive_days_inactive = 14
            
            # 4. session_frequency_change
            sessions_week1 = data_7d['sessionId'].nunique()
            sessions_week2 = data_week2['sessionId'].nunique()
            session_frequency_change = sessions_week1 / max(sessions_week2, 1)
            
            # 5. sessions_last7d_vs_avg
            sessions_7d = data_7d['sessionId'].nunique()
            sessions_lifetime = data_lifetime['sessionId'].nunique()
            sessions_lifetime_avg = sessions_lifetime / max(lifetime_days, 1)
            sessions_7d_avg = sessions_7d / 7
            sessions_last7d_vs_avg = sessions_7d_avg / max(sessions_lifetime_avg, 0.01)
            
            # 6. thumbs_down_last_14days
            thumbs_down_last_14days = data_14d['is_thumbs_down'].sum()
            
            # 7. days_without_thumbs_up
            thumbs_up_data = data_lifetime[data_lifetime['is_thumbs_up'] == 1]
            if len(thumbs_up_data) > 0:
                last_thumbs_up = thumbs_up_data['time'].max()
                days_without_thumbs_up = (session_time - last_thumbs_up).days
            else:
                days_without_thumbs_up = (session_time - registration).days
            
            # 8. has_downgrade_last_15days
            has_downgrade_last_15days = int(data_15d['is_downgrade'].sum() > 0)
            
            # 9. activity_trend_last_14days
            songs_week1 = data_7d['is_nextsong'].sum()
            songs_week2 = data_week2['is_nextsong'].sum()
            activity_trend_last_14days = songs_week1 / max(songs_week2, 1)
            
            # 10. songs_listened_last_14days
            songs_listened_last_14days = data_14d['is_nextsong'].sum()
            
            # 11. help_visits_14d
            help_visits_14d = data_14d['is_help'].sum()
            
            # 12. error_rate_14d
            errors_14d = data_14d['is_error'].sum()
            total_actions_14d = len(data_14d)
            error_rate_14d = errors_14d / max(total_actions_14d, 1)
            
            # 13. settings_visits_14d
            settings_visits_14d = data_14d['is_settings'].sum()
            
            # 14. frustration_score
            frustration_score = (
                thumbs_down_last_14days * 2.0 +
                help_visits_14d * 1.5 +
                settings_visits_14d * 1.5 +
                has_downgrade_last_15days * 3.0
            )
            
            # 15. is_paid
            current_level = data_lifetime.iloc[-1]['level'] if len(data_lifetime) > 0 else 'free'
            is_paid = 1 if current_level == 'paid' else 0
            
            # 16. has_downgraded (VECTORISÉ)
            levels_series = data_lifetime.sort_values('time')['level']
            level_changes = levels_series != levels_series.shift()
            transitions = levels_series[level_changes]
            has_downgraded = 0
            if len(transitions) > 1:
                for i in range(len(transitions) - 1):
                    if transitions.iloc[i] == 'paid' and transitions.iloc[i+1] == 'free':
                        has_downgraded = 1
                        break
            
            # 17. mobile_usage_ratio_14d
            mobile_actions_14d = data_14d['is_mobile_action'].sum()
            mobile_usage_ratio_14d = mobile_actions_14d / max(total_actions_14d, 1)
            
            # 18. days_since_registration
            days_since_registration = (session_time - registration).days
            
            # 19. has_ever_paid
            has_ever_paid = 1 if (data_lifetime['level'] == 'paid').any() else 0
            
            # ============================================================
            # NOUVELLES FEATURES (5 au lieu de 6)
            # ============================================================
            
            # 20. thumbs_down_lifetime
            thumbs_down_lifetime = data_lifetime['is_thumbs_down'].sum()
            
            # 21. thumbs_up_lifetime
            thumbs_up_lifetime = data_lifetime['is_thumbs_up'].sum()
            
            # 22. frustration_score_lifetime
            help_visits_lifetime = data_lifetime['is_help'].sum()
            settings_visits_lifetime = data_lifetime['is_settings'].sum()
            has_ever_downgraded = 1 if data_lifetime['is_downgrade'].sum() > 0 else 0
            
            frustration_score_lifetime = (
                thumbs_down_lifetime * 2.0 +
                help_visits_lifetime * 1.5 +
                settings_visits_lifetime * 1.5 +
                has_ever_downgraded * 3.0
            )
            
            # 23. listening_time_ratio_7d_vs_lifetime (SIMPLIFIÉ - ratio des totaux)
            # Au lieu de moyennes par session, ratio des temps totaux
            total_listening_7d = data_7d[data_7d['is_nextsong'] == 1]['length'].sum()
            total_listening_lifetime = data_lifetime[data_lifetime['is_nextsong'] == 1]['length'].sum()
            
            # Normaliser par nombre de jours
            listening_per_day_7d = total_listening_7d / 7
            listening_per_day_lifetime = total_listening_lifetime / max(lifetime_days, 1)
            
            listening_time_ratio_7d_vs_lifetime = listening_per_day_7d / max(listening_per_day_lifetime, 1.0)
            
            # Feature 24 (avg_listening_time_per_session_last7d) fusionnée dans 23
            # Feature 25 (avg_time_in_session_without_music_14d) ENLEVÉE (trop lente)
            
            # ============================================================
            # ASSEMBLER (24 features)
            # ============================================================
            
            features = {
                'userId': user_id,
                'sessionId': session_id,
                'session_time': session_time,
                'negative_actions_last7d_vs_avg': negative_actions_last7d_vs_avg,
                'consistency_score': consistency_score,
                'consecutive_days_inactive': consecutive_days_inactive,
                'session_frequency_change': session_frequency_change,
                'sessions_last7d_vs_avg': sessions_last7d_vs_avg,
                'thumbs_down_last_14days': int(thumbs_down_last_14days),
                'days_without_thumbs_up': days_without_thumbs_up,
                'has_downgrade_last_15days': has_downgrade_last_15days,
                'activity_trend_last_14days': activity_trend_last_14days,
                'songs_listened_last_14days': int(songs_listened_last_14days),
                'help_visits_14d': int(help_visits_14d),
                'error_rate_14d': error_rate_14d,
                'settings_visits_14d': int(settings_visits_14d),
                'frustration_score': frustration_score,
                'is_paid': is_paid,
                'has_downgraded': has_downgraded,
                'mobile_usage_ratio_14d': mobile_usage_ratio_14d,
                'days_since_registration': days_since_registration,
                'has_ever_paid': has_ever_paid,
                'thumbs_down_lifetime': int(thumbs_down_lifetime),
                'thumbs_up_lifetime': int(thumbs_up_lifetime),
                'frustration_score_lifetime': frustration_score_lifetime,
                'listening_time_ratio_7d_vs_lifetime': listening_time_ratio_7d_vs_lifetime
            }
            
            batch_features.append(features)
        
        if batch_features:
            batch_df = pd.DataFrame(batch_features)
            all_features.append(batch_df)
        
        if verbose:
            print(f"✓ ({len(batch_features)} sessions)")
        
        del batch_features
        gc.collect()
    
    # ========================================================================
    # COMBINER
    # ========================================================================
    
    if verbose:
        print()
        print("Combinaison finale...")
    
    final_df = pd.concat(all_features, ignore_index=True)
    
    if verbose:
        print()
        print("="*80)
        print("✅ 24 FEATURES PAR SESSION CRÉÉES (ULTRA-OPTIMISÉ)")
        print("="*80)
        print(f"Shape: {final_df.shape}")
        print(f"Sessions par user (moyenne): {len(final_df) / final_df['userId'].nunique():.1f}")
        print()
        print("OPTIMISATIONS:")
        print("  ✅ consistency_score simplifié (pas de date_range)")
        print("  ✅ consecutive_days_inactive vectorisé")
        print("  ✅ has_downgraded vectorisé")
        print("  ✅ listening_time_ratio_7d_vs_lifetime simplifié (ratio totaux)")
        print("  ❌ avg_time_in_session_without_music_14d enlevée (trop lente)")
        print()
        print(final_df.head())
    
    return final_df


In [55]:
best_features = create_features_per_session_optimized(df_train)

CALCUL DES 24 FEATURES PAR SESSION (VERSION ULTRA-OPTIMISÉE)

Création des flags...
Dataset: (14471537, 17)
Users: 18048
Sessions: 137078

Identification des sessions...
Total sessions: 175,059
Batch size: 10,000
Nombre de batches: 18

Pré-calcul des groupes par user...
Batch 1/18... ✓ (10000 sessions)
Batch 2/18... ✓ (10000 sessions)
Batch 3/18... ✓ (10000 sessions)
Batch 4/18... ✓ (10000 sessions)
Batch 5/18... ✓ (10000 sessions)
Batch 6/18... ✓ (10000 sessions)
Batch 7/18... ✓ (10000 sessions)
Batch 8/18... ✓ (10000 sessions)
Batch 9/18... ✓ (10000 sessions)
Batch 10/18... ✓ (10000 sessions)
Batch 11/18... ✓ (10000 sessions)
Batch 12/18... ✓ (10000 sessions)
Batch 13/18... ✓ (10000 sessions)
Batch 14/18... ✓ (10000 sessions)
Batch 15/18... ✓ (10000 sessions)
Batch 16/18... ✓ (10000 sessions)
Batch 17/18... ✓ (10000 sessions)
Batch 18/18... ✓ (5059 sessions)

Combinaison finale...

✅ 24 FEATURES PAR SESSION CRÉÉES (ULTRA-OPTIMISÉ)
Shape: (175059, 26)
Sessions par user (moyenne): 9.7


In [56]:
best_features.to_csv('best_features.csv', index = False)

In [57]:
"""
Fonction améliorée pour calculer les préférences musicales par user
+ Encodage pour le ML
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


# ============================================================================
# MAPPING DES GENRES
# ============================================================================

GENRE_MAPPING = {
    'Kings Of Leon': 'Rock Alternatif',
    'Coldplay': 'Pop Rock',
    'Florence + The Machine': 'Indie Pop',
    'The Black Keys': 'Blues Rock',
    'Björk': 'Electronic',
    'BjÃƒÂ¶rk': 'Electronic',
    'Muse': 'Rock Alternatif',
    'Jack Johnson': 'Folk Rock',
    'Dwight Yoakam': 'Country',
    'Justin Bieber': 'Pop',
    'Train': 'Pop Rock',
    'Eminem': 'Hip-Hop',
    'Radiohead': 'Rock Alternatif',
    'Taylor Swift': 'Pop',
    'Alliance Ethnik': 'Hip-Hop',
    'The Killers': 'Rock Alternatif',
    'Linkin Park': 'Metal Alternatif',
    'OneRepublic': 'Pop Rock',
    'Metallica': 'Heavy Metal',
    'John Mayer': 'Pop Rock',
    'Evanescence': 'Metal Alternatif'
}


def create_user_preferences_improved(
    df: pd.DataFrame, 
    genre_map: dict = GENRE_MAPPING,
    encode: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Calcule les préférences musicales par user de manière robuste
    
    Args:
        df: DataFrame avec colonnes userId, page, artist, song
        genre_map: Dictionnaire {artiste: genre}
        encode: Si True, encode les variables catégorielles pour ML
        verbose: Afficher les statistiques
        
    Returns:
        DataFrame avec userId + préférences (encodées si encode=True)
    """
    
    if verbose:
        print("="*80)
        print("CALCUL DES PRÉFÉRENCES MUSICALES PAR USER")
        print("="*80)
        print()
    
    # ========================================================================
    # VALIDATION
    # ========================================================================
    
    required_cols = ['userId', 'page', 'artist', 'song']
    missing = [col for col in required_cols if col not in df.columns]
    
    if missing:
        raise ValueError(f"Colonnes manquantes: {missing}. Disponibles: {df.columns.tolist()}")
    
    # ========================================================================
    # FILTRAGE ET NETTOYAGE
    # ========================================================================
    
    if verbose:
        print(f"Dataset initial: {df.shape}")
    
    # Filtrer NextSong uniquement
    df_songs = df[df['page'] == 'NextSong'].copy()
    
    if verbose:
        print(f"Événements NextSong: {len(df_songs):,}")
    
    # Nettoyer les NaN
    before_clean = len(df_songs)
    df_songs = df_songs.dropna(subset=['artist', 'song', 'userId'])
    after_clean = len(df_songs)
    
    if verbose:
        print(f"Après nettoyage NaN: {after_clean:,} (supprimé: {before_clean - after_clean:,})")
    
    # Convertir userId en string
    df_songs['userId'] = df_songs['userId'].astype(str)
    
    # ========================================================================
    # MAPPING DES GENRES
    # ========================================================================
    
    df_songs['genre'] = df_songs['artist'].map(genre_map).fillna('Autre')
    
    if verbose:
        genre_dist = df_songs['genre'].value_counts()
        print(f"\nDistribution des genres:")
        print(genre_dist.head(10))
        print(f"Artistes 'Autre' (non mappés): {(df_songs['genre'] == 'Autre').sum():,}")
        print()
    
    # ========================================================================
    # CALCUL DES PRÉFÉRENCES PAR USER
    # ========================================================================
    
    if verbose:
        print("Calcul des préférences par user...")
    
    # Fonction robuste pour trouver le mode
    def safe_mode(series):
        """Retourne le mode, ou None si vide"""
        if series.empty:
            return None
        modes = series.mode()
        return modes.iloc[0] if len(modes) > 0 else None
    
    # Groupby et agrégation
    favorites = df_songs.groupby('userId').agg(
        favorite_artist=('artist', safe_mode),
        favorite_song=('song', safe_mode),
        favorite_genre=('genre', safe_mode),
        total_songs_listened=('song', 'count'),  # Bonus: volume d'écoute
        unique_artists=('artist', 'nunique'),     # Bonus: diversité artistes
        unique_genres=('genre', 'nunique')        # Bonus: diversité genres
    ).reset_index()
    
    # Gérer les None (users sans données valides)
    favorites['favorite_artist'] = favorites['favorite_artist'].fillna('Unknown')
    favorites['favorite_song'] = favorites['favorite_song'].fillna('Unknown')
    favorites['favorite_genre'] = favorites['favorite_genre'].fillna('Unknown')
    
    if verbose:
        print(f"Users avec préférences: {len(favorites):,}")
        print()
        print("Statistiques:")
        print(f"  Moyenne songs écoutées: {favorites['total_songs_listened'].mean():.1f}")
        print(f"  Moyenne artistes uniques: {favorites['unique_artists'].mean():.1f}")
        print(f"  Moyenne genres uniques: {favorites['unique_genres'].mean():.1f}")
        print()
    
    # ========================================================================
    # ENCODAGE POUR ML (si demandé)
    # ========================================================================
    
    if encode:
        if verbose:
            print("Encodage des variables catégorielles...")
        
        # Option 1: Label Encoding (compact, bon pour tree-based)
        le_artist = LabelEncoder()
        le_song = LabelEncoder()
        le_genre = LabelEncoder()
        
        favorites['favorite_artist_encoded'] = le_artist.fit_transform(favorites['favorite_artist'])
        favorites['favorite_song_encoded'] = le_song.fit_transform(favorites['favorite_song'])
        favorites['favorite_genre_encoded'] = le_genre.fit_transform(favorites['favorite_genre'])
        
        if verbose:
            print(f"  favorite_artist: {len(le_artist.classes_)} classes")
            print(f"  favorite_song: {len(le_song.classes_)} classes")
            print(f"  favorite_genre: {len(le_genre.classes_)} classes")
            print()
        
        # Garder aussi les versions non-encodées pour référence
        # (utile pour l'interprétation)
    
    if verbose:
        print("="*80)
        print("✅ PRÉFÉRENCES CALCULÉES")
        print("="*80)
        print()
        print(favorites.head(10))
        print()
    
    return favorites


# ============================================================================
# FONCTION ALTERNATIVE : ONE-HOT ENCODING
# ============================================================================

def create_user_preferences_onehot(
    df: pd.DataFrame,
    genre_map: dict = GENRE_MAPPING,
    top_n_artists: int = 50,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Version avec One-Hot Encoding (meilleur pour genres, plus lourd pour artistes)
    
    Args:
        df: DataFrame source
        genre_map: Mapping artistes → genres
        top_n_artists: Garder seulement les N artistes les plus populaires
        verbose: Afficher stats
        
    Returns:
        DataFrame avec userId + préférences one-hot encodées
    """
    
    # Calculer préférences de base
    favorites = create_user_preferences_improved(df, genre_map, encode=False, verbose=verbose)
    
    if verbose:
        print("One-Hot Encoding...")
    
    # One-hot pour genres (peu de classes)
    genre_dummies = pd.get_dummies(favorites['favorite_genre'], prefix='genre')
    
    # Pour artistes : garder seulement top N (sinon trop de colonnes)
    top_artists = favorites['favorite_artist'].value_counts().head(top_n_artists).index
    favorites['favorite_artist_grouped'] = favorites['favorite_artist'].apply(
        lambda x: x if x in top_artists else 'Other'
    )
    artist_dummies = pd.get_dummies(favorites['favorite_artist_grouped'], prefix='artist')
    
    # Combiner
    result = pd.concat([
        favorites[['userId', 'total_songs_listened', 'unique_artists', 'unique_genres']],
        genre_dummies,
        artist_dummies
    ], axis=1)
    
    if verbose:
        print(f"\nShape finale: {result.shape}")
        print(f"Colonnes one-hot créées: {result.shape[1] - 4}")
        print()
    
    return result

In [58]:
train = pd.read_parquet("data/train.parquet")

In [1]:
# Fonction pour aggréger les deux dataset de features

import pandas as pd

# ============================================================================
# 1. CHARGER DONNÉES
# ============================================================================

df_full = train.copy()

# ============================================================================
# 2. CALCULER PRÉFÉRENCES MUSICALES
# ============================================================================

music_prefs = create_user_preferences_improved(
    df_full,
    encode=True,
    verbose=True
)

# ============================================================================
# 3. MERGER AVEC LES FEATURES DE SESSIONS
# ============================================================================

features_sessions = best_features

features_final = features_sessions.merge(
    music_prefs[[
        'userId', 
        'favorite_genre_encoded',      # Pour ML
        'favorite_artist_encoded',     # Pour ML
        'total_songs_listened',        # Volume
        'unique_artists',              # Diversité
        'unique_genres',               # Diversité
        'favorite_genre',              # Pour interprétation (garder aussi le texte)
        'favorite_artist'              # Pour interprétation
    ]], 
    on='userId', 
    how='left'
)

# Remplir les NaN (users sans préférences)
features_final['favorite_genre_encoded'].fillna(-1, inplace=True)
features_final['favorite_artist_encoded'].fillna(-1, inplace=True)
features_final['total_songs_listened'].fillna(0, inplace=True)
features_final['unique_artists'].fillna(0, inplace=True)
features_final['unique_genres'].fillna(0, inplace=True)

print(f"Shape finale: {features_final.shape}")
print(features_final.head())

# ============================================================================
# 4. SAUVEGARDER
# ============================================================================

features_final.to_csv('features_with_music_encoded.csv', index=False)
print("✅ Sauvegardé avec préférences musicales encodées")

NameError: name 'train' is not defined

In [88]:
features_final

Unnamed: 0,userId,sessionId,session_time,negative_actions_last7d_vs_avg,consistency_score,consecutive_days_inactive,session_frequency_change,sessions_last7d_vs_avg,thumbs_down_last_14days,days_without_thumbs_up,...,has_ever_paid,thumbs_down_lifetime,thumbs_up_lifetime,frustration_score_lifetime,listening_time_ratio_7d_vs_lifetime,favorite_genre_encoded,favorite_artist_encoded,total_songs_listened,unique_artists,unique_genres
0,1000025,23706,2018-10-02 08:59:29,1.000000,0.000000,0,1.000000,1.000000,0,83,...,0,0,0,0.0,1.000000,0.0,405.0,1662.0,1162.0,11.0
1,1000025,31688,2018-10-02 18:12:22,1.000000,,0,1.000000,0.142857,0,0,...,0,0,1,1.5,0.142857,0.0,405.0,1662.0,1162.0,11.0
2,1000025,39243,2018-10-04 01:04:35,1.000000,0.471405,0,2.000000,0.285714,5,0,...,1,5,26,19.0,0.285714,0.0,405.0,1662.0,1162.0,11.0
3,1000025,42490,2018-10-05 01:36:46,1.000000,0.433013,0,3.000000,0.428571,8,0,...,1,8,38,26.5,0.428571,0.0,405.0,1662.0,1162.0,11.0
4,1000025,45191,2018-10-06 22:09:33,1.000000,0.400000,0,4.000000,0.714286,8,2,...,1,8,38,26.5,0.714286,0.0,405.0,1662.0,1162.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175054,1999905,94383,2018-10-20 15:00:08,0.000000,0.433013,4,0.333333,0.457143,1,12,...,0,2,5,7.0,0.250745,0.0,35.0,259.0,230.0,10.0
175055,1999905,105570,2018-10-23 13:25:14,0.000000,0.000000,4,2.000000,0.904762,1,2,...,0,2,6,7.0,0.353185,0.0,35.0,259.0,230.0,10.0
175056,1999905,115721,2018-10-26 06:51:51,0.622857,0.000000,4,1.000000,0.857143,2,2,...,0,3,8,9.0,0.935355,0.0,35.0,259.0,230.0,10.0
175057,1999905,126577,2018-10-26 15:25:08,0.498990,0.000000,2,3.000000,1.178571,1,3,...,0,3,8,9.0,1.231033,0.0,35.0,259.0,230.0,10.0


In [89]:
churn_time = churners[['userId', 'time']].copy()

In [90]:
features_clean  = features_final.copy()

In [91]:
features_clean.to_csv('features_clean.csv', index=False)

In [92]:
churn_time.to_csv('churn_time.csv', index = False)

In [93]:
churn_time

Unnamed: 0,userId,time
1222,1749042,2018-10-21 01:16:24
5923,1222580,2018-10-30 23:17:30
17878,1385500,2018-11-17 04:54:09
46340,1144647,2018-10-28 02:43:58
58317,1240184,2018-11-14 21:43:36
...,...,...
17123383,1835694,2018-10-09 14:49:14
17167280,1895668,2018-11-15 14:21:59
17233170,1353786,2018-11-12 23:15:05
17269806,1652329,2018-11-02 17:32:12


In [2]:
# Drop des colonnes de textes si besoin
features_final.drop(columns = ['favorite_genre', 'favorite_artist'], inplace = True)

NameError: name 'features_final' is not defined

In [3]:
# On rajoute la colonne target
import pandas as pd

# Charger
features = pd.read_csv('features_clean.csv')
churn_time = pd.read_csv('churn_time.csv')

# Convertir
features['session_time'] = pd.to_datetime(features['session_time'])
churn_time['time'] = pd.to_datetime(churn_time['time'])
churn_time = churn_time.rename(columns={'time': 'churn_date'})

# Merger
features = features.merge(churn_time[['userId', 'churn_date']], on='userId', how='left')

# Target
features['days_until_churn'] = (features['churn_date'] - features['session_time']).dt.days
features['will_churn_10days'] = ((features['days_until_churn'] >= 0) & 
                                  (features['days_until_churn'] <= 10)).astype(int)

# Drop
features = features.drop(['churn_date', 'days_until_churn'], axis=1)

print(features['will_churn_10days'].value_counts())
features.to_csv('features_with_target.csv', index=False)

will_churn_10days
0    156402
1     18657
Name: count, dtype: int64


### Application of the function to the whole dataset

### Remove all previous sessions from churners

In [4]:
import pandas as pd

def keep_only_consistent_users(features):
    """
    Conserve uniquement :
    - Les utilisateurs qui n'ont JAMAIS churné (tous les will_churn_10days=0).
    - Les sessions avec will_churn_10days=1 (provenant des utilisateurs qui churnent).
    
    Retire : les sessions avec will_churn_10days=0 provenant d'utilisateurs qui churnent ultérieurement.
    """
    
    # 1. Calculer le statut de churn maximal pour chaque utilisateur (1 si l'utilisateur a churné au moins une fois)
    max_churn_per_user = features.groupby('userId')['will_churn_10days'].transform('max')
    
    # 2. Créer le masque de filtrage
    # On garde les lignes si :
    # A) L'utilisateur est un "non-churner" (max_churn_per_user == 0)
    # OU
    # B) La session elle-même est une session de churn (features['will_churn_10days'] == 1)
    mask = (max_churn_per_user == 0) | (features['will_churn_10days'] == 1)
    
    # 3. Appliquer le filtre
    features_filtered = features[mask].copy()
    
    return features_filtered

In [97]:
features = pd.read_csv('features_with_target.csv')  # Déjà nettoyé des post-churn
df_consistant = keep_only_consistent_users(features)
df_consistant.to_csv('df_consistant.csv', index = False)

## Model selection

In [100]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
import numpy as np

def XGB_training(df_features):

    # Prepare data
    feature_cols = [col for col in df_features.columns 
                    if col not in ['userId', 'prediction_date','session_start','session_id', 'session_time','time','registration', 'will_churn_10days']]

    X = df_features[feature_cols].fillna(0)
    y = df_features['will_churn_10days']

    # Split (NO resampling here!)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Calculate scale_pos_weight: ratio of negative to positive class
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    print(f"Class distribution in train:")
    print(y_train.value_counts())
    print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")

    # XGBoost with optimal parameters for churn prediction
    model = xgb.XGBClassifier(
        # Handle imbalanced classes
        scale_pos_weight=scale_pos_weight,  # CRITICAL for imbalanced data
        
        # Model complexity
        n_estimators=200,                   # Number of boosting rounds
        max_depth=6,                        # Tree depth (prevent overfitting)
        learning_rate=0.01,                 # Lower = more robust but slower
        
        # Optimization
        objective='binary:logistic',        # Binary classification,                 
        random_state=42,
        n_jobs=-1
    )

    # Train (simplified - no need to pass eval_metric again)
    print("\nTraining XGBoost model...")
    model.fit(
        X_train, y_train,
    )

    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    print(f"balanced accuracy: {balanced_accuracy_score(y_test, y_pred)}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

    # Feature importance
    import pandas as pd
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 30 Most Important Features:")
    print(feature_importance.head(30).to_string(index=False))
    return X_test, y_test, model


In [102]:
X_test, y_test, model_XGB = XGB_training(df_consistant)

Class distribution in train:
will_churn_10days
0    105729
1     14926
Name: count, dtype: int64

Calculated scale_pos_weight: 7.08

Training XGBoost model...
balanced accuracy: 0.8045054171390154
ROC-AUC: 0.8896

Confusion Matrix:
[[20951  5482]
 [  685  3046]]

Classification Report:
              precision    recall  f1-score   support

    No Churn       0.97      0.79      0.87     26433
       Churn       0.36      0.82      0.50      3731

    accuracy                           0.80     30164
   macro avg       0.66      0.80      0.68     30164
weighted avg       0.89      0.80      0.83     30164


Top 30 Most Important Features:
                            feature  importance
                  frustration_score    0.186231
               thumbs_down_lifetime    0.175407
         frustration_score_lifetime    0.092902
                     unique_artists    0.054654
               total_songs_listened    0.054346
         songs_listened_last_14days    0.045495
             mobi