In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# SettingWithCopy warnings that can point at real bugs; consider narrowing
# the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, precision_recall_curve,
                             average_precision_score, f1_score)

In [2]:
import pandas as pd

# Raw event logs, one parquet file per split.
# `df_train` is the labelled-history split used to build the churn target;
# `df_train_test` holds the contents of data/test.parquet (the name is kept
# as-is because other cells may refer to it).
df_train = pd.read_parquet("data/train.parquet")
df_train_test = pd.read_parquet("data/test.parquet")

In [3]:
# Build the `will_churn_10days` target: 1 when the event's user reaches the
# 'Cancellation Confirmation' page between 0 and 10 days (inclusive) after the
# event, 0 otherwise (including users who never cancel).

import numpy as np

CHURN_HORIZON = pd.Timedelta(days=10)

# Work on a clean positional index so each event row has a unique identifier.
# (The previous merge-based implementation also ended with a fresh RangeIndex.)
df_train = df_train.reset_index(drop=True)

# One row per cancellation event: which user cancelled, and when.
cancellation_events = (
    df_train.loc[df_train['page'] == 'Cancellation Confirmation', ['userId', 'time']]
    .rename(columns={'time': 'churn_time'})
)

# Pair every event with every cancellation of the same user. Only the three
# columns needed for the comparison are merged, so the wide event frame is not
# duplicated in memory, and a user with several cancellation rows cannot
# multiply the rows of `df_train` itself.
event_cancel_pairs = (
    df_train[['userId', 'time']]
    .rename_axis('event_row')
    .reset_index()
    .merge(cancellation_events, on='userId', how='left')
)

# NaT differences (users without any cancellation) evaluate to False here.
time_until_churn = event_cancel_pairs['churn_time'] - event_cancel_pairs['time']
cancels_in_window = time_until_churn.between(pd.Timedelta(0), CHURN_HORIZON)

# Collapse back to exactly one flag per original event row: an event is
# positive if ANY of its user's cancellations falls inside the window.
df_train['will_churn_10days'] = (
    cancels_in_window
    .groupby(event_cancel_pairs['event_row'])
    .any()
    .astype(int)
)

In [4]:
# Keep only events whose 10-day outcome window is fully observed.
# Per the original author's inspection of the data, the latest event is dated
# 2018-11-20, so events from 2018-11-10 onwards cannot be labelled negative
# with certainty; they are kept only when already known to be positive.
# (The earlier mid-cell `df_train.describe()` call was dropped: its result was
# never displayed and it cost a full pass over the frame.)
OBSERVATION_CUTOFF = "2018-11-10"

has_full_window = df_train["time"] < OBSERVATION_CUTOFF
is_known_churn = df_train["will_churn_10days"] == 1

df_train = df_train[has_full_window | is_known_churn]

In [None]:
import pandas as pd
from tabpfn import TabPFNClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
# from huggingface_hub import login # Not needed when already logged in

# --- 1. Data preparation ---
# Per-user feature table with the binary target `will_churn_10days`.
df = pd.read_csv("features_users_with_target.csv")

y = df['will_churn_10days']

# Columns that must never reach the model: identifiers, the target itself,
# raw timestamps, and features previously flagged as leaking the label.
exclude_cols = [
    'Unnamed: 0', 'userId', 'will_churn_10days', 'registration',
    'first_activity', 'last_activity',
    'days_since_last_activity',  # leakage, removed
    'page_submit_downgrade', 'page_downgrade', 'churn_signals',
    'has_visited_downgrade', 'risk_score'  # suspicious columns, removed
]
cols_to_remove = [col for col in exclude_cols if col in df.columns]
X = df.drop(columns=cols_to_remove, errors='ignore')

# One-hot encode the remaining string columns.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Defensive: make sure no object-typed column survives the encoding.
X = X.select_dtypes(exclude=['object'])

# Stratified train/test split so both parts keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing `avg_song_length` AFTER the split, using the mean of the
# training rows only. Computing the mean on the full table (as was done
# before) lets information from the test rows leak into training.
# Note: `X` itself is intentionally left un-imputed.
if X['avg_song_length'].isnull().any():
    avg_song_length_train_mean = X_train['avg_song_length'].mean()
    X_train = X_train.fillna({'avg_song_length': avg_song_length_train_mean})
    X_test = X_test.fillna({'avg_song_length': avg_song_length_train_mean})

# --- 2. Fit TabPFN ---

# Forced to CPU to work around memory problems encountered on the MPS backend.
device_used = 'cpu'
classifier = TabPFNClassifier(
    device=device_used,
    # Allow running on a dataset larger than the pretraining limits on CPU.
    ignore_pretraining_limits=True
)

print(f"\nDébut de l'entraînement TabPFN sur {device_used.upper()}.")
print("ATTENTION : Le traitement sur CPU sera plus lent, mais le risque d'échec est minime.")

# Train the model.
classifier.fit(X_train, y_train)

# --- 3. Prediction (no batching on CPU) ---
print("\nDébut de la prédiction...")
y_pred = classifier.predict(X_test)

# --- 4. Evaluation ---
# Balanced accuracy averages per-class recall.
balanced_acc = balanced_accuracy_score(y_test, y_pred)

print(f"\nRésultat final de TabPFN:")
print(f"Balanced Accuracy (Précision Équilibrée) : {balanced_acc:.4f}")


Début de l'entraînement TabPFN sur CPU.
ATTENTION : Le traitement sur CPU sera plus lent, mais le risque d'échec est minime.

Début de la prédiction...
