In [None]:
# Remove identifier-style columns from both frames, in place.
# NOTE(review): "CustomerId" is dropped from `train` but not from `test` --
# confirm whether `test` lacks that column or the asymmetry is accidental.
train.drop(columns=["id", "Surname", "CustomerId"],inplace=True)
test.drop(columns=["id", "Surname"],inplace=True)
# NOTE(review): the split below is built from `train_copy` and `target`, which
# are defined outside this section; the drops above are applied to `train`,
# not to `train_copy` -- verify that this is intended.
y_target=train_copy['Exited']
Xtrain_copy = train_copy.drop(target, axis =1)

# Hold out 20% of the rows, stratified on the 'Exited' labels, with a fixed seed.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xtrain_copy, y_target,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=y_target)
####  
def store_missing_rows(df, features):
    """Map each feature that has missing values to the index labels of those rows.

    Parameters:
    df (pd.DataFrame): Frame to inspect.
    features (iterable): Column names of ``df`` to check for nulls.

    Returns:
    dict: ``{feature: [index labels where the feature is null]}``. Features
    without any null value are left out of the mapping.
    """
    # Evaluate the null mask once per feature, then keep only non-empty hits.
    null_labels = (
        (name, df.index[df[name].isnull()].tolist()) for name in features
    )
    return {name: labels for name, labels in null_labels if labels}

##### 

def fill_missing(train, test, target, max_iterations=10):
    '''
    Fill missing values in ``train`` and ``test`` from their pooled rows.

    The feature rows of both frames are concatenated (the ``target`` column of
    ``train`` is left out), then:

    * numeric columns are imputed with a default ``KNNImputer`` fitted on the
      pooled rows;
    * ``object`` columns are filled with the most frequent pooled value. A
      column with no observed value at all has no mode and is left untouched
      instead of raising.

    Parameters:
    train (pd.DataFrame): Training frame; may contain ``target``. Modified in place.
    test (pd.DataFrame): Test frame with the same feature columns. Modified in place.
    target (str): Name of the target column, excluded from imputation.
    max_iterations (int): Accepted for backward compatibility; it is not used,
        the imputation is a single pass.

    Returns:
    tuple: ``(train, test)`` -- the same objects that were passed in.
    '''
    # Work on a copy of train without the target so the target is neither
    # imputed nor used as a neighbour feature.
    train_temp = train.copy()
    if target in train_temp.columns:
        train_temp = train_temp.drop(columns=target)

    # Pool train and test rows; train rows come first so they can be split
    # back by position afterwards.
    df = pd.concat([train_temp, test], axis="rows").reset_index(drop=True)

    numeric_features = df.select_dtypes(include=np.number).columns
    categorical_features = df.select_dtypes(include='object').columns

    # Numeric columns: KNN imputation on the pooled rows.
    # NOTE(review): the imputer is fitted on train and test together -- confirm
    # that using test rows here is acceptable for the evaluation protocol.
    if len(numeric_features) > 0:
        imputer_numeric = KNNImputer()
        df[numeric_features] = imputer_numeric.fit_transform(df[numeric_features])

    # Categorical columns: fill with the pooled mode.
    for feature in categorical_features:
        modes = df[feature].mode()
        if modes.empty:
            # Entirely missing column: there is no mode to fill with.
            continue
        df[feature] = df[feature].fillna(modes.iloc[0])

    # Write the imputed values back into the caller's frames by position.
    n_train = train.shape[0]
    for features in (numeric_features, categorical_features):
        if len(features) > 0:
            train[features] = df.iloc[:n_train][features].values
            test[features] = df.iloc[n_train:][features].values

    return train, test

# Pick the element of a collection that lies closest to a reference value.
def nearest_val(target, common):
    """Return the element of ``common`` with the smallest absolute difference to ``target``.

    When several elements are equally close, the first one met while iterating
    ``common`` is returned. An empty ``common`` makes ``min`` raise ``ValueError``.
    """
    def distance(candidate):
        return abs(candidate - target)

    return min(common, key=distance)

# Replace values that are not shared by train and test with the nearest shared value.

def handle_rare_categories(train, test, col_list):
    """Align the value sets of ``train`` and ``test`` for the given columns.

    For each column in ``col_list`` the distinct values of both frames are
    compared. If at least one value appears in only one of the frames, every
    value of that column (in both frames) that is not present in both is
    replaced by ``nearest_val(value, common)``, where ``common`` is the set of
    values shared by the two frames.

    Both frames are modified in place and returned as ``(train, test)``.
    """
    for col in col_list:
        test_values = set(test[col].unique())
        train_values = set(train[col].unique())
        common = test_values & train_values

        # Nothing to do when both frames expose exactly the same values.
        if not (test_values ^ train_values):
            continue

        def snap(value):
            # Shared values pass through; others move to the nearest shared one.
            return value if value in common else nearest_val(value, common)

        train[col] = train[col].apply(snap)
        test[col] = test[col].apply(snap)
    return train, test

def OHE(train, test, col, min_percentage=0.0):
    """
    One-hot encode a single column of the train and test datasets.

    Dummy columns are named ``"{col}_OHE_{category}"``. If the least frequent
    category of ``train[col]`` has a relative frequency below
    ``min_percentage``, its dummy column is removed from the training encoding
    and, when present, from the test encoding as well.

    Parameters:
    train (pd.DataFrame): Training dataset.
    test (pd.DataFrame): Test dataset.
    col (str): Name of the column to one-hot encode.
    min_percentage (float): Relative-frequency threshold (0-1) under which the
        rarest training category is dropped. Default is 0.0 (drop nothing).

    Returns:
    pd.DataFrame: Dummy columns for ``train[col]``, indexed like ``train``.
    pd.DataFrame: Dummy columns for ``test[col]``, indexed like ``test``.

    Note: the two encodings are built independently, so a category seen in only
    one of the frames yields a column in that frame only.
    """
    prefix = f"{col}_OHE"
    train_encode = pd.get_dummies(train[col], prefix=prefix, prefix_sep='_')
    test_encode = pd.get_dummies(test[col], prefix=prefix, prefix_sep='_')

    train_value_counts = train[col].value_counts(normalize=True)
    if train_value_counts.min() < min_percentage:
        rare_column = f"{prefix}_{train_value_counts.idxmin()}"
        train_encode = train_encode.drop(columns=rare_column)
        # The rare training category may never occur in test; in that case
        # there is no dummy column to remove and that must not raise.
        test_encode = test_encode.drop(columns=rare_column, errors="ignore")

    return train_encode, test_encode

def cat_encoding(Xtrain, Xtest, cols_to_encode):
    """Replace each listed column by its one-hot encoding in copies of both frames.

    Every column in ``cols_to_encode`` is encoded with
    ``OHE(Xtrain, Xtest, col, min_percentage=0.005)`` (always computed from the
    untouched inputs); the resulting dummy columns are appended and the source
    column is removed. The inputs themselves are not modified.

    Returns:
    tuple: ``(encoded_train, encoded_test)``.
    """
    encoded_train, encoded_test = Xtrain.copy(), Xtest.copy()
    for column in cols_to_encode:
        train_dummies, test_dummies = OHE(Xtrain, Xtest, column, min_percentage=0.005)

        encoded_train = pd.concat([encoded_train, train_dummies], axis=1)
        encoded_train = encoded_train.drop(column, axis=1)

        encoded_test = pd.concat([encoded_test, test_dummies], axis=1)
        encoded_test = encoded_test.drop(column, axis=1)
    return encoded_train, encoded_test

def apply_transformations(train, test, col, transformations):
    """Add one transformed copy of ``col`` per transformation to both frames, in place.

    Parameters:
    train (pd.DataFrame): Training frame; receives the new columns.
    test (pd.DataFrame): Test frame; receives the new columns.
    col (str): Source column to transform.
    transformations (iterable of dict): Each item provides ``"name"`` (the new
        column name) and ``"func"`` (a callable applied to the column values
        reshaped to ``(-1, 1)``).

    The callable is invoked separately on the train values and on the test
    values. Nothing is returned.
    """
    for spec in transformations:
        new_name, fn = spec["name"], spec["func"]
        # Train first, then test, each from its own column values.
        for frame in (train, test):
            frame[new_name] = fn(frame[col].to_numpy().reshape(-1, 1))
        
def transformer(Xtrain, Xtest, Ytrain, cont_cols, target):
    """Keep, for every continuous column, the variant with the best single-feature CV ROC AUC.

    For each column in ``cont_cols`` six transformed variants are added to
    ``Xtrain`` and ``Xtest`` (in place, through ``apply_transformations``).
    The raw column and each variant are then scored with a 5-fold shuffled
    ``KFold`` (seed 42) using a one-feature ``LogisticRegression`` and ROC AUC.
    The best-scoring candidate is kept; all the others, possibly including the
    raw column, are scheduled for removal.

    Side effects: the module-level names ``unimportant_features``,
    ``overall_best_score`` and ``overall_best_col`` are reset and updated, and
    a summary table plus the overall best score are printed.

    Parameters:
    Xtrain, Xtest (pd.DataFrame): Feature frames; they gain the candidate columns.
    Ytrain (pd.Series): Training labels, indexed positionally per fold.
    cont_cols (iterable of str): Columns to evaluate.
    target: Not used by this function.

    Returns:
    tuple: ``(Xtrain, Xtest, unimportant_features)`` where the two frames are
    new objects with the rejected candidates dropped.
    """
    global unimportant_features, overall_best_score, overall_best_col
    unimportant_features = []
    overall_best_score = 0
    overall_best_col = 'none'
    table = pd.DataFrame(columns=['Feature', 'Initial ROC_AUC', 'Transformation', 'Transformed ROC_AUC'])

    for col in cont_cols:
        # NOTE(review): each callable is applied to train and test separately,
        # so the yeo-johnson fit and the np.min shift differ between the two
        # frames -- confirm this is intended.
        transformations = [
            {"name": "log_" + col, "func": lambda x: np.log1p(x)},
            {"name": "sqrt_" + col, "func": lambda x: np.sqrt(x)},
            {"name": "y_J_" + col, "func": lambda x: PowerTransformer(method='yeo-johnson').fit_transform(x)},
            {"name": "pow_" + col, "func": lambda x: FunctionTransformer(lambda x: np.power(x + 1 - np.min(x), 0.25)).fit_transform(x)},
            {"name": "pow2_" + col, "func": lambda x: FunctionTransformer(lambda x: np.power(x + 1 - np.min(x), 2)).fit_transform(x)},
            {"name": "log_sqrt" + col, "func": lambda x: np.log1p(np.sqrt(x))},
        ]
        apply_transformations(Xtrain, Xtest, col, transformations)

        # Transformed variants first, the raw column last.
        candidates = [spec["name"] for spec in transformations]
        candidates.append(col)

        scored = []
        for feature in candidates:
            single = Xtrain[[feature]]
            splitter = KFold(n_splits=5, shuffle=True, random_state=42)
            fold_aucs = []
            for fit_idx, holdout_idx in splitter.split(single, Ytrain):
                clf = LogisticRegression()
                clf.fit(single.iloc[fit_idx], Ytrain.iloc[fit_idx])
                holdout_proba = clf.predict_proba(single.iloc[holdout_idx])[:, 1]
                fold_aucs.append(roc_auc_score(Ytrain.iloc[holdout_idx], holdout_proba))
            scored.append((feature, np.mean(fold_aucs)))

        # The raw column was scored last, so its mean AUC is the final entry.
        raw_auc = scored[-1][1]
        best_col, best_auc = sorted(scored, key=lambda pair: pair[1], reverse=True)[0]

        if best_auc > overall_best_score:
            overall_best_score = best_auc
            overall_best_col = best_col

        # Everything except the winner is discarded once all columns are processed.
        unimportant_features.extend(name for name in candidates if name != best_col)
        table.loc[len(table)] = [col, raw_auc, best_col, best_auc]

    Xtrain = Xtrain.drop(unimportant_features, axis=1)
    Xtest = Xtest.drop(unimportant_features, axis=1)

    print(table)
    print("Overall best CV ROC AUC score:", overall_best_score)
    return Xtrain, Xtest, unimportant_features

def generate_features(col1, col2):
    """Build pairwise arithmetic features from two columns of the module-level frames.

    Reads the module-level ``Xtrain`` and ``Xtest`` (they are not parameters)
    and returns two new frames holding, for the pair ``(col1, col2)``:
    product, ``col1 / col2``, ``col2 / col1``, difference and sum. A small
    constant (1e-5) is added to each denominator.

    The two ratios get distinct names (``"col1/col2"`` and ``"col2/col1"``);
    using the same name for both would make the second silently overwrite the
    first.

    Returns:
    tuple: ``(train_features, test_features)`` with identical column names.
    """
    operations = [
        (f"{col1}*{col2}", lambda x, y: x * y),
        (f"{col1}/{col2}", lambda x, y: x / (y + 1e-5)),
        (f"{col2}/{col1}", lambda x, y: y / (x + 1e-5)),
        (f"{col1}-{col2}", lambda x, y: x - y),
        (f"{col1}+{col2}", lambda x, y: x + y),
    ]

    temp_df = pd.DataFrame()  # generated columns for the training rows
    temp_df_test = pd.DataFrame()  # generated columns for the test rows
    for new_col_name, operation in operations:
        temp_df[new_col_name] = operation(Xtrain[col1], Xtrain[col2])
        temp_df_test[new_col_name] = operation(Xtest[col1], Xtest[col2])

    return temp_df, temp_df_test
    
    
def better_features(Xtrain, Xtest, Ytrain, cols, best_score):
    """Add pairwise arithmetic features that beat ``best_score`` on their own.

    For every unordered pair of columns in ``cols`` five candidates are built
    (product, both ratios, difference, sum). Each candidate is scored alone
    with a 5-fold shuffled ``KFold`` (seed 42), a
    ``HistGradientBoostingClassifier`` and ROC AUC. A candidate whose mean
    score exceeds ``best_score`` and passes the correlation check below is
    added to ``Xtrain`` and ``Xtest`` in place.

    Parameters:
    Xtrain, Xtest (pd.DataFrame): Feature frames; modified in place.
    Ytrain (pd.Series): Training labels (cast to ``int`` for fitting).
    cols (sequence of str): Columns to combine pairwise.
    best_score (float): Mean ROC AUC a candidate has to exceed.

    Returns:
    tuple: ``(Xtrain, Xtest, new_cols)`` where ``new_cols`` lists the added
    column names.
    """
    new_cols = []
    # Plain shuffled K-fold (not stratified), fixed seed for reproducible folds.
    skf = KFold(n_splits=5, shuffle=True, random_state=42)

    def generate_features(col1, col2):
        # Each operation carries its own column name so the two ratios do not
        # collide: with a shared "/" label, y/x used to overwrite x/y.
        operations = [
            (f"{col1}*{col2}", lambda x, y: x * y),
            (f"{col1}/{col2}", lambda x, y: x / (y + 1e-5)),
            (f"{col2}/{col1}", lambda x, y: y / (x + 1e-5)),
            (f"{col1}-{col2}", lambda x, y: x - y),
            (f"{col1}+{col2}", lambda x, y: x + y),
        ]
        temp_df = pd.DataFrame()  # candidates for the training rows
        temp_df_test = pd.DataFrame()  # candidates for the test rows
        for new_col_name, operation in operations:
            temp_df[new_col_name] = operation(Xtrain[col1], Xtrain[col2])
            temp_df_test[new_col_name] = operation(Xtest[col1], Xtest[col2])
        return temp_df, temp_df_test

    # Phase 1: build the candidates for every pair (progress bar over col1).
    results = []
    for i, col1 in enumerate(tqdm(cols, desc='Generating Features')):
        for col2 in cols[i + 1:]:
            results.append(generate_features(col1, col2))

    # Phase 2: score each candidate on its own and keep the useful ones.
    y_int = Ytrain.astype(int)
    for temp_df, temp_df_test in results:
        for column in temp_df.columns:
            candidate = temp_df[column]
            values = candidate.values.reshape(-1, 1)

            scores = []
            for train_index, val_index in skf.split(Xtrain, Ytrain):
                model = HistGradientBoostingClassifier(max_iter=300, learning_rate=0.02, max_depth=6, random_state=42)
                model.fit(values[train_index], y_int.iloc[train_index])
                y_pred = model.predict_proba(values[val_index])[:, 1]
                scores.append(roc_auc_score(y_int.iloc[val_index], y_pred))
            mean_score = np.mean(scores)

            if mean_score > best_score:
                new_col_name = candidate.name
                max_corr = Xtrain.corrwith(candidate).abs().max()
                # NOTE(review): `or best_score` is truthy for any non-zero
                # best_score, so the 0.9 threshold is effectively bypassed and
                # only exact correlation (== 1) rejects a candidate. Behaviour
                # kept as is -- confirm the intended condition.
                if (max_corr < 0.9 or best_score) and max_corr != 1:
                    Xtrain[new_col_name] = candidate
                    Xtest[new_col_name] = temp_df_test[column]
                    new_cols.append(new_col_name)
                    print(f"Added column '{new_col_name}' with ROC AUC Score: {mean_score:.4f} & Correlation {max_corr:.4f}")
    return Xtrain, Xtest, new_cols

def post_processor(train, test):
    """Remove repeated column labels, keeping the first occurrence of each.

    Dropping by label would remove every column carrying a duplicated name,
    including the one meant to be kept, so the selection is done with a
    positional mask instead. Each frame is de-duplicated on its own labels,
    which also avoids a ``KeyError`` when ``test`` lacks a label that is
    duplicated in ``train``.

    Parameters:
    train (pd.DataFrame): Training frame.
    test (pd.DataFrame): Test frame.

    Returns:
    tuple: ``(train, test)`` as new frames with unique column labels.
    """
    # `duplicated()` flags the second and later occurrences of a label.
    train = train.loc[:, ~train.columns.duplicated()].copy()
    test = test.loc[:, ~test.columns.duplicated()].copy()
    return train, test
             
