# Test

In [1]:
import pandas as pd
import re

In [2]:
all_format = ["Basic","Sequential", "Encoded", "Wide"]

In [3]:
class TransactionDf():

    def __init__(self,file_path,header=False,target_column=None,separator=",",formatting="Basic"):
        self.file_paths = [file_path]
        self.headers = [header]
        self.target_columns = [target_column]
        self.separators = [separator]
        self.dfs = []
        self.load_transactions(file_path,header=header,target_column=target_column,sep=separator,formatting=formatting)
    
    def load_transaction_csv(self,file_path,header,target_column,sep,formatting):
        if header:
            transactions = pd.read_csv(file_path)
        else :
            transactions = pd.read_csv(file_path,header=None)
        
        if formatting == "Wide":
            #   transaction_id, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
            #   items, "bread,milk,eggs", "bread,butter", ...
            formatting = "Basic"
            transactions = transactions.T

        if formatting == "Basic":
            #   transaction_id,items
            #   1,"bread,milk,eggs"
            #   2,"bread,butter"

            # check if there is a transaction_id column
            if len(transactions.columns) == 1:
                target_column = transactions.columns[0]
            elif target_column not in transactions.columns:
                raise ValueError(f"the targeted column {target_column} is not in {transactions.columns}")
            if sep:
                transactions = transactions[transactions[target_column].apply(lambda x: isinstance(x, str))]
                transactions[target_column] = transactions[target_column].apply(lambda x: [i.strip() for i in x.split(",")])
            

            # all this hot encode a column and keep all the others column
            transactions_exploded = transactions[target_column].explode()

            transactions_dummies = pd.get_dummies(transactions_exploded)

            #0_ in order to make a sequential feature
            transactions_dummies.columns = ['0_' + col for col in transactions_dummies.columns]

            transactions_dummies = transactions_dummies.groupby(transactions_dummies.index).sum()

            transactions = transactions.drop(columns=[target_column]).join(transactions_dummies)
        elif formatting == "Sequential":
            #   client_id, sequence
            #    C1,"(bread,milk),(butter)"
            #    C2,"(bread),(butter,cheese)"
            all_items = set()
            parsed_sequences = []

            for seq in transactions[target_column]:
                all_transactions = re.findall(r"\((.*?)\)", seq)
                steps = [t.split(",") for t in all_transactions]
                steps = [[item.strip() for item in step if item.strip()] for step in steps]
                parsed_sequences.append(steps)
                for step in steps:
                    all_items.update(step)

            all_items = sorted(all_items)
            print(all_items)
            print("seq", parsed_sequences)

            output_rows = []

            for i in range(len(transactions)):
                client_id = transactions.loc[i, "client_id"]
                steps = parsed_sequences[i]

                row = {"client_id": client_id}
                max_steps = len(steps)
                for i in range(max_steps):
                    for item in all_items:
                        row[f"{i}_{item}"] = 1 if item in steps[i] else 0
                output_rows.append(row)
            transactions = pd.DataFrame(output_rows).fillna(0)
        
        # format encoded need nothing
        return transactions

    def load_transactions(self,file_path,header=True,target_column=None,sep=",",formatting="Basic"):
        " Give pandas dataset of transaction "
        res = None
        if file_path[-4:] == ".csv":
            try :
                res = self.load_transaction_csv(file_path,header,target_column,sep,formatting)
            except ValueError as e:
                print(f"Error: {e}")
        
        
        if res is not None:
            self.dfs.append(res)
    
    def displays(self):
        for df in self.dfs:
            display(df)
    
    def combine(self,indexes=[],column_to_check=None):
        "Combine 2 Databases given index [x,y] with a column to keep (transaction name), display database"
        if len(self.dfs) == 2:
            indexes = [0,1]
        elif (indexes == [] or max(indexes) >= len(self.dfs)) :
            return
        res = None
        if column_to_check is not None:
            # Merge using outer join to keep all names
            df_merged = pd.merge(self.dfs[indexes[0]], self.dfs[indexes[1]], on=column_to_check, how='outer', suffixes=('_1', '_2'))

            df_merged = df_merged.fillna(0)
            numeric_cols = [c for c in df_merged.columns if c != column_to_check]
            df_merged[numeric_cols] = df_merged[numeric_cols].astype(int)

            # Combine columns with similar meaning
            all_columns = set(self.dfs[indexes[0]].columns).union(self.dfs[indexes[1]].columns) - {column_to_check}
            for col in all_columns:
                # Find all columns corresponding to this original column (with _1/_2 suffixes or exact match)
                cols_to_sum = [c for c in df_merged.columns if c == col or c.startswith(col + "_")]
                if cols_to_sum:
                    df_merged[col] = df_merged[cols_to_sum].sum(axis=1)
                    if len(cols_to_sum) == 2: 
                        df_merged.drop(columns=cols_to_sum, inplace=True, errors='ignore')
            res = df_merged
            res = res[sorted(res.columns)]

        return res
    
    def size(self):
        return len(self.dfs)
        

In [4]:
T_df = TransactionDf('../data/stupid.csv',header=True,target_column="Articles")
T_df.displays()

Unnamed: 0,Transaction,0_beurre,0_confiture,0_lait,0_pain
0,T1,0,0,1,1
1,T2,1,0,0,1
2,T3,1,1,1,0
3,T4,1,0,1,1
4,T5,1,0,1,0


In [5]:
T_df.load_transactions('../data/wide.csv',header=True,target_column="sequence",formatting="Wide")
T_df.displays()
#display(T_df.combine(column_to_check="Transaction"))

Unnamed: 0,Transaction,0_beurre,0_confiture,0_lait,0_pain
0,T1,0,0,1,1
1,T2,1,0,0,1
2,T3,1,1,1,0
3,T4,1,0,1,1
4,T5,1,0,1,0


Unnamed: 0,0_bread,0_butter,0_cheese,0_eggs,0_items,0_milk,0_yogurt
transaction_id,0,0,0,0,1,0,0
1,1,0,0,1,0,1,0
2,1,1,0,0,0,0,0
3,0,1,1,0,0,1,0
4,1,1,0,0,0,1,0
5,1,0,0,1,0,0,0
6,0,0,1,0,0,1,1
7,1,1,0,1,0,1,0
8,0,0,1,0,0,0,1
9,1,1,0,1,0,0,0


## Etape 1 : Extraction
- Implémenter/adapter un algorithme de fouille exhaustive de données pour produire un pool P de motifs avec un seuil bas ; fournir
pré-traitement et binarisation/agrégation.

- Calculer pour chaque motif des métriques standard : support,
confidence, lift, couverture, longueur.


In [6]:
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Préparer le DataFrame pour fpgrowth :
df_fp = T_df.dfs[0].copy()
df_fp = df_fp.astype(bool)

# Drop Transaction column
if 'Transaction' in df_fp.columns:
    df_fp = df_fp.drop(columns=['Transaction'])
    
print('Colonnes utilisées pour fpgrowth :', df_fp.columns.tolist())
print(df_fp.head())


# 1. Extraction des itemsets fréquents avec un seuil de support bas
# TODO pouvoir mod le seuil sur UI
min_support_seuil_bas = 0.33

frequent_itemsets = fpgrowth(df_fp, min_support=min_support_seuil_bas, use_colnames=True)

print(f"\nNombre d'itemsets fréquents trouvés (seuil bas) : {len(frequent_itemsets)}")
print(frequent_itemsets.head())


Colonnes utilisées pour fpgrowth : ['0_beurre', '0_confiture', '0_lait', '0_pain']
   0_beurre  0_confiture  0_lait  0_pain
0     False        False    True    True
1      True        False   False    True
2      True         True    True   False
3      True        False    True    True
4      True        False    True   False

Nombre d'itemsets fréquents trouvés (seuil bas) : 6
   support            itemsets
0      0.8            (0_lait)
1      0.6            (0_pain)
2      0.8          (0_beurre)
3      0.4    (0_pain, 0_lait)
4      0.4  (0_pain, 0_beurre)


In [7]:
# 2. Génération des règles d'association à partir des itemsets fréquents
P_pool_rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)

# Calcul des métriques
P_pool_rules['length'] = P_pool_rules['antecedents'].apply(lambda x: len(x)) + P_pool_rules['consequents'].apply(lambda x: len(x))

P_pool_rules.rename(columns={'antecedent support': 'coverage'}, inplace=True)

cols_to_keep = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
if 'coverage' in P_pool_rules.columns:
    cols_to_keep.append('coverage')
    
cols_to_keep.append('length')
P_pool_rules = P_pool_rules[cols_to_keep]

print(f"Pool P de {len(P_pool_rules)} motifs (règles) généré :")
print(P_pool_rules)

Pool P de 6 motifs (règles) généré :
  antecedents consequents  support  confidence      lift  coverage  length
0    (0_pain)    (0_lait)      0.4    0.666667  0.833333       0.6       2
1    (0_lait)    (0_pain)      0.4    0.500000  0.833333       0.8       2
2    (0_pain)  (0_beurre)      0.4    0.666667  0.833333       0.6       2
3  (0_beurre)    (0_pain)      0.4    0.500000  0.833333       0.8       2
4  (0_beurre)    (0_lait)      0.6    0.750000  0.937500       0.8       2
5    (0_lait)  (0_beurre)      0.6    0.750000  0.937500       0.8       2


## Etape 2 : Échantillonnage interactif
- Définir au moins une stratégie de scoring composite (ex. combinaison normalisée de support, lift, surprise, pénalité de redondance).
- Implémenter un algorithme d’échantillonnage pondéré (importance
sampling ou MCMC léger) paramétrable par l’utilisateur (taille k,
remise/non-remise, ...).
- Intégrer un mécanisme simple de feedback utilisateur (like/dislike)
qui ré-pondère les motifs et permet une boucle interactive.

In [8]:
# Définir au moins une stratégie de scoring composite (ex. combinaison normalisée de support, lift, surprise, pénalité de redondance).

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Copie du pool pour ne pas modifier l'original
P_sampling = P_pool_rules.copy()

# Normalisation des métriques (exemple : lift et confidence)
metrics_to_normalize = ['lift', 'confidence']
P_sampling[metrics_to_normalize] = scaler.fit_transform(P_sampling[metrics_to_normalize])

# Définition d'un score composite simple (pondération 60% lift, 40% confidence)

def composite_score(row, w_support = 0.2, w_lift = 0.2, w_conf = 0.2,  w_surprise = 0.2, w_redundancy_penalty = 0.2): # TODO: ajuster la formule selon les besoins / pouvoir changer le type de score sélectionné via UI
    return w_lift * row['lift'] + w_conf * row['confidence'] + w_support * row['support'] + w_surprise * (1 - row['support']) - w_redundancy_penalty * (row['length'] / 10)
P_sampling['composite_score'] = P_sampling.apply(composite_score, axis=1)

# Ajout du poids de feedback (initialisé à 1) # TODO : intégrer feedback utilisateur réel via UI
P_sampling['feedback_weight'] = 1.0

# Le poids final pour l'échantillonnage sera (score * feedback)
P_sampling['final_sampling_weight'] = P_sampling['composite_score'] * P_sampling['feedback_weight']


print("\nPool P avec scores composites et poids de feedback :")
print(P_sampling[['antecedents', 'consequents', 'composite_score', 'feedback_weight', 'final_sampling_weight']].head())


Pool P avec scores composites et poids de feedback :
  antecedents consequents  composite_score  feedback_weight  \
0    (0_pain)    (0_lait)         0.293333              1.0   
1    (0_lait)    (0_pain)         0.160000              1.0   
2    (0_pain)  (0_beurre)         0.293333              1.0   
3  (0_beurre)    (0_pain)         0.160000              1.0   
4  (0_beurre)    (0_lait)         0.560000              1.0   

   final_sampling_weight  
0               0.293333  
1               0.160000  
2               0.293333  
3               0.160000  
4               0.560000  


In [9]:
# Implémenter un algorithme d’échantillonnage pondéré (importance sampling ou MCMC léger) paramétrable par l’utilisateur (taille k, remise/non-remise, ...).
import numpy as np

# Proposal: index uniform
# Acceptance: min(1, w_prop / w_cur) (similaire à Metropolis)
# replace=True : renvoie k échantillons (avec répétition)
# replace=False: renvoie k échantillons distincts (maximum max_iters itérations)
def light_mcmc(P_df, k=10, replace=True, init_idx=None, max_iters=10000, random_state=None):
   
    weights = P_df['final_sampling_weight'].values.copy()
    weights = np.maximum(weights, 0.0)
    
    # Si tous les poids sont nuls, utiliser des poids uniformes
    if weights.sum() == 0:
        weights = np.ones_like(weights)

    rng = np.random.default_rng(random_state)
    
    n = len(weights)
    if n == 0:
        return P_df.copy()

    if init_idx is None:
        cur = int(rng.integers(n))
    else:
        cur = int(init_idx) % n

    samples = []
    visited = set()
    iters = 0

    while (len(samples) < k) and (iters < max_iters):
        prop = int(rng.integers(n))
        w_cur = weights[cur] if weights[cur] > 0 else 1e-12
        w_prop = weights[prop] if weights[prop] > 0 else 1e-12
        alpha = min(1.0, (w_prop / w_cur))
        
        if rng.random() < alpha:
            cur = prop
            
        if replace:
            samples.append(cur)
        else:
            if cur not in visited:
                samples.append(cur)
                visited.add(cur)
        iters += 1

    # Si on n'a pas réussi à collecter k échantillons distincts, compléter par un tirage pondéré sans replacement parmi les restants
    if (not replace) and (len(samples) < k):
        remaining = [i for i in range(n) if i not in set(samples)]
        if remaining:
            rem_weights = weights[remaining].astype(float)
            rem_weights = np.maximum(rem_weights, 0.0)
            if rem_weights.sum() == 0:
                rem_probs = np.ones(len(remaining)) / len(remaining)
            else:
                rem_probs = rem_weights / rem_weights.sum()
            need = k - len(samples)
            chosen = rng.choice(remaining, size=min(need, len(remaining)), replace=False, p=rem_probs)
            samples.extend(list(chosen))

    
    if len(samples) == 0:
        raise Exception("Aucun échantillon n'a pu être collecté.")
    
    # Return les lignes correspondant aux indices sélectionnés
    return P_df.iloc[samples].reset_index(drop=True)

print('Exemples MCMC léger\n')

res_replace = light_mcmc(P_sampling, k=5, replace=True, random_state=42)

print('Avec remise :\n')
print(res_replace[['antecedents','consequents','final_sampling_weight']])

res_no_replace = light_mcmc(P_sampling, k=5, replace=False, random_state=42)

print('Sans remise (distincts) :\n')
print(res_no_replace[['antecedents','consequents','final_sampling_weight']])

Exemples MCMC léger

Avec remise :

  antecedents consequents  final_sampling_weight
0  (0_beurre)    (0_lait)                   0.56
1  (0_beurre)    (0_lait)                   0.56
2    (0_lait)  (0_beurre)                   0.56
3    (0_lait)  (0_beurre)                   0.56
4    (0_lait)  (0_beurre)                   0.56
Sans remise (distincts) :

  antecedents consequents  final_sampling_weight
0  (0_beurre)    (0_lait)               0.560000
1    (0_lait)  (0_beurre)               0.560000
2    (0_pain)    (0_lait)               0.293333
3  (0_beurre)    (0_pain)               0.160000
4    (0_pain)  (0_beurre)               0.293333


In [10]:
P_sampling

Unnamed: 0,antecedents,consequents,support,confidence,lift,coverage,length,composite_score,feedback_weight,final_sampling_weight
0,(0_pain),(0_lait),0.4,0.666667,0.0,0.6,2,0.293333,1.0,0.293333
1,(0_lait),(0_pain),0.4,0.0,0.0,0.8,2,0.16,1.0,0.16
2,(0_pain),(0_beurre),0.4,0.666667,0.0,0.6,2,0.293333,1.0,0.293333
3,(0_beurre),(0_pain),0.4,0.0,0.0,0.8,2,0.16,1.0,0.16
4,(0_beurre),(0_lait),0.6,1.0,1.0,0.8,2,0.56,1.0,0.56
5,(0_lait),(0_beurre),0.6,1.0,1.0,0.8,2,0.56,1.0,0.56


In [11]:
# Intégrer un mécanisme simple de feedback utilisateur (like/dislike) qui ré-pondère les motifs et permet une boucle interactive.
def _make_key(row):
    return str(row['antecedents']) + '||' + str(row['consequents'])

def apply_like(P_df, idx=None, key=None, factor=1.25):
    # key : clé texte 'antecedents||consequents'
    # ou idx : int
    
    if idx is not None:
        P_df.at[int(idx), 'feedback_weight'] = float(factor)
    elif key is not None:
        mask = (P_df.apply(_make_key, axis=1) == key)
        P_df.loc[mask, 'feedback_weight'] = P_df.loc[mask, 'feedback_weight'] * factor
    P_df['final_sampling_weight'] = P_df['composite_score'] * P_df['feedback_weight']
    return P_df

def apply_dislike(P_df, idx=None, key=None, factor=0.8):
    return apply_like(P_df, idx=idx, key=key, factor=factor)

def reset_feedback(P_df):
    P_df['feedback_weight'] = 1.0
    P_df['final_sampling_weight'] = P_df['composite_score'].copy()
    return P_df

print('Exemple feedback programmatique :')

print('Top avant feedback :')
display(P_sampling.sort_values('final_sampling_weight', ascending=False).head(5)[['antecedents','consequents','final_sampling_weight','feedback_weight']])

# simuler un like sur le motif ayant le plus grand poids actuel
top_idx = P_sampling['final_sampling_weight'].idxmax()
apply_like(P_sampling, idx=top_idx, factor=1.5)

print('\nTop après like multiplicatif sur le top 1 :')
display(P_sampling.sort_values('final_sampling_weight', ascending=False).head(5)[['antecedents','consequents','final_sampling_weight','feedback_weight']])

Exemple feedback programmatique :
Top avant feedback :


Unnamed: 0,antecedents,consequents,final_sampling_weight,feedback_weight
4,(0_beurre),(0_lait),0.56,1.0
5,(0_lait),(0_beurre),0.56,1.0
0,(0_pain),(0_lait),0.293333,1.0
2,(0_pain),(0_beurre),0.293333,1.0
1,(0_lait),(0_pain),0.16,1.0



Top après like multiplicatif sur le top 1 :


Unnamed: 0,antecedents,consequents,final_sampling_weight,feedback_weight
4,(0_beurre),(0_lait),0.84,1.5
5,(0_lait),(0_beurre),0.56,1.0
0,(0_pain),(0_lait),0.293333,1.0
2,(0_pain),(0_beurre),0.293333,1.0
1,(0_lait),(0_pain),0.16,1.0
