## BE Data Science

In [None]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


### Chargement des données brutes en mémoire


In [7]:
import pandas as pd
import csv

def read_ds(ds_name: str):
    rows = []
    
    with open(ds_name, "r", encoding="utf-8") as f:
        lines = f.readlines()
    
    sample_line = lines[0]
    sniffer = csv.Sniffer()
    try:
        dialect = sniffer.sniff(sample_line)
        delim = dialect.delimiter
    except csv.Error:
        delim = ","   
    
    for line in lines:
        line = line.strip()
        if not line:
            continue  
        
        parts = line.split(delim)
        rows.append(parts)
    

    max_len = max(len(r) for r in rows)
    
   
    columns = ["util", "navigateur"] + [f"action_{i}" for i in range(1, max_len - 1)]

    normalized_rows = [r + [None]*(max_len - len(r)) for r in rows]
    
    df = pd.DataFrame(normalized_rows, columns=columns)
    return df



In [8]:
features_train = read_ds("train.csv")
features_test = read_ds("test.csv")
features_train.shape, features_test.shape

((3279, 14470), (324, 7726))

In [9]:
features_train.head()

Unnamed: 0,util,navigateur,action_1,action_2,action_3,action_4,action_5,action_6,action_7,action_8,...,action_14459,action_14460,action_14461,action_14462,action_14463,action_14464,action_14465,action_14466,action_14467,action_14468
0,nuh,Firefox,Création d'un écran(infologic.core.accueil.Acc...,Affichage d'une dialogue,Exécution d'un bouton,Fermeture d'une dialogue,Affichage d'une dialogue,Exécution d'un bouton,Fermeture d'une dialogue,Création d'un écran(infologic.core.gui.control...,...,,,,,,,,,,
1,muz,Google Chrome,Création d'un écran(infologic.core.gui.control...,Création d'un écran(infologic.core.gui.control...,t5,Sélection d’un onglet(infologic.orga.modules.O...,t10,Exécution d'un bouton,t15,Sélection d’un onglet,...,,,,,,,,,,
2,zrx,Microsoft Edge,Affichage d'une dialogue(infologic.core.gui.co...,Exécution d'un bouton,Chainage,Fermeture d'une dialogue,Affichage d'une dialogue(infologic.acti.module...,Clic sur une grille d'historique de recherche,Raccourci,Fermeture d'une dialogue,...,,,,,,,,,,
3,pou,Firefox,Création d'un écran(infologic.core.gui.control...,t5,Exécution d'un bouton(MAINT),Affichage d'une dialogue,Fermeture d'une dialogue,Double-clic,Exécution d'un bouton,Lancement d'une stat(infologic.core.gui.contro...,...,,,,,,,,,,
4,ald,Google Chrome,Affichage d'une dialogue(infologic.acti.module...,t5,Exécution d'un bouton,Fermeture d'une dialogue,t10,Entrée en saisie dans un formulaire,t10,Affichage d'une dialogue,...,,,,,,,,,,


### Première analyse de ces données

In [20]:
features_train.iloc[:, :20].head()

Unnamed: 0,util,navigateur,action_1,action_2,action_3,action_4,action_5,action_6,action_7,action_8,action_9,action_10,action_11,action_12,action_13,action_14,action_15,action_16,action_17,action_18
0,nuh,Firefox,Création d'un écran(infologic.core.accueil.Acc...,Affichage d'une dialogue,Exécution d'un bouton,Fermeture d'une dialogue,Affichage d'une dialogue,Exécution d'un bouton,Fermeture d'une dialogue,Création d'un écran(infologic.core.gui.control...,t5,Exécution d'un bouton(MAINT),Affichage d'une dialogue,Fermeture d'une dialogue,Double-clic,Exécution d'un bouton,Lancement d'une stat(infologic.core.gui.contro...,Affichage d'un toast,Lancement d'une stat(infologic.core.gui.contro...,t10
1,muz,Google Chrome,Création d'un écran(infologic.core.gui.control...,Création d'un écran(infologic.core.gui.control...,t5,Sélection d’un onglet(infologic.orga.modules.O...,t10,Exécution d'un bouton,t15,Sélection d’un onglet,Exécution d'un bouton,Chainage,t20,Sélection d’un onglet(MAJ),t25,Sélection d'un écran(infologic.orga.modules.OR...,Sélection d'un écran,Exécution d'un bouton,Chainage,t30
2,zrx,Microsoft Edge,Affichage d'une dialogue(infologic.core.gui.co...,Exécution d'un bouton,Chainage,Fermeture d'une dialogue,Affichage d'une dialogue(infologic.acti.module...,Clic sur une grille d'historique de recherche,Raccourci,Fermeture d'une dialogue,Création d'un écran,t5,Clic sur une grille d'historique de recherche,Raccourci,Exécution d'un bouton,Affichage d'une dialogue,t10,Clic sur une grille d'historique de recherche,Raccourci,Exécution d'un bouton
3,pou,Firefox,Création d'un écran(infologic.core.gui.control...,t5,Exécution d'un bouton(MAINT),Affichage d'une dialogue,Fermeture d'une dialogue,Double-clic,Exécution d'un bouton,Lancement d'une stat(infologic.core.gui.contro...,t10,Filtrage / Tri(MAINT),Clic sur une grille d'historique de recherche,Sélection d'un flag,Filtrage / Tri,t15,Affichage d'une dialogue,Clic sur une grille d'historique de recherche,Affichage d'une dialogue,t20
4,ald,Google Chrome,Affichage d'une dialogue(infologic.acti.module...,t5,Exécution d'un bouton,Fermeture d'une dialogue,t10,Entrée en saisie dans un formulaire,t10,Affichage d'une dialogue,t15,Fermeture d'une dialogue,t20,Sélection d'un écran(infologic.crm.modules.CRM...,Exécution d'un bouton,Action de table,Affichage d'un toast,Affichage d'une dialogue,Affichage d'une dialogue,Exécution d'un bouton


In [23]:
number_unique_navigators = list(features_train['navigateur'].value_counts())
print(len(number_unique_navigators))

4


Let's now define functions that we might use later to see the graphs.

In [24]:
import warnings
from IPython.display import display, Markdown

# décorateurs utilitaires pour supprimer les avertissements de la sortie et imprimer un cadre de données dans un tableau Markdown.
def ignore_warnings(f):
    def _f(*args, **kwargs):
        warnings.filterwarnings('ignore')
        v = f(*args, **kwargs)
        warnings.filterwarnings('default')
        return v
    return _f

# affiche un DataFrame Pandas sous forme de tableau Markdown dans un notebook Jupyter.
def markdown_table(headNtail=False, use_index=True, title=None, precision=2):
    def _get_value(val): return str(round(val, precision) if isinstance(val, float) else val)
    def _format_row(row): 
        row_str = ""
        if use_index: row_str += f"|{str(row.name)}"
        for value in row.values: row_str += f"| {_get_value(value)}"
        return row_str + "|"
    def _get_str(df):
        return "\n".join(df.apply(_format_row, axis=1))
    def _deco(f):
        def _f(*args, **kwargs):
            df = f(*args, **kwargs)
            _str = f"#### {title}\n" if title else ""
            header = ([str(df.index.name)] if use_index else []) + df.columns.astype(str).to_list() 
            _str += f"|{'|'.join(header)}|" + f"\n|{'--|'*len(header)}\n" if header else None
            if headNtail:
                _str += _get_str(df.head())
                _str += "\n|...|...|\n"
                _str += _get_str(df.tail())
            else:
                _str += _get_str(df)
            display(Markdown(_str))
        return _f
    return _deco

# fonction utilitaire permettant d'obtenir une grille graphique à partir d'un nombre arbitraire de lignes/colonnes ou de données.
def get_grid(n, n_row=None, n_col=None, titles=None, figsize=(10, 8), wspace=.5, hspace=.5, **kwargs):
    if n_row: n_col= n_col or math.floor(n/n_row)
    elif n_col: n_row= n_row or math.ceil(n/n_col)
    else:
        n_row = math.ceil(math.sqrt(n))
        n_col = math.floor(n/n_row)
    fig, axs = plt.subplots(n_row, n_col, figsize=figsize, **kwargs)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    if titles is not None:
        for ax, title in zip(axs.flat, titles): ax.set_title(title)
    return fig, axs

In [29]:
features_train["util"]


0       nuh
1       muz
2       zrx
3       pou
4       ald
       ... 
3274    muz
3275    cjr
3276    fuz
3277    cjr
3278    fwf
Name: util, Length: 3279, dtype: object

In [33]:
user_nav_grouped = features_train.groupby("util")["navigateur"].apply(list)
print(user_nav_grouped)

util
aho    [Google Chrome, Google Chrome, Google Chrome, ...
ajo    [Firefox, Firefox, Firefox, Firefox, Firefox, ...
akx    [Google Chrome, Google Chrome, Google Chrome, ...
ald    [Google Chrome, Google Chrome, Google Chrome, ...
ats    [Firefox, Firefox, Firefox, Firefox, Firefox, ...
                             ...                        
zqs    [Microsoft Edge, Microsoft Edge, Microsoft Edg...
zro    [Firefox, Firefox, Firefox, Firefox, Firefox, ...
zrx    [Microsoft Edge, Microsoft Edge, Microsoft Edg...
zus    [Firefox, Firefox, Firefox, Firefox, Firefox, ...
zyk    [Google Chrome, Google Chrome, Google Chrome, ...
Name: navigateur, Length: 247, dtype: object


In [41]:
@markdown_table(title="Navigateur par utilisateur", headNtail=True)
def browsers_per_player(df):
    user_nav_grouped = df.groupby("util")["navigateur"].apply(list).reset_index()
    return user_nav_grouped

browser_user_df = browsers_per_player(features_train)



#### Navigateur par utilisateur
|None|util|navigateur|
|--|--|--|
|0| aho| ['Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome']|
|1| ajo| ['Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox']|
|2| akx| ['Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome']|
|3| ald| ['Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome']|
|4| ats| ['Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox']|
|...|...|
|242| zqs| ['Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge']|
|243| zro| ['Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox']|
|244| zrx| ['Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge', 'Microsoft Edge']|
|245| zus| ['Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox', 'Firefox']|
|246| zyk| ['Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome', 'Google Chrome']|

We can observe that the user is always using the same browser to do the actions. Let's see if some users still used several browsers.

In [44]:
user_nav_grouped = features_train.groupby("util")["navigateur"].apply(list).reset_index()


user_nav_grouped["unique_browsers_count"] = user_nav_grouped["navigateur"].apply(lambda x: len(set(x)))
filtered_users = user_nav_grouped[user_nav_grouped["unique_browsers_count"] > 1]


filtered_users = filtered_users[["util", "navigateur"]]

filtered_users.head()

Unnamed: 0,util,navigateur


There is absolutely no user that is using more than one browser. The users can be classified in different categories by the browsers they use. If this is then linked to any activity we can train on the dataset on those features.

In [48]:
@markdown_table(headNtail=True, title="Stats: Y distribution")
def get_Y_stats(df,key):
    y_counts = df[key].value_counts().reset_index()
    y_counts.columns = ["Y_value", "count"]
    return y_counts

get_Y_stats(features_train,"util")
get_Y_stats(features_train,"navigateur")


#### Stats: Y distribution
|None|Y_value|count|
|--|--|--|
|0| skm| 75|
|1| slq| 71|
|2| cjr| 46|
|3| flj| 42|
|4| hjs| 37|
|...|...|
|242| rff| 4|
|243| bez| 4|
|244| crn| 4|
|245| azn| 4|
|246| fyg| 4|

#### Stats: Y distribution
|None|Y_value|count|
|--|--|--|
|0| Firefox| 1466|
|1| Google Chrome| 1339|
|2| Microsoft Edge| 451|
|3| Opera| 23|
|...|...|
|0| Firefox| 1466|
|1| Google Chrome| 1339|
|2| Microsoft Edge| 451|
|3| Opera| 23|

Now we will summarize all the statistics that we can retrieve from our dataset in a table. 
Per user, per browser, we have:
- nb_actions = number of actions for a given user session
- temps_total = total duration of a given user session
- vitesse = average time per action
- ecart_type_temps = variable used to supplement speed information by analyzing the variability of time between actions
- taux_repetition = proportion of consecutive identical actions within the session. (we compare each action with the previous one, if it's the same we add one and then we divde the score by the number of actions)
- longueur_moyenne_sequence: Average length of consecutive identical action sequences.
- diversite_actions = Ratio of unique action types to total actions (measures behavioral variety).
- action_plus_frequente = The most frequently occurring action type in the session.
- %_Affichage = Percentage of actions related to “Affichage” (display/show events).
- %_Exécution = Percentage of actions related to “Exécution” (button executions or clicks).
- %_Fermeture = Percentage of actions related to “Fermeture” (closing dialogs or windows).
- %_Création = Percentage of actions related to “Création” (creating or opening new screens).
- %_Autres = Percentage of actions not included in the main categories above

In [34]:
def analyser_session_complete(session_str):
    parts = session_str.split(',')
    if len(parts) < 3:
        return None
    
    utilisateur = parts[0]
    navigateur = parts[1]
    actions = parts[2:]

    current_time = 0
    timeline = []

    for elem in actions:
        match = re.match(r't(\d+)', elem.strip())
        if match:
            current_time = int(match.group(1))
        else:
            timeline.append((current_time, elem.strip()))

    df = pd.DataFrame(timeline, columns=["temps", "action"])
    if df.empty:
        return None

    #Nettoyage et extraction de type d’action
    df["action_type"] = df["action"].apply(lambda x: x.split("(")[0].strip())

    #Statistiques de base
    nb_actions = len(df)
    temps_total = df["temps"].max() - df["temps"].min() if nb_actions > 1 else 0

    df["delta"] = df["temps"].diff().fillna(0)
    vitesse = temps_total / nb_actions if nb_actions > 1 else 0
    ecart_type_temps = df["delta"].std()

    #Répétitions et diversités des actions
    diversite_actions = df["action_type"].nunique() / nb_actions
    action_plus_frequente = df["action_type"].value_counts().idxmax()

    df["repetition"] = (df["action_type"] == df["action_type"].shift(1)).astype(int)
    taux_repetition = df["repetition"].mean()

    sequences = []
    current_seq = 1
    for i in range(1, len(df)):
        if df["action_type"].iloc[i] == df["action_type"].iloc[i - 1]:
            current_seq += 1
        else:
            sequences.append(current_seq)
            current_seq = 1
    sequences.append(current_seq)
    longueur_moyenne_sequence = sum(sequences) / len(sequences) if sequences else 0

    #Classification des types d'actions
    def freq(motcle):
        return (df["action_type"].str.contains(motcle, case=False, regex=False)).mean()

    freq_affichage = freq("Affichage")
    freq_execution = freq("Exécution")
    freq_fermeture = freq("Fermeture")
    freq_creation = freq("Création")
    freq_autres = 1 - (freq_affichage + freq_execution + freq_fermeture + freq_creation)

    return {
        "utilisateur": utilisateur,
        "navigateur": navigateur,
        "nb_actions": nb_actions,
        "temps_total": temps_total,
        "vitesse": round(vitesse, 2),
        "ecart_type_temps": round(ecart_type_temps, 2),
        "taux_repetition": round(taux_repetition, 3),
        "longueur_moyenne_sequence": round(longueur_moyenne_sequence, 2),
        "diversite_actions": round(diversite_actions, 2),
        "action_plus_frequente": action_plus_frequente,
        "%_Affichage": round(freq_affichage, 2),
        "%_Exécution": round(freq_execution, 2),
        "%_Fermeture": round(freq_fermeture, 2),
        "%_Création": round(freq_creation, 2),
        "%_Autres": round(freq_autres, 2)
    }

# --- Affichage ---
pd.options.display.float_format = '{:.2f}'.format
display(df_stats)


Unnamed: 0,utilisateur,navigateur,nb_actions,temps_total,vitesse,ecart_type_temps,taux_repetition,longueur_moyenne_sequence,diversite_actions,action_plus_frequente,%_Affichage,%_Exécution,%_Fermeture,%_Création,%_Autres
0,nuh,Firefox,6,10,1.67,2.58,0.0,1.0,0.83,Affichage d'une dialogue,0.33,0.17,0.33,0.17,0.0
1,abc,Chrome,4,20,5.0,5.77,0.0,1.0,1.0,Affichage d'un écran,0.5,0.25,0.25,0.0,0.0
2,xyz,Edge,4,24,6.0,4.0,0.0,1.0,1.0,Affichage d'une fenêtre,0.5,0.25,0.25,0.0,0.0
3,jcp,Firefox,3,8,2.67,2.31,0.0,1.0,1.0,Affichage d'un écran,0.33,0.0,0.33,0.0,0.33
4,mld,Chrome,4,9,2.25,1.5,0.0,1.0,1.0,Création d'un écran,0.25,0.25,0.25,0.25,0.0
5,kty,Edge,3,4,1.33,1.15,0.0,1.0,1.0,Affichage d'une boîte,0.33,0.0,0.67,0.0,0.0


### Construction de caractéristiques

#### Traitements préliminaires

Let's inspect if there is any outliers in our dataset. To do so, we need to check the occurences of actions and remove the ones that occur the less. 

We first check what are the actions and we want to see if there are any actions that are out of the formal ones. 

In [58]:
def get_action_counts(df):
    action_cols = [col for col in df.columns if col.startswith("action_")]
    all_actions = df[action_cols].values.ravel()
    all_actions_series = pd.Series(all_actions).dropna()
    action_counts = all_actions_series.value_counts()
    
    return action_counts

action_counts = get_action_counts(features_train)

action_counts_df = action_counts.reset_index()
action_counts_df.columns = ["action", "count"]
action_counts_df.head()



Unnamed: 0,action,count
0,Exécution d'un bouton,265323
1,Saisie dans un champ,127325
2,Lancement d'une action générique,115069
3,Fermeture d'une dialogue,90338
4,Exécution d'un bouton1,89331


In [57]:
action_counts_df.tail()

Unnamed: 0,action,count
9440,Sélection d'un écran<DEF_03/24>$MAJ$,1
9441,Filtrage / Tri(infologic.acti.modules.AT_ACTIV...,1
9442,Dissimulation d'une arborescence(infologic.cor...,1
9443,Exécution d'un bouton<Défaut>1,1
9444,Chainage(MAINT)$EDI$,1


We can see that there are actions that occure only 1 time which is a very odd data and we don't want to train on those values. We are opting for a frequency-based filtering approach. We chose to keep 99% of the total informations in the df. 

In [64]:
p = 0.99
def filter_actions(df,list_actions,p):
    action_cols = [col for col in df.columns if col.startswith("action_")]
    cumulative_pct = (list_actions.cumsum() / list_actions.sum())
    actions_to_keep = cumulative_pct[cumulative_pct <= p].index

    print(f"Keeping {len(actions_to_keep)} actions out of {len(list_actions)} "
          f"({100 * len(actions_to_keep) / len(list_actions):.2f}% of unique actions).")

    df_filtered = df.copy()
    df_filtered[action_cols] = df_filtered[action_cols].applymap(
        lambda x: x if x in actions_to_keep else None
    )

    return df_filtered, actions_to_keep


filtered_train_data, actions_to_keep = filter_actions(features_train,action_counts,0.99)


Keeping 3124 actions out of 9445 (33.08% of unique actions).


In [45]:
# ============================================================
#  PREDICTION D'UTILISATEUR À PARTIR DE SESSIONS D'ACTIONS
#  ---- version paramétrable pour tests rapides ----
# ============================================================

import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# ============================================================
# ⚙️  SECTION PARAMÈTRES DE TEST
# ============================================================

CONFIG = {
    "hidden_layers": [50, 60, 40, 56, 32],  # taille de chaque couche cachée
    "activation": "sigmoid",                # fonction d’activation : sigmoid, relu, tanh...
    "dropout_rate": 0.3,                    # taux de Dropout sur chaque couche (0 = désactivé)
    "epochs": 50,                           # nombre d’itérations d’apprentissage
    "batch_size": 16,                       # taille du batch
    "optimizer": "adam",                    # optimiseur (adam, sgd, rmsprop, etc.)
    "loss": "categorical_crossentropy",     # fonction de perte
    "test_size": 0.2,                       # proportion validation
    "random_state": 42,                     # reproductibilité
}

# ============================================================
# 1️⃣  Fonction : extraction des statistiques d'une session
# ============================================================
def analyser_session_complete(session_str):
    parts = session_str.split(',')
    utilisateur = parts[0]
    navigateur = parts[1]
    actions = parts[2:]

    current_time = 0
    timeline = []

    for elem in actions:
        match = re.match(r't(\d+)', elem.strip())
        if match:
            current_time = int(match.group(1))
        else:
            timeline.append((current_time, elem.strip()))

    df = pd.DataFrame(timeline, columns=["temps", "action"])
    if df.empty:
        return None

    # --- Calculs temporels ---
    df["delta"] = df["temps"].diff().fillna(0)
    moyenne_reelle = df["delta"].mean()
    ecart_type_temps = df["delta"].std()
    nb_actions = len(df)
    temps_total = df["temps"].iloc[-1] - df["temps"].iloc[0] if len(df) > 1 else 0
    moyenne_globale = temps_total / nb_actions if nb_actions > 0 else 0

    # --- Diversité ---
    nb_actions_uniques = df["action"].nunique()
    diversite_actions = nb_actions_uniques / nb_actions if nb_actions > 0 else 0

    # --- Répétitions ---
    df["repetition"] = (df["action"] == df["action"].shift(1)).astype(int)
    taux_repetition = df["repetition"].mean()

    sequences = []
    current_seq = 1
    for i in range(1, len(df)):
        if df["action"].iloc[i] == df["action"].iloc[i - 1]:
            current_seq += 1
        else:
            sequences.append(current_seq)
            current_seq = 1
    sequences.append(current_seq)
    longueur_moyenne_sequence = np.mean(sequences)

    return {
        "utilisateur": utilisateur,
        "navigateur": navigateur,
        "moyenne_globale": moyenne_globale,
        "moyenne_reelle": moyenne_reelle,
        "ecart_type_temps": ecart_type_temps,
        "nb_actions": nb_actions,
        "nb_actions_uniques": nb_actions_uniques,
        "diversite_actions": diversite_actions,
        "taux_repetition": taux_repetition,
        "longueur_moyenne_sequence": longueur_moyenne_sequence,
    }

# ============================================================
# 2️⃣  Chargement et prétraitement des données
# ============================================================
train_df = read_ds("train.csv")
test_df = read_ds("test.csv")

train_stats = [analyser_session_complete(s) for s in train_df["session"] if s is not None]
test_stats = [analyser_session_complete(s) for s in test_df["session"] if s is not None]

train_features = pd.DataFrame(train_stats)
test_features = pd.DataFrame(test_stats)

X = train_features.drop(columns=["utilisateur", "navigateur"])
y = train_features["utilisateur"]

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test_features.drop(columns=["utilisateur", "navigateur"], errors='ignore'))

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y_categorical, test_size=CONFIG["test_size"], random_state=CONFIG["random_state"]
)

# ============================================================
# 3️⃣  Construction dynamique du modèle
# ============================================================
model = Sequential()

# Couche d’entrée
model.add(Dense(CONFIG["hidden_layers"][0], activation=CONFIG["activation"], input_shape=(X_train.shape[1],)))
if CONFIG["dropout_rate"] > 0:
    model.add(Dropout(CONFIG["dropout_rate"]))

# Couches cachées restantes
for neurons in CONFIG["hidden_layers"][1:]:
    model.add(Dense(neurons, activation=CONFIG["activation"]))
    if CONFIG["dropout_rate"] > 0:
        model.add(Dropout(CONFIG["dropout_rate"]))

# Couche de sortie
model.add(Dense(y_categorical.shape[1], activation='softmax'))

model.compile(optimizer=CONFIG["optimizer"], loss=CONFIG["loss"], metrics=['accuracy'])
model.summary()

# ============================================================
# 4️⃣  Entraînement + suivi du F1-score
# ============================================================
epochs = CONFIG["epochs"]
f1_train_scores = []
f1_val_scores = []

for epoch in range(epochs):
    model.fit(X_train, y_train, epochs=1, batch_size=CONFIG["batch_size"], verbose=0)
    
    y_train_pred = model.predict(X_train, verbose=0)
    y_val_pred = model.predict(X_val, verbose=0)
    
    y_train_classes = np.argmax(y_train_pred, axis=1)
    y_val_classes = np.argmax(y_val_pred, axis=1)
    y_train_true = np.argmax(y_train, axis=1)
    y_val_true = np.argmax(y_val, axis=1)
    
    f1_train = f1_score(y_train_true, y_train_classes, average='macro')
    f1_val = f1_score(y_val_true, y_val_classes, average='macro')
    
    f1_train_scores.append(f1_train)
    f1_val_scores.append(f1_val)
    
    print(f"Epoch {epoch+1:02d} — F1 train: {f1_train:.3f} | F1 val: {f1_val:.3f}")

# ============================================================
# 5️⃣  Visualisation F1-score
# ============================================================
plt.figure(figsize=(8, 5))
plt.plot(range(1, epochs + 1), f1_train_scores, label='Train F1-score')
plt.plot(range(1, epochs + 1), f1_val_scores, label='Validation F1-score')
plt.xlabel("Epochs")
plt.ylabel("F1-score (macro)")
plt.title("Evolution of F1-score during training")
plt.legend()
plt.grid(True)
plt.show()

# ============================================================
# 6️⃣  Rapport et prédiction finale
# ============================================================
print("\nFinal Classification Report (Validation):")
print(classification_report(y_val_true, y_val_classes, target_names=encoder.classes_))

predictions = model.predict(X_test_scaled)
predicted_users = encoder.inverse_transform(np.argmax(predictions, axis=1))
test_features["utilisateur_predit"] = predicted_users

print("\n✅ Prédictions sur le fichier test :")
display(test_features[["navigateur", "utilisateur_predit"] + list(X.columns)])


KeyError: 'session'

Model 2: MLP

In [None]:
# ============================
# CONFIG 
# ============================
CONFIG = {
    # model architecture: list = number of neurons per hidden layer
    "hidden_layers": [512, 256, 128, 64],  #En réaugmentant à la fin on obtient de moins bons résultats, finir à 32 aussi
    "activation": "tanh",                   #can test relu (10x less performant in the tests), tanh meilleur que sig
    "dropout_rate": 0.2, #0.2 better than 0.3
    "epochs": 70, # avec tanh on voit une tangente à environ 65
    "batch_size": 32,
    "optimizer": "adam", #weights optimisation
    "loss": "categorical_crossentropy",
    "test_size": 0.1,
    "random_state": 42,
    # feature engineering
    "top_k_actions": 50,          
    "use_browser_ohe": True,         # one-hot encode browser
    # file names
    "train_file": "train.csv",
    "test_file": "test.csv",
    "submission_file": "submission.csv",
    # saving
    "save_model_path": "best_model.h5"
}

# ============================
# Imports
# ============================
import re, os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# ============================
# Utilities: parse & clean
# ============================
def read_ds(ds_name: str, has_label=True):
    sessions = []
    if not os.path.exists(ds_name):
        raise FileNotFoundError(f"{ds_name} not found")
    with open(ds_name, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line: 
                continue
            if (line.startswith('"') and line.endswith('"')) or (line.startswith("'") and line.endswith("'")):
                line = line[1:-1]
            parts = line.split(',')
            if has_label:
                util = parts[0]
                browser = parts[1] if len(parts) > 1 else ""
                actions = parts[2:]
            else:
                util = None
                browser = parts[0] if len(parts) > 1 else ""
                actions = parts[1:] if len(parts) > 1 else parts
            sessions.append({"util": util, "browser": browser, "raw": line, "parts": actions})
    return pd.DataFrame(sessions)

def filter_action(value: str):
    if not isinstance(value, str):
        return ""
    if value.startswith("t") and value[1:].isdigit():
        return ""
    for delim in ["(", "<", "$", "1"]:
        if delim in value:
            idx = value.find(delim)
            if idx > 0:
                value = value[:idx]
    return value.strip()

# ============================
# Feature builder
# ============================
def session_to_features(parts):
    times, actions_clean, timeline = [], [], []
    current_time = 0
    for token in parts:
        tok = token.strip()
        m = re.match(r'^t(\d+)$', tok)
        if m:
            current_time = int(m.group(1))
            continue
        timeline.append((current_time, tok))
        a = filter_action(tok)
        if a:
            actions_clean.append(a)
    if not timeline:
        return {
            "nb_actions": 0, "nb_actions_unique": 0, "temps_total": 0.0,
            "moyenne_delta": 0.0, "std_delta": 0.0, "taux_repetition": 0.0,
            "longueur_moyenne_sequence": 0.0, "diversite_actions": 0.0,
            "actions_joined": ""
        }
    times = [t for t, _ in timeline]
    deltas = [0.0] + [times[i]-times[i-1] for i in range(1,len(times))]
    nb_actions = len(timeline)
    nb_actions_unique = len(set(actions_clean))
    temps_total = float(times[-1]-times[0]) if len(times)>1 else 0.0
    moyenne_delta = float(np.mean(deltas))
    std_delta = float(np.std(deltas, ddof=1)) if len(deltas)>1 else 0.0
    rep_flags = [1 if filter_action(timeline[i][1])==filter_action(timeline[i-1][1]) else 0 for i in range(1,len(timeline))]
    taux_repetition = float(np.mean(rep_flags)) if rep_flags else 0.0
    seqs, cur = [], 1
    for i in range(1,len(actions_clean)):
        if actions_clean[i]==actions_clean[i-1]: cur+=1
        else: seqs.append(cur); cur=1
    if actions_clean: seqs.append(cur)
    longueur_moy_seq = float(np.mean(seqs)) if seqs else 0.0
    diversite = float(nb_actions_unique/nb_actions) if nb_actions>0 else 0.0
    return {
        "nb_actions": nb_actions, "nb_actions_unique": nb_actions_unique, "temps_total": temps_total,
        "moyenne_delta": moyenne_delta, "std_delta": std_delta, "taux_repetition": taux_repetition,
        "longueur_moyenne_sequence": longueur_moy_seq, "diversite_actions": diversite,
        "actions_joined": " ".join([a.replace(" ","_") for a in actions_clean])
    }

def build_features_df(df_in, is_train=True):
    rows = []
    for _, row in df_in.iterrows():
        feats = session_to_features(row["parts"])
        out = feats.copy()
        out["util"] = row["util"] if is_train else None
        out["browser"] = row["browser"]
        out["raw"] = row["raw"]
        rows.append(out)
    return pd.DataFrame(rows)

# ============================
# Main pipeline
# ============================
def main_pipeline(config=CONFIG):
    print("Loading data...")
    train_raw = read_ds(config["train_file"], True)
    test_raw = read_ds(config["test_file"], False)

    print("Building basic features...")
    train_basic = build_features_df(train_raw)
    test_basic = build_features_df(test_raw)

    train_basic = train_basic[train_basic["nb_actions"]>0].reset_index(drop=True)
    test_basic = test_basic[test_basic["nb_actions"]>0].reset_index(drop=True)
    print(f"Train: {len(train_basic)}, Test: {len(test_basic)}")

    # Actions vectorization
    vectorizer = CountVectorizer(max_features=config["top_k_actions"])
    vectorizer.fit(train_basic["actions_joined"].fillna(""))
    X_actions_train = vectorizer.transform(train_basic["actions_joined"].fillna("")).toarray()
    X_actions_test = vectorizer.transform(test_basic["actions_joined"].fillna("")).toarray()

    # Numeric features
    numeric_cols = ["nb_actions", "nb_actions_unique", "temps_total", "moyenne_delta",
                    "std_delta", "taux_repetition", "longueur_moyenne_sequence", "diversite_actions"]
    scaler = StandardScaler()
    X_num_train = scaler.fit_transform(train_basic[numeric_cols].fillna(0).values)
    X_num_test = scaler.transform(test_basic[numeric_cols].fillna(0).values)

    # Browser encoding
    if config["use_browser_ohe"]:
        ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        X_browser_train = ohe.fit_transform(train_basic[["browser"]].fillna("UNK"))
        X_browser_test = ohe.transform(test_basic[["browser"]].fillna("UNK"))
    else:
        X_browser_train = np.zeros((len(train_basic),0))
        X_browser_test = np.zeros((len(test_basic),0))

    # Combine features
    X_train_full = np.hstack([X_num_train, X_browser_train, X_actions_train])
    X_test_full = np.hstack([X_num_test, X_browser_test, X_actions_test])

    # Labels
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(train_basic["util"].astype(str))
    n_classes = len(label_encoder.classes_)
    print(f"Number of classes: {n_classes}")

    # Train/Val split
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_full, y_train, test_size=config["test_size"], random_state=config["random_state"],
        stratify=y_train if len(np.unique(y_train))>1 else None
    )

    # Build model
    tf.keras.backend.clear_session()
    model = Sequential()
    input_dim = X_tr.shape[1]
    if not config["hidden_layers"]:
        model.add(Dense(n_classes, activation='softmax', input_shape=(input_dim,)))
    else:
        model.add(Dense(config["hidden_layers"][0], activation=config["activation"], input_shape=(input_dim,)))
        if config["dropout_rate"]>0: model.add(Dropout(config["dropout_rate"]))
        for neurons in config["hidden_layers"][1:]:
            model.add(Dense(neurons, activation=config["activation"]))
            if config["dropout_rate"]>0: model.add(Dropout(config["dropout_rate"]))
        model.add(Dense(247, activation='softmax'))  #247 different users

    model.compile(optimizer=config["optimizer"], loss=config["loss"], metrics=['accuracy'])
    model.summary()

    # Training
    checkpoint = ModelCheckpoint(config["save_model_path"], monitor='val_loss', save_best_only=True, verbose=1)
    f1_tr_list, f1_val_list = [], []
    best_f1_val, best_epoch = -1, -1

    for epoch in range(config["epochs"]):
        model.fit(X_tr, tf.keras.utils.to_categorical(y_tr, n_classes), epochs=1,
                  batch_size=config["batch_size"],
                  validation_data=(X_val, tf.keras.utils.to_categorical(y_val, n_classes)),
                  callbacks=[checkpoint], verbose=0)

        y_pred_tr = np.argmax(model.predict(X_tr, verbose=0), axis=1)
        y_pred_val = np.argmax(model.predict(X_val, verbose=0), axis=1)

        f1_tr = f1_score(y_tr, y_pred_tr, average='macro', zero_division=0)
        f1_val = f1_score(y_val, y_pred_val, average='macro', zero_division=0)
        f1_tr_list.append(f1_tr)
        f1_val_list.append(f1_val)

        print(f"Epoch {epoch+1:02d}/{config['epochs']} — F1 train: {f1_tr:.4f} | F1 val: {f1_val:.4f}")

        if f1_val>best_f1_val:
            best_f1_val, best_epoch = f1_val, epoch+1
            model.save(config["save_model_path"])
            print(f"  -> New best F1 val {best_f1_val:.4f}, model saved.")

    # Plot F1
    plt.figure(figsize=(8,5))
    plt.plot(range(1, config["epochs"]+1), f1_tr_list, label="F1 train")
    plt.plot(range(1, config["epochs"]+1), f1_val_list, label="F1 val")
    plt.xlabel("Epoch"); plt.ylabel("F1 (macro)"); plt.legend(); plt.grid(True)
    plt.show()

    # Classification report
    y_val_pred_final = np.argmax(model.predict(X_val, verbose=0), axis=1)
    val_classes = np.unique(y_val)
    print("\nClassification report (validation):")
    print(classification_report(
        y_val,
        y_val_pred_final,
        labels=val_classes,
        target_names=[label_encoder.classes_[i] for i in val_classes],
        zero_division=0
    ))

    # Predict on test set
    best_model = tf.keras.models.load_model(config["save_model_path"])
    y_test_pred_proba = best_model.predict(X_test_full, verbose=0)
    y_test_pred = np.argmax(y_test_pred_proba, axis=1)
    y_test_labels = label_encoder.inverse_transform(y_test_pred)

    df_sub = pd.DataFrame({"prediction": y_test_labels})
    df_sub.index = df_sub.index+1
    df_sub.to_csv(config["submission_file"], index_label="RowId")
    print(f"Submission saved to {config['submission_file']} ({len(df_sub)} rows)")

    return {
        "model": best_model,
        "label_encoder": label_encoder,
        "vectorizer": vectorizer,
        "scaler": scaler,
        "ohe_browser": (ohe if config["use_browser_ohe"] else None),
        "train_basic": train_basic,
        "test_basic": test_basic,
        "history_f1": (f1_tr_list, f1_val_list)
    }

# Run pipeline
if __name__ == "__main__":
    artifacts = main_pipeline(CONFIG)