# Étape 1 : Prétraitement du Texte

In [1]:
# Charger les données (exemple avec le dataset de Kaggle)

import pandas as pd

df = pd.read_csv('Personal_Finance_Dataset.csv')
df = df.dropna()
df

Unnamed: 0,Date,Transaction Description,Category,Amount,Type
0,2020-01-02,Score each.,Food & Drink,1485.69,Expense
1,2020-01-02,Quality throughout.,Utilities,1475.58,Expense
2,2020-01-04,Instead ahead despite measure ago.,Rent,1185.08,Expense
3,2020-01-05,Information last everything thank serve.,Investment,2291.00,Income
4,2020-01-13,Future choice whatever from.,Food & Drink,1126.88,Expense
...,...,...,...,...,...
1495,2024-12-28,Quite as when.,Rent,514.09,Expense
1496,2024-12-28,Right analysis mention.,Entertainment,727.25,Expense
1497,2024-12-28,No couple debate must.,Investment,1425.00,Income
1498,2024-12-29,Discussion black follow.,Shopping,655.78,Expense


In [2]:
# Encoder les labels textuels en entiers

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['Category'])
num_classes = len(label_encoder.classes_)
df

Unnamed: 0,Date,Transaction Description,Category,Amount,Type,category_encoded
0,2020-01-02,Score each.,Food & Drink,1485.69,Expense,1
1,2020-01-02,Quality throughout.,Utilities,1475.58,Expense,9
2,2020-01-04,Instead ahead despite measure ago.,Rent,1185.08,Expense,5
3,2020-01-05,Information last everything thank serve.,Investment,2291.00,Income,3
4,2020-01-13,Future choice whatever from.,Food & Drink,1126.88,Expense,1
...,...,...,...,...,...,...
1495,2024-12-28,Quite as when.,Rent,514.09,Expense,5
1496,2024-12-28,Right analysis mention.,Entertainment,727.25,Expense,0
1497,2024-12-28,No couple debate must.,Investment,1425.00,Income,3
1498,2024-12-29,Discussion black follow.,Shopping,655.78,Expense,7


In [3]:
# Séparer les données

from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df,
    df['category_encoded'],
    test_size = 0.2,
    random_state = 42
)

train_texts = train_texts["Transaction Description"].fillna("").astype(str)
test_texts = test_texts["Transaction Description"].fillna("").astype(str)

print(f"df shape                     : {df.shape}")
print(f"df['category_encoded'] shape : {df['category_encoded'].shape}")
print(f"train_texts shape            : {train_texts.shape}")
print(f"test_texts shape             : {test_texts.shape}")
print(f"train_labels shape           : {train_labels.shape}")
print(f"test_labels shape            : {test_labels.shape}")

df shape                     : (1500, 6)
df['category_encoded'] shape : (1500,)
train_texts shape            : (1200,)
test_texts shape             : (300,)
train_labels shape           : (1200,)
test_labels shape            : (300,)


In [4]:
# Créer et adapter la couche de vectorisation

from tensorflow.keras import layers

max_features = 10000
sequence_length = 50
vectorize_layer = layers.TextVectorization(
    max_tokens = max_features,
    output_mode = 'int',
    output_sequence_length = sequence_length
)
vectorize_layer

<TextVectorization name=text_vectorization, built=False>

In [5]:
train_texts.info()

<class 'pandas.core.series.Series'>
Index: 1200 entries, 382 to 1126
Series name: Transaction Description
Non-Null Count  Dtype 
--------------  ----- 
1200 non-null   object
dtypes: object(1)
memory usage: 18.8+ KB


In [6]:
# Adapter la couche au vocabulaire des textes d'entraînement

vectorize_layer.adapt(train_texts)
vectorize_layer

<TextVectorization name=text_vectorization, built=False>

# Étape 2 : Construction du Modèle Séquentiel

In [7]:
import tensorflow as tf

embedding_dim = 128

model = tf.keras.Sequential([
    layers.Input(
        shape = (1,),
        dtype = tf.string
    ),
    vectorize_layer,
    layers.Embedding(
        input_dim = max_features + 1,
        output_dim = embedding_dim
    ),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation = 'relu'),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')
])
model

<Sequential name=sequential, built=True>

# Étape 3 : Callbacks, Compilation et Entraînement

In [8]:
# Callbacks

nlp_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model_nlp.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True
)

nlp_early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

In [9]:
# Compilation

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.summary()

In [13]:
# Entraînement

epochs = 10
history = model.fit(
    train_texts.values.astype(object),
    train_labels.values,
    validation_data = (test_texts.values.astype(object), test_labels.values),
    epochs = epochs,
    callbacks = [nlp_checkpoint_callback, nlp_early_stopping_callback]
)

Epoch 1/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.0960 - loss: 2.3060 - val_accuracy: 0.1267 - val_loss: 2.3022
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0991 - loss: 2.3002 - val_accuracy: 0.0900 - val_loss: 2.3082
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.1233 - loss: 2.2901 - val_accuracy: 0.0767 - val_loss: 2.3057
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.1644 - loss: 2.2691 - val_accuracy: 0.1100 - val_loss: 2.3054
