# Test d'implémentation de FastText avec PyTorch

In [1]:
import sys

# Make project modules importable from this notebook: "../" is what lets
# `src.constants` resolve and "../src/" is what lets `pytorch_classifier`
# resolve in the cells below. Both entries are relative to the current
# working directory of the kernel.
sys.path.extend(["../", "../src/"])

## Chargement des données

In [2]:
import pandas as pd
# Load the full extraction into memory; later cells only work on a small random
# sample of it. The path is relative, so the notebook has to be run from a
# directory where ../data/ resolves.
df_orig = pd.read_parquet("../data/extraction_sirene_20220712.parquet", engine="pyarrow")

In [3]:
# Collect the distinct APE_SICORE values and give each one a consecutive
# integer id starting at 0 (label -> id).
# NOTE(review): y_dict is not referenced by any other cell visible in this notebook.
distinct_values = df_orig["APE_SICORE"].unique().tolist()
y_dict = {label: index for index, label in enumerate(distinct_values)}

In [4]:
# NOTE(review): pandas is already imported in the data-loading cell, and yaml is
# not used by any cell visible in this notebook.
import pandas as pd
import yaml

# Keep a 0.01% random sample of the rows so the rest of the notebook runs
# quickly; random_state=1 fixes the seed of the draw.
df = df_orig.sample(frac=0.0001, random_state=1)

In [5]:
# Size of the sample as (rows, columns); the recorded run shows (1087, 20).
df.shape

(1087, 20)

## PyTorch

In [6]:
from pytorch_classifier.pytorch_trainer import PytorchTrainer

# Instantiate the project's trainer with its default constructor arguments.
# The same instance is reused later: `trainer.train(...)` fits the model and
# `trainer.tokenizer` is handed to the evaluator.
trainer = PytorchTrainer()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Columns passed as categorical features to the preprocessor, the trainer and
# the evaluator in the cells below.
categorical_features = [
    "AUTO",
    "NAT_SICORE",
    "SURF",
    "EVT_SICORE",
]

In [8]:
# Run-level settings that a reader is most likely to tune.
# NOTE(review): PATIENCE (5) is larger than NUM_EPOCHS (3); if the trainer uses
# patience for early stopping it cannot trigger in this run — confirm intent.
NUM_EPOCHS = 3
PATIENCE = 5
LEARNING_RATE = 3e-4

# Hyper-parameter dictionary handed to `trainer.train(...)`. Several key names
# (buckets, min_count, min_n, max_n, word_ngrams) match fastText's option names;
# how each one is interpreted is defined by PytorchTrainer, which is not
# visible from this notebook.
params = dict(
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    train_proportion=0.8,
    batch_size=64,
    learning_rate=LEARNING_RATE,
    buckets=2_000_000,
    embedding_dim=120,
    min_count=3,
    min_n=3,
    max_n=4,
    word_ngrams=3,
)

In [9]:
from pytorch_classifier.pytorch_preprocessor import PytorchPreprocessor
from src.constants import TEXT_FEATURE, Y

# Run the project preprocessor on the sampled frame. Its result is unpacked
# into three values; only the first two (train and test) are used by the cells
# visible below.
# NOTE(review): what the third value (df_gu_py) contains is not visible from
# this notebook — confirm against PytorchPreprocessor.preprocess.
pytorch_preprocessor = PytorchPreprocessor()
df_train_py, df_test_py, df_gu_py = pytorch_preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
)

In [10]:
# Preview the first rows of the preprocessed training frame.
df_train_py.head()

Unnamed: 0_level_0,LIB_SICORE,AUTO,NAT_SICORE,SURF,EVT_SICORE,APE_NIV1,APE_NIV2,APE_NIV3,APE_NIV4,APE_NIV5
LIA_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C14017958194,vent voitur occas produit diver march,2,0,0,7,G,45,451,4511,44
X54014003089,apicultur,9,1,0,0,A,1,14,149,149
G69016531995,support patrimoin familial immobili san activi...,2,1,0,5,L,68,683,6832,78
C69018478430,commerc produit alimentair,2,0,0,6,G,47,471,4711,119
G45017062449,construct immeubl vu vent,2,1,0,5,F,41,411,4110,269


In [11]:
# Fit the classifier on the training frame; `params` carries the
# hyper-parameters defined earlier. The returned object is later passed to
# PytorchEvaluator as `model`.
# NOTE(review): this variable has the same name as the `pytorch_classifier`
# package used in the notebook's `from ... import` statements. Those imports
# are unaffected, but a distinct name would be easier to read.
pytorch_classifier = trainer.train(df_train_py, Y, TEXT_FEATURE, categorical_features, params)

11it [00:08,  1.27it/s]0:00<?, ?it/s]


Moving GPU: 0.00013875961303710938
Forward: 0.5406155586242676
Backward: 1.2059204578399658
Update: 6.410187482833862


3it [00:00, 23.93it/s]
 33%|███▎      | 1/3 [00:08<00:17,  8.77s/it]

Epoch: 1 | train_loss: 6.95264, val_loss: 6.68026, lr: 3.00E-04, _patience: 5


11it [00:08,  1.31it/s]


Moving GPU: 0.00014090538024902344
Forward: 0.5402317047119141
Backward: 1.234227180480957
Update: 6.138352870941162


3it [00:00, 32.03it/s]
 67%|██████▋   | 2/3 [00:17<00:08,  8.61s/it]

Epoch: 2 | train_loss: 6.40798, val_loss: 6.25496, lr: 3.00E-04, _patience: 5


11it [00:08,  1.27it/s]


Moving GPU: 0.00016045570373535156
Forward: 0.5387780666351318
Backward: 1.2633168697357178
Update: 6.2366578578948975


3it [00:00, 18.43it/s]
100%|██████████| 3/3 [00:26<00:00,  8.70s/it]


Epoch: 3 | train_loss: 5.94232, val_loss: 5.89269, lr: 3.00E-04, _patience: 5


In [12]:
from pytorch_classifier.pytorch_evaluator import PytorchEvaluator

# Build the evaluator from the trained model and the tokenizer exposed by the
# trainer instance that produced it.
evaluator = PytorchEvaluator(model=pytorch_classifier, tokenizer=trainer.tokenizer)

In [13]:
# Score the trained model on the test frame returned by the preprocessor.
# NOTE(review): the trailing positional 5 is not explained anywhere in this
# notebook; the recorded result has keys accuracy_level_1 .. accuracy_level_5 —
# TODO confirm what this argument controls.
evaluation = evaluator.evaluate(
    df_test_py, Y, TEXT_FEATURE, categorical_features, 5
)

In [14]:
# Show the metrics dictionary. In the recorded run (0.01% sample, 3 epochs) the
# accuracies are low and accuracy_level_5 is 0.0, so treat this as a smoke test
# of the pipeline rather than a measure of model quality.
evaluation

{'accuracy_level_1': 0.08755760368663594,
 'accuracy_level_2': 0.059907834101382486,
 'accuracy_level_3': 0.03225806451612903,
 'accuracy_level_4': 0.027649769585253458,
 'accuracy_level_5': 0.0}