# Test implémentation de FastText avec Pytorch

In [1]:
import sys
sys.path.append("../")
sys.path.append("../src/")

## Chargement des données

In [2]:
import pandas as pd
df_orig = pd.read_parquet("../data/extraction_sirene_20220712.parquet", engine="pyarrow")

In [3]:
distinct_values = df_orig["APE_SICORE"].unique().tolist()
y_dict = {key: value for key, value in zip(distinct_values, range(len(distinct_values)))}

In [4]:
import pandas as pd
import yaml

df = df_orig.sample(frac=0.0001, random_state=1)

In [5]:
df.shape

(1087, 20)

## Pytorch

In [6]:
from pytorch_classifier.pytorch_trainer import PytorchTrainer

# Trainer module
trainer = PytorchTrainer()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
categorical_features = ['AUTO', 'NAT_SICORE', 'SURF', 'EVT_SICORE']

In [8]:
LEARNING_RATE = 3e-4
PATIENCE = 5
NUM_EPOCHS = 3

params = {
    "num_epochs": NUM_EPOCHS,
    "patience": PATIENCE,
    "train_proportion": 0.8,
    "batch_size": 64,
    "learning_rate": LEARNING_RATE,
    "buckets": 2000000,
    "embedding_dim": 120,
    "min_count": 3,
    "min_n": 3,
    "max_n": 4,
    "word_ngrams": 3,
    "sparse": True
}

In [9]:
from pytorch_classifier.pytorch_preprocessor import PytorchPreprocessor
from src.constants import TEXT_FEATURE, Y

pytorch_preprocessor = PytorchPreprocessor()
df_train_py, df_test_py, df_gu_py = pytorch_preprocessor.preprocess(
    df=df,
    y=Y,
    text_feature=TEXT_FEATURE,
    categorical_features=categorical_features,
)

In [10]:
df_train_py.head()

Unnamed: 0_level_0,LIB_SICORE,AUTO,NAT_SICORE,SURF,EVT_SICORE,APE_NIV1,APE_NIV2,APE_NIV3,APE_NIV4,APE_NIV5
LIA_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C14017958194,vent voitur occas produit diver march,2,0,0,7,G,45,451,4511,44
X54014003089,apicultur,9,1,0,0,A,1,14,149,149
G69016531995,support patrimoin familial immobili san activi...,2,1,0,5,L,68,683,6832,78
C69018478430,commerc produit alimentair,2,0,0,6,G,47,471,4711,119
G45017062449,construct immeubl vu vent,2,1,0,5,F,41,411,4110,269


In [11]:
pytorch_classifier = trainer.train(df_train_py, Y, TEXT_FEATURE, categorical_features, params)

0it [00:00, ?it/s]/3 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s]


Time taken to process 1 batch: 0.024994373321533203


RuntimeError: SparseAdam does not support dense gradients, please consider Adam instead

In [None]:
from pytorch_classifier.pytorch_evaluator import PytorchEvaluator

evaluator = PytorchEvaluator(model=pytorch_classifier, tokenizer=trainer.tokenizer)

In [None]:
evaluation = evaluator.evaluate(
    df_test_py, Y, TEXT_FEATURE, categorical_features, 5
)

In [None]:
evaluation