In [2]:
from transformers import pipeline,AutoTokenizer, TFCamembertForSequenceClassification
import pandas as pd
from tqdm import tqdm
import numpy as np

# Read the input CSV into `df` (path is relative to the notebook's working directory).
input_csv = '../data/avis/df_clean_noYC_lemma.csv'
df = pd.read_csv(input_csv)
# Last expression of the cell: preview the first rows.
df.head()

: 

In [None]:
# Not referenced by name below.
# NOTE(review): presumably imported so a missing tokenizer dependency fails early — confirm.
import sentencepiece

# Model identifier shared by the tokenizer and the model weights.
checkpoint = "tblard/tf-allocine"

# Load the tokenizer and the TensorFlow CamemBERT sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFCamembertForSequenceClassification.from_pretrained(checkpoint)

# Wrap both in a text-classification pipeline; this is the `classifier`
# used by the cells that follow.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
# Unused tokenizer settings, kept for reference:
#tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512,'return_tensors':'pt'}

In [None]:
# Smoke test: classify the text of the row labelled 0 and display the raw
# pipeline output as a small table.
first_text = df['text_total'][0]
outputs = classifier(first_text)
pd.DataFrame(outputs)

In [None]:
MAX_LENGTH = 512  # sequence-length cap, in tokens, applied by the tokenizer

# Classify every text one at a time.
#
# Predictions are collected in plain lists and written to the dataframe in a
# single assignment after the loop. The previous chained assignment
# (`df['label'][i] = ...`) triggers SettingWithCopyWarning and does not write
# to `df` at all when pandas Copy-on-Write is enabled; it also wrote strings
# into a float (NaN) column.
labels = []
scores = []
for i, text in tqdm(enumerate(df.text_total), total=df.shape[0]):
    try:
        # Let the tokenizer truncate to MAX_LENGTH *tokens* instead of slicing
        # the string to MAX_LENGTH *characters*, which discarded most of a
        # long text before the model ever saw it.
        prediction = classifier(text, truncation=True, max_length=MAX_LENGTH)[0]
        label, score = prediction['label'], prediction['score']
    except Exception as e:
        # Best effort: report the failing row and keep it as missing so the
        # lists stay aligned with the dataframe rows.
        print(f"Erreur lors du traitement du texte à l'indice {i}. Erreur: {e}")
        labels.append(np.nan)
        scores.append(np.nan)
    else:
        labels.append(label)
        scores.append(score)

# One value per row, in row order; rows that failed hold NaN.
df['label'] = labels
df['score'] = scores

In [None]:
# Fold polarity and model confidence into a single signed score:
# close to -1 means very likely negative, close to +1 very likely positive.
polarity_by_label = {"NEGATIVE": -1, "POSITIVE": 1}
df['label'] = df.label.replace(polarity_by_label)
df['sentiment_norm'] = df["label"] * df["score"]
df.head()

In [None]:
# Write the dataframe to disk with pandas' default CSV options
# (the index is therefore written as the first column).
output_csv = "../data/avis/gen_clean_lemma_sent_noYc_.csv"
df.to_csv(output_csv)

In [None]:
### Cleaning of `text_total`, applied in this order:
###   1. replace every character that is neither a word character nor
###      whitespace with a space,
###   2. collapse each run of whitespace into a single space,
###   3. lower-case,
###   4. strip leading/trailing whitespace.
df["text_total"] = (
    df["text_total"]
    # regex=True is required: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default, so the character class was
    # never applied. The raw string avoids invalid-escape warnings.
    .str.replace(r"[^\w\s]", " ", regex=True)
    # A single "  " -> " " pass leaves runs of three or more spaces behind
    # (common once punctuation has been turned into spaces).
    .str.replace(r"\s+", " ", regex=True)
    .str.lower()
    .str.strip()
)


In [None]:
import time

# Benchmark a few batch sizes on a small sample to pick the fastest one.
# Reuses the `classifier` pipeline built earlier in the notebook.

texts = df['text_total'].tolist()

# candidate batch sizes to try
batch_sizes = [16, 32, 35, 38,40]

# number of batches sent to the pipeline for each candidate
num_batches_to_test = 2

# measured throughput (texts per second) keyed by batch size
performance = {}

for batch_size in batch_sizes:
    # Texts actually used for this candidate; may be fewer than
    # num_batches_to_test * batch_size when the dataset is small.
    sample = texts[:num_batches_to_test * batch_size]

    start_time = time.perf_counter()
    for i in range(0, len(sample), batch_size):
        # batch_size must be forwarded to the pipeline: slicing the list
        # alone does not change how the pipeline groups its inputs.
        # NOTE(review): pipeline batching is documented for PyTorch models;
        # confirm it has an effect with the TensorFlow model loaded above.
        # Truncation keeps over-long texts from failing the run.
        classifier(
            sample[i:i + batch_size],
            batch_size=batch_size,
            truncation=True,
            max_length=512,
        )
    time_taken = time.perf_counter() - start_time

    # Throughput is based on the number of texts really processed, not the
    # nominal batch_size * num_batches_to_test.
    observations_per_second = len(sample) / time_taken if time_taken > 0 else 0.0

    performance[batch_size] = observations_per_second
    print(f"Batch Size: {batch_size}, Observations per second: {observations_per_second}")

# pick the batch size with the highest measured throughput
optimal_batch_size = max(performance, key=performance.get)
print(f"Optimal Batch Size: {optimal_batch_size}")

In [None]:
# Process the whole dataset with the selected batch size
from tqdm.auto import tqdm


def _classify_in_batches(texts, classifier, batch_size, max_length=512):
    """Run ``classifier`` over ``texts`` in consecutive, non-overlapping batches.

    Args:
        texts: list of input strings.
        classifier: callable taking a list of strings (plus ``batch_size``,
            ``truncation`` and ``max_length`` keyword arguments) and returning
            one prediction per input, in order.
        batch_size: number of texts sent per call; the last batch may be shorter.
        max_length: token cap forwarded to the tokenizer for truncation.

    Returns:
        A list with one prediction per input text, in input order.
    """
    predictions = []
    # Step through the list by `batch_size`. The previous loop used the batch
    # counter itself as the slice start, so successive windows overlapped by
    # all but one element and most of the dataset was never classified. The
    # final short batch is handled by the same slice, which also avoids
    # `texts[-0:]` returning the entire list when there is no remainder.
    for start in tqdm(range(0, len(texts), batch_size), desc="batches"):
        batch = texts[start:start + batch_size]
        predictions.extend(
            classifier(batch, batch_size=batch_size, truncation=True, max_length=max_length)
        )
    return predictions


# The tqdm bar reports elapsed time, rate and ETA, replacing the manual
# per-batch timing printout.
results = _classify_in_batches(texts, classifier, optimal_batch_size)

# Downstream code assigns `results` to dataframe columns by position, so the
# two must line up exactly.
assert len(results) == len(texts), f"{len(results)} predictions for {len(texts)} texts"

In [None]:
# ajouter les résultats à la dataframe
df['label'] = [result['label'] for result in results]
df['score'] = [result['score'] for result in results]
# transformation des scores pour intégrer la polarité et la confiance du modèle
# vers -1 est très probablement négativ, vers +1 est très probablement positif
df['label'] = df.label.replace("NEGATIVE", -1)
df['label'] = df.label.replace("POSITIVE", 1)
df['sentiment_norm'] = df["label"] * df["score"]
df.head()