In [1]:
#pip install sentencepiece
#pip install --upgrade torch
#pip install --upgrade "optree>=0.13.0"
#pip install transformers torch


In [1]:
import transformers
print(transformers.__version__)  # Must be ≥ 4.0

4.48.3


In [2]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import MarianMTModel, MarianTokenizer

import sentencepiece
import torch
import pandas as pd
import re

In [3]:

# Load the M2M100 (418M) checkpoint and the tokenizer that goes with it.
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name)

# Encode one sample sentence as PyTorch tensors.
tweet = "I love artificial intelligence!"
tokens = tokenizer(tweet, return_tensors="pt")

# Translate to French: the target language is selected by forcing its
# language id as the first generated token.
translated_tokens = model.generate(
    forced_bos_token_id=tokenizer.get_lang_id("fr"),
    **tokens,
)
translated_text = tokenizer.decode(
    translated_tokens[0],
    skip_special_tokens=True,
)

print(translated_text)


J’adore l’intelligence artificielle !


In [4]:
# Column labels are supplied explicitly through `names`, so the first line of
# the file is read as data rather than as a header.
column_names = ['class', 'id', 'date', 'query', 'name', 'tweet']
df = pd.read_csv(
    "dataSet140.csv",
    names=column_names,
    sep=',',
    encoding='ISO-8859-1',
    nrows=1000,  # only the first 1000 rows are loaded
)
df.head()

Unnamed: 0,class,id,date,query,name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
def cleanText(text):
    """Remove Twitter @mentions and http(s) links from a tweet.

    Args:
        text: The raw tweet as a string. ``None`` and NaN (the value pandas
            uses for a missing cell) are treated as an empty tweet instead of
            raising ``TypeError`` inside ``re.sub``.

    Returns:
        The text with every ``@word`` mention and every token starting with
        ``http`` removed, and with leading/trailing whitespace trimmed.
        Whitespace inside the text is left untouched.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    clean_text = re.sub(r'@\w+|http\S+', '', text)
    return clean_text.strip()

In [7]:
# Overwrite the tweet column with the output of cleanText, applied to every
# row; the raw text is no longer available in `df` after this cell.
df.loc[:, "tweet"] = df["tweet"].map(cleanText)
df.head()


Unnamed: 0,class,id,date,query,name,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"- Awww, that's a bummer. You shoulda got Davi..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,I dived many times for the ball. Managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"no, it's not behaving at all. i'm mad. why am ..."


In [8]:
# MarianMT checkpoint used for the English -> French translation below.
model_name = "Helsinki-NLP/opus-mt-en-fr"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

def translate_text(text):
    """Translate one string with the notebook-level `tokenizer` and `model`.

    Args:
        text: The string to translate. The tokenizer truncates inputs that
            are longer than 256 tokens.

    Returns:
        The decoded translation with special tokens removed.
    """
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    # Place the inputs on the device the model currently lives on. The shared
    # `model` is moved to CUDA by batch_translate when a GPU is available;
    # feeding it CPU tensors afterwards fails with a device-mismatch error.
    tokens = tokens.to(model.device)
    translated_tokens = model.generate(**tokens)
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

# Translate row by row: every tweet is cast to str and sent through
# translate_text on its own.
tweets_as_text = df["tweet"].astype(str)
df["tweet_fr"] = tweets_as_text.map(translate_text)

# Show the source text next to its translation for the first rows.
print(df[["tweet", "tweet_fr"]].head())

# Persist the frame, without the index column, as UTF-8.
df.to_csv("sentiment140_fr_2000.csv", index=False, encoding="utf-8")



                                               tweet  \
0  - Awww, that's a bummer.  You shoulda got Davi...   
1  is upset that he can't update his Facebook by ...   
2  I dived many times for the ball. Managed to sa...   
3     my whole body feels itchy and like its on fire   
4  no, it's not behaving at all. i'm mad. why am ...   

                                            tweet_fr  
0  Vous devriez avoir David Carr du troisième jou...  
1  est contrarié qu'il ne puisse pas mettre à jou...  
2  J'ai plongé plusieurs fois pour la balle. A ré...  
3  Tout mon corps se sent démangeant et comme sur...  
4  Non, ça ne se passe pas du tout. Je suis en co...  


In [9]:
# Load the same MarianMT checkpoint again so this cell does not rely on the
# objects created by an earlier cell.
model_name = "Helsinki-NLP/opus-mt-en-fr"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

def batch_translate(texts, batch_size=32):
    """Translate a collection of strings in mini-batches.

    Uses the notebook-level `tokenizer` and `model`. When CUDA is available
    the model is moved to the GPU once per call and stays there afterwards.

    Args:
        texts: Iterable of strings to translate (list, tuple, pandas Series,
            generator, ...). A single string is treated as one text. The
            tokenizer truncates every text to 256 tokens.
        batch_size: Number of texts passed to each `model.generate` call.

    Returns:
        A list holding one translated string per input text, in input order.
        An empty input yields an empty list without touching the model.
    """
    if isinstance(texts, str):
        # Iterating a bare string would split it into characters.
        texts = [texts]
    # Materialise once so len() and positional slicing work for any iterable
    # (a pandas Series slices by label-aware rules, a generator has no len()).
    texts = list(texts)
    if not texts:
        return []

    # Moving the model is loop-invariant, so do it once up front instead of
    # on every batch.
    if torch.cuda.is_available():
        model.to("cuda")
    # Follow the model's actual device rather than assuming CUDA, so inputs
    # and weights can never end up on different devices.
    device = model.device

    translations = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        tokens = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256)
        tokens = tokens.to(device)

        translated_tokens = model.generate(**tokens)
        translations.extend(
            tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in translated_tokens
        )

    return translations



In [10]:
# Translate the whole tweet column in batches and store the result next to it.
tweets = df["tweet"].tolist()
df["tweet_fr"] = batch_translate(tweets)

# Persist the frame (this overwrites any existing file of the same name),
# then show the source text beside its translation for the first rows.
df.to_csv("sentiment140_fr_2000.csv", index=False, encoding="utf-8")
print(df[["tweet", "tweet_fr"]].head())

                                               tweet  \
0  - Awww, that's a bummer.  You shoulda got Davi...   
1  is upset that he can't update his Facebook by ...   
2  I dived many times for the ball. Managed to sa...   
3     my whole body feels itchy and like its on fire   
4  no, it's not behaving at all. i'm mad. why am ...   

                                            tweet_fr  
0  Vous devriez avoir David Carr du troisième jou...  
1  est contrarié qu'il ne puisse pas mettre à jou...  
2  J'ai plongé plusieurs fois pour la balle. A ré...  
3  Tout mon corps se sent démangeant et comme sur...  
4  Non, ça ne se passe pas du tout. Je suis en co...  
