In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import pipeline

nltk.download('vader_lexicon')
nltk.download('stopwords')


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Load the dataset
df = pd.read_csv("tweets-data.csv")

# Take a sample of 500 rows
df_sample = df.sample(n=500, random_state=42).copy()


In [10]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()                      # Lowercase
    text = re.sub(r"http\S+|www.\S+", "", text)   # Remove URLs
    text = re.sub(r"[^a-z\s]", "", text)          # Remove punctuation/numbers
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)                       # Recreate sentence from cleaned tokens


In [13]:
df_sample["cleaned_text"] = df_sample["Tweets"].apply(clean_text)

In [16]:
vader_analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    scores = vader_analyzer.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        label = 'positive'
    elif compound <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return label, compound

df_sample[["vader_label", "vader_score"]] = df_sample["cleaned_text"].apply(
    lambda x: pd.Series(vader_sentiment(x))
)

In [17]:
# Load a pre-trained transformer sentiment pipeline
transformer_pipeline = pipeline("sentiment-analysis")

def transformer_sentiment(text):
    # Tronquer à 512 caractères (en pratique, la limite réelle est en tokens, mais ça suffit ici)
    if len(text) > 512:
        text = text[:512]
    result = transformer_pipeline(text)[0]
    return result['label'].lower(), result['score']

df_sample[["transformer_label", "transformer_score"]] = df_sample["cleaned_text"].apply(
    lambda x: pd.Series(transformer_sentiment(x))
)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [18]:
print(df_sample[["Tweets", "cleaned_text",
                 "vader_label", "vader_score",
                 "transformer_label", "transformer_score"]].head())

                                                 Tweets  \
2899  Le #DessinDePresse de Sanaga : ls sont morts c...   
594   #Russia #Wagner #RussiaCivilWar https://t.co/P...   
2870  Exclusive content -https://t.co/oEiSIIB2Z1\n.\...   
52    Auch heute geht die politische Nachricht des T...   
1391  @crazyclipsonly Same type that would take a ho...   

                                           cleaned_text vader_label  \
2899  le dessindepresse de sanaga ls sont morts comm...    positive   
594                        russia wagner russiacivilwar     neutral   
2870  exclusive content cosplay japan titan titanics...    negative   
52    auch heute geht die politische nachricht des t...    negative   
1391  crazyclipsonly type would take homemade playst...     neutral   

      vader_score transformer_label  transformer_score  
2899       0.4767          negative           0.981537  
594        0.0000          negative           0.962062  
2870      -0.4404          negative           0