# Sentiment analysis 

## 1. Textblob-FR

Documentation: https://textblob.readthedocs.io/en/dev/

### Imports

In [41]:
from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer
#tokenisation pour afficher les phrases et pas le faire couper automatiquement au .
import nltk
from nltk.tokenize import sent_tokenize
import random
#affichage tabulaire
import pandas as pd
import re
import os

### Création d'une fonction `get_sentiment`

In [51]:
# Initialisation
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
# Télécharger les ressources NLTK pour la tokenisation
nltk.download('punkt')
#Fonctions utilitaires
# ---------------------------

def is_readable(sentence):
    """Renvoie True si la phrase est lisible (assez de lettres et peu de caractères spéciaux)."""
    letters_count = len(re.findall(r'[a-zA-Zàâäéèêëïîôöùûüÿç]', sentence))
    symbols_count = len(re.findall(r'[^a-zA-Z0-9\sàâäéèêëïîôöùûüÿç,.!?]', sentence))
    digits_count = len(re.findall(r'\d', sentence))
    
    if letters_count < 8:           # phrases trop courtes -> ignorer
        return False
    if symbols_count > 5:           # trop de symboles -> ignorer
        return False
    if digits_count > letters_count / 2:  # trop de chiffres par rapport aux lettres -> ignorer
        return False
    return True

def format_polarity(p):
    """Formate la polarité en pourcentage et ajoute + / -"""
    if p > 0:
        return f"{p*100:.0f}% positive"
    elif p < 0:
        return f"{abs(p*100):.0f}% negative"
    else:
        return "neutral"

def format_subjectivity(s):
    """Formate la subjectivité"""
    return f"{s*100:.0f}%" if s != 0 else "perfectly objective"

def get_sentiment(input_text):
    blob = tb(input_text)
    polarity, subjectivity = blob.sentiment
    polarity_perc = f"{100*abs(polarity):.0f}"
    subjectivity_perc = f"{100*subjectivity:.0f}"
    if polarity > 0:
        polarity_str = f"{polarity_perc}% positive"
    elif polarity < 0:
        polarity_str = f"{polarity_perc}% negative"
    else:
        polarity_str = "neutral"
    if subjectivity > 0:
       subjectivity_str = f"{subjectivity_perc}% subjective"
    else:
       subjectivity_str = "perfectly objective"

    
    print(f"This text is {polarity_str} and {subjectivity_str}.")

[nltk_data] Downloading package punkt to /Users/ilaria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Chemin courant
print("Répertoire courant :", os.getcwd())

# Contenu du dossier courant
print("Contenu :", os.listdir())

# Contenu du dossier parent
print("Contenu du dossier parent :", os.listdir(".."))


Répertoire courant : /Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/tps/tp2
Contenu : ['s4_sentiment.ipynb', 's3_ner.ipynb', 'README.md', 's1_keywords.ipynb', 's2_wordcloud.ipynb']
Contenu du dossier parent : ['tp4', 'tp3', 'tp2', 'tp1']


charger le texte complet de l'année choisie

In [53]:
text = open("/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/data/all.txt", encoding="utf-8").read()
# Séparer le texte en phrases avec nltk
sentences = sent_tokenize(text, language='french')
# Filtrer phrases trop courtes ou sans lettres
sentences = [s.strip() for s in sentences if len(s.strip()) > 5 and any(c.isalpha() for c in s)]
#fonction de filtrage pour ameliorer choix phrases
filtered_sentences = [s.strip() for s in sentences if len(s.strip()) > 5 and is_readable(s)]
#selectionner 10 phrases aleatoires
import random
sample_sentences = random.sample(filtered_sentences, 10)
# Créer la liste pour stocker les résultats
results = []
for sent in sample_sentences:
    blob = tb(sent)
    polarity, subjectivity = blob.sentiment
    results.append({
        "phrase": sent,
        "polarity": f"{100*abs(polarity):.0f}%" if polarity != 0 else "neutral",
        "subjectivity": f"{100*subjectivity:.0f}%" if subjectivity != 0 else "perfectly objective"
    })
df = pd.DataFrame(results)

print(df)


                                              phrase polarity  \
0               Gustave GILLEKEN — : » L. C„ Laekan.  neutral   
1                  ROYAL BOURSE: Epaves de la Rue 8.      20%   
2                    à la plaine militaire, à Evere.      13%   
3                                          l’an Apr.  neutral   
4                        vendeuses pr l’Exposltloon.  neutral   
5                                    Enf, non admis.       1%   
6               rittPlUO iLLICL .99, rue Royale, 99.      20%   
7                          Rossel 5791 D Achète pet.  neutral   
8  6,50 le flacon* Oêpflt général pour (a Belgiqu...  neutral   
9          So u ‘' e "_ I par les immeubles ou terr.  neutral   

          subjectivity  
0  perfectly objective  
1                  10%  
2                  10%  
3  perfectly objective  
4  perfectly objective  
5  perfectly objective  
6                  10%  
7  perfectly objective  
8  perfectly objective  
9  perfectly objective  


### Analyser le sentiment d'une phrase

In [46]:
get_sentiment("Ce journal est vraiment super intéressant.")

This text is 65% positive and 75% subjective.


In [47]:
get_sentiment("Cette phrase est négative et je ne suis pas content !")

This text is 41% negative and 60% subjective.


## 2. Utilisation de transformers

Documentation: https://github.com/TheophileBlard/french-sentiment-analysis-with-bert

**!!** Si le code ne tourne pas sur votre machine, vous pouvez le tester directement sur Google Colab en utilisant [ce lien](https://colab.research.google.com/github/TheophileBlard/french-sentiment-analysis-with-bert/blob/master/colab/french_sentiment_analysis_with_bert.ipynb) **!!**

Le modèle peut également être testé en ligne sur [HuggingFace](https://huggingface.co/tblard/tf-allocine)

### Installation des librairies et imports

In [1]:
%pip install tensorflow
%pip install sentencepiece
%pip install transformers
%pip install tf_keras
%pip install torch
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

Collecting tensorflow
  Using cached tensorflow-2.20.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.5 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Using cached protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl.metadata (593 bytes)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Using cached tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting keras>=3.10.0 (from tensorflow)
  Using cached keras-3.11.3-py3-none-any.whl.metadata (5.9 kB)
Collecting numpy>=1.26.0 (from tensorflow)
  Using cached numpy-2.3.4-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached tensorflow-2.20.0-cp311-cp311-macosx_12_0_arm64.whl (200.5 MB)
Using cached tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
Using cached keras-3.11.3-py3-none-any.whl (1.4 MB)
Using cached numpy-2.3.4-cp311-cp311-macosx_14_0_arm64.whl (5.4 MB)
Using cached protobuf-6.33.0-cp39-abi3-macosx_10_9_universal2.whl (427 kB)
Installing collected packages: protobuf, numpy, tensorboard, keras, tensorflow
[2K 

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/ilaria/De

NotFoundError: dlopen(/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/tensorflow-plugins/libmetal_plugin.dylib, 0x0006): Library not loaded: @rpath/_pywrap_tensorflow_internal.so
  Referenced from: <8B62586B-B082-3113-93AB-FD766A9960AE> /Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/tensorflow-plugins/libmetal_plugin.dylib
  Reason: tried: '/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/tensorflow-plugins/../_solib_darwin_arm64/_U@local_Uconfig_Utf_S_S_C_Upywrap_Utensorflow_Uinternal___Uexternal_Slocal_Uconfig_Utf/_pywrap_tensorflow_internal.so' (no such file), '/Users/ilaria/Desktop/STIC/traitement_auto_corpus/tac/.venv/lib/python3.11/site-packages/tensorflow-plugins/../_solib_darwin_arm64/_U@local_Uconfig_Utf_S_S_C_Upywrap_Utensorflow_Uinternal___Uexternal_Slocal_Uconfig_Utf/_pywrap_tensorflow_internal.so' (no such file)

### Chargement du modèle

In [6]:
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine", use_pt=True)
model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine")

sentiment_analyser = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

: 

### Analyser le sentiment d'une phrase

In [39]:
sentiment_analyser("Ce journal est vraiment super intéressant.")

In [None]:
sentiment_analyser("Cette phrase est négative et je ne suis pas content !")