<a href="https://colab.research.google.com/github/MedAzzam/TALN-avec-spaCy/blob/main/NLN_avec_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prérequis

In [32]:
import spacy
print(spacy.__version__)

3.6.1


In [37]:
!python --version

Python 3.10.12


In [None]:
!pip install -U spacy
!python -m spacy download fr_core_news_sm

In [1]:
import spacy
nlp = spacy.load("fr_core_news_sm")

In [2]:
test = " Il y a eu une interruption de service chez Bouygues à Marseille."

## Tokenisation

In [3]:
def return_token(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner le texte de chaque token
 return [X.text for X in doc]

In [4]:
return_token(test)

[' ',
 'Il',
 'y',
 'a',
 'eu',
 'une',
 'interruption',
 'de',
 'service',
 'chez',
 'Bouygues',
 'à',
 'Marseille',
 '.']

## Stopwords

In [6]:
!pip install nltk



In [8]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('french'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
clean_words = []
for token in return_token(test):
 if token not in stopWords:
    clean_words.append(token)
clean_words

[' ',
 'Il',
 'a',
 'interruption',
 'service',
 'chez',
 'Bouygues',
 'Marseille',
 '.']

## Tokenisation par phrases

In [10]:
test = """Il y a eu une interruption de service chez Bouygues à Marseille. Plus
de 300 000 utilisateurs ont été perturbés."""

In [11]:
def return_token_sent(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner le texte de chaque phrase
 return [X.text for X in doc.sents]

In [12]:
return_token_sent(test)

['Il y a eu une interruption de service chez Bouygues à Marseille.',
 'Plus \nde 300 000 utilisateurs ont été perturbés.']

## Stemming

In [13]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer(language='french')

def return_stem(sentence):
 doc = nlp(sentence)
 return [stemmer.stem(X.text) for X in doc]

In [14]:
return_stem(test)

['il',
 'y',
 'a',
 'eu',
 'une',
 'interrupt',
 'de',
 'servic',
 'chez',
 'bouygu',
 'à',
 'marseil',
 '.',
 'plus',
 '\n',
 'de',
 '300',
 '000',
 'utilis',
 'ont',
 'été',
 'perturb',
 '.']

## Reconnaissance d’entités nommées (NER)

In [15]:
def return_NER(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner le texte et le label pour chaque entité
 return [(X.text, X.label_) for X in doc.ents]

In [16]:
return_NER(test)

[('Bouygues', 'ORG'), ('Marseille', 'LOC')]

In [17]:
from spacy import displacy
doc = nlp(test)
displacy.render(doc, style="ent", jupyter=True)

## L’étiquetage morpho-syntaxique

In [18]:
def return_POS(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner les étiquettes de chaque token
 return [(X, X.pos_) for X in doc]

In [19]:
return_POS(test)

[(Il, 'PRON'),
 (y, 'PRON'),
 (a, 'AUX'),
 (eu, 'VERB'),
 (une, 'DET'),
 (interruption, 'NOUN'),
 (de, 'ADP'),
 (service, 'NOUN'),
 (chez, 'ADP'),
 (Bouygues, 'PROPN'),
 (à, 'ADP'),
 (Marseille, 'PROPN'),
 (., 'PUNCT'),
 (Plus, 'ADV'),
 (, 'SPACE'),
 (de, 'ADP'),
 (300, 'DET'),
 (000, 'DET'),
 (utilisateurs, 'NOUN'),
 (ont, 'AUX'),
 (été, 'AUX'),
 (perturbés, 'VERB'),
 (., 'PUNCT')]

In [25]:
doc = nlp(test)
from spacy import displacy

# Customize options to display a shorter parse tree
options = {
    "compact": True,
    "color": "blue",
    "bg": "#ffffff",
    "font": "Arial",
    "arrow_stroke": 2,
}

# Render the visualization with customized options
displacy.render(doc, style="dep", options=options, jupyter=True)

In [26]:
# Render the visualization without customized options
displacy.render(doc, style="dep", jupyter=True)

In [21]:
import numpy as np
def return_word_embedding(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner le vecteur lié à chaque token
 return [(X.vector) for X in doc]

In [22]:
return_word_embedding(test)

[array([-8.544528  , -3.054603  ,  0.29717004,  4.697859  ,  0.2045033 ,
        -6.040577  , -0.98203474, -6.155433  ,  0.92760277,  0.7205181 ,
        -3.4390583 ,  0.10784715, -4.3482733 , -1.1173272 , -1.8116753 ,
         3.118063  ,  2.7947009 , -2.2724197 ,  5.1902814 ,  0.15629542,
        -2.089182  , 10.679247  , -1.6806105 ,  0.3618018 ,  0.50679743,
        -2.2405941 ,  3.019611  ,  2.3046381 ,  4.390358  ,  4.7051105 ,
        -0.9612024 ,  4.0113926 , -0.5443527 , -2.5624614 , -6.1206903 ,
         5.3175955 , -5.272794  , -1.7475646 ,  3.0929432 ,  1.6841921 ,
        -3.5409484 , -2.0674014 , -0.9297133 ,  3.0606818 , -3.1174684 ,
         2.0287638 , -0.28428054,  5.3082685 ,  2.7839072 , -0.5261068 ,
         0.7381632 , -1.0836462 , -0.54644513,  2.9282875 ,  0.3272584 ,
         3.6220713 , -1.8379855 ,  0.47647208, -2.2563806 , -3.882668  ,
        -2.8455076 , -0.5094634 ,  1.086437  , -0.28054783, -1.8402176 ,
         0.08878589, -0.14184561,  3.113274  , -0.1

## Similarité entre phrases

In [27]:
def return_mean_embedding(sentence):
 # Tokeniser la phrase
 doc = nlp(sentence)
 # Retourner la moyenne des vecteurs pour chaque phrase
 return np.mean([(X.vector) for X in doc], axis=0)


In [28]:
test_2 = "Le réseau sera bientot rétabli à Marseille"
test_3 = "La panne réseau affecte plusieurs utilisateurs de l'opérateur"
test_4 = "Il fait 18 degrés ici"

In [30]:
return_mean_embedding(test_2)

array([-1.5669225 ,  1.0682589 , -0.19747798,  0.34100944,  0.17686068,
        1.1454934 ,  2.52931   , -0.93218625, -1.2625616 , -0.16557315,
        2.250037  , -0.16679008, -0.3819623 , -0.24992445, -1.6628351 ,
       -0.50121635,  1.2094582 , -0.90622914, -0.47798675,  0.44271487,
        0.26808637,  0.24215706, -0.04730952, -0.8760818 , -1.4719534 ,
       -1.3800961 , -0.11777245, -0.29856786,  1.0535063 ,  0.5944639 ,
       -2.2230773 , -0.11092787, -1.1340705 , -0.43835467,  3.4444382 ,
        0.39571667, -2.959204  ,  0.12932444, -0.22611086,  1.8903745 ,
       -1.7056124 , -0.54449207, -0.6181544 , -0.5221969 ,  1.5741627 ,
        1.3224299 , -1.7373139 , -0.71722066, -0.95829594,  1.3836578 ,
        1.1248062 , -0.3227238 , -1.7324276 , -1.1055199 ,  1.0209708 ,
        0.7332802 , -2.4553065 ,  0.21000133,  2.602818  , -0.02996942,
       -0.09842033,  0.03153365, -2.1569374 , -1.1710188 ,  1.7871361 ,
        2.273535  ,  0.9344458 ,  0.44925857,  1.1579664 , -0.62

In [31]:
return_mean_embedding(test_3)

array([-1.0035172e+00,  1.9396755e-01,  1.2461503e+00,  9.7936726e-01,
       -2.6032412e-01,  1.7242994e+00,  1.4579765e+00,  2.5656741e+00,
       -1.3673683e+00,  3.2544401e-03,  5.5628532e-01, -9.5343041e-01,
       -7.0897037e-01,  1.4908579e-01,  3.0535424e-01,  1.2345102e+00,
        2.0424662e+00, -7.9854906e-01, -7.5843674e-01,  1.5564165e+00,
       -4.5658058e-01,  1.8412259e-01,  1.2559664e+00, -4.1943693e-01,
        2.9945216e-01, -2.6578643e+00, -3.7749130e-02,  2.5930738e-01,
       -2.4696462e-01, -5.3657681e-01, -2.1946816e-01, -1.4659761e+00,
       -6.0993314e-01, -3.4474287e+00,  2.4473505e+00,  3.3329386e-01,
       -3.0355213e+00, -1.2638030e+00,  1.1228477e+00,  1.9607812e-01,
       -2.8478465e+00, -1.2139696e+00,  7.7146930e-01,  2.5737530e-01,
        1.5074213e+00,  3.3038998e-01, -3.3070159e+00, -7.4349827e-01,
       -1.1816099e+00,  2.1928358e-01, -2.1213940e-01,  1.1021017e+00,
        3.8161623e-01,  5.5089998e-01, -3.7666410e-01, -7.9193962e-01,
      