In [6]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Doc

# Charger le modèle de langue spaCy
nlp = spacy.load("en_core_web_sm")

In [8]:
with open("./td2/reaganomics.txt", "r", encoding="latin-1") as file:
    text = file.read()

In [9]:
# 2. Tokeniser les mots
doc = nlp(text)

In [11]:
# 3. Éliminer les stop words
filtered_tokens = [token.text for token in doc if not token.is_stop]
"""
technique that involves the identification and classification of named entities
(such as persons, organizations, locations, medical codes, time expressions, quantities,
monetary values, percentages, etc.) in text data. The goal of NER is to extract structured
information from unstructured text.
"""

'\ntechnique that involves the identification and classification of named entities\n(such as persons, organizations, locations, medical codes, time expressions, quantities,\nmonetary values, percentages, etc.) in text data. The goal of NER is to extract structured\ninformation from unstructured text.\n'

In [14]:
# 5. Appliquer le Matcher sur un paragraphe

matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "supply"}, {"LOWER": "side"}]
matcher.add("SupplySide", [pattern])

paragraph = "Reaganomics was a supply-side economic policy."
paragraph_doc = nlp(paragraph)

matches = matcher(paragraph_doc)
matched_phrases = [paragraph_doc[start:end].text for _, start, end in matches]

In [16]:
# Afficher les résultats
#print("1. Texte importé:")
#print(text)

In [21]:
# 2. Tokeniser les mots
doc_tokens = [(i, token.text) for i, token in enumerate(doc)]
formatted_tokens = '\n'.join([f"{index + 1}. {token}" for index, token in doc_tokens[:10]])

print("\n2. Premiers 10 mots tokenisés avec index:")
print(formatted_tokens)



2. Premiers 10 mots tokenisés avec index:
1. REAGANOMICS
2. 

3. https://en.wikipedia.org/wiki/Reaganomics
4. 


5. Reaganomics
6. (
7. a
8. portmanteau
9. of
10. [


In [24]:
# 3. Mots sans stop words
filtered_tokens_with_index = [(i, token) for i, token in enumerate(filtered_tokens, start=1)]
formatted_filtered_tokens = '\n'.join([f"{index}. {token}" for index, token in filtered_tokens_with_index[:10]])

print("\n3. Mots sans stop words avec index:")
print(formatted_filtered_tokens)



3. Mots sans stop words avec index:
1. REAGANOMICS
2. 

3. https://en.wikipedia.org/wiki/Reaganomics
4. 


5. Reaganomics
6. (
7. portmanteau
8. [
9. Ronald
10. ]


In [28]:
# 4. Entités nommées (NER)
ner_entities_with_index = [(i, ent.text, ent.label_) for i, ent in enumerate(doc.ents, start=1)]
formatted_ner_entities = '\n'.join([f"{index}. Texte: {text}, Label: {label}" for index, text, label in ner_entities_with_index[:10]])

# Print the named entities with indices
print("\n4. Entités nommées (NER) avec index:")
print(formatted_ner_entities)



4. Entités nommées (NER) avec index:
1. Texte: REAGANOMICS, Label: ORG
2. Texte: Ronald] Reagan, Label: PERSON
3. Texte: Paul, Label: PERSON
4. Texte: U.S., Label: GPE
5. Texte: Ronald Reagan, Label: PERSON
6. Texte: the 1980s, Label: DATE
7. Texte: four, Label: CARDINAL
8. Texte: Reagan, Label: PERSON
9. Texte: Reaganomics, Label: NORP
10. Texte: the decades, Label: DATE


In [26]:
print("\n5. Phrases correspondant au pattern 'Supply Side':")
print(matched_phrases)


5. Phrases correspondant au pattern 'Supply Side':
[]


**---------------------------------------------------------**\
**---------------------** **TD3** **------------------------------**\
**---------------------------------------------------------**


In [1]:
import re
import spacy
from spacy.matcher import Matcher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords

In [5]:
#!python -m spacy download fr_core_news_sm

In [6]:
# Load the spaCy model
nlp = spacy.load("fr_core_news_sm")

In [8]:
# Load the text from "TextGPT.txt"
with open("./td3/TextGPT.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [9]:
# Question 1: Find all occurrences of the word "gpt" using the re library
gpt_occurrences = re.findall(r'\bgpt\b', text, flags=re.IGNORECASE)
print("\nQuestion 1:")
print("Occurrences of 'gpt':", gpt_occurrences)


Question 1:
Occurrences of 'gpt': ['GPT', 'GPT']


In [10]:
# Question 2: Split the text into paragraphs and count the number of sentences in each paragraph
paragraphs = re.split(r'\n\n', text)
num_sentences_per_paragraph = [len(list(nlp(paragraph).sents)) for paragraph in paragraphs]
print("\nQuestion 2:")
print("Number of sentences in each paragraph:", num_sentences_per_paragraph)


Question 2:
Number of sentences in each paragraph: [9, 16, 14, 8, 34, 36]


In [11]:
# Question 3: Tokenize the text without using split and count the number of words in each paragraph
paragraph_tokens = [list(nlp(paragraph)) for paragraph in paragraphs]
num_words_per_paragraph = [len(tokens) for tokens in paragraph_tokens]
print("\nQuestion 3:")
print("Number of words in each paragraph:", num_words_per_paragraph)


Question 3:
Number of words in each paragraph: [85, 150, 170, 112, 260, 326]


In [21]:
# Question 4: Eliminate stop words
stop_words = set(stopwords.words("french"))
filtered_paragraphs = [
    [token.text for token in tokens if token.text.lower() not in stop_words]
    for tokens in paragraph_tokens
]
print("\nQuestion 4:")
for _ in range(len(filtered_paragraphs)):
    print("Paragraphs without stop words:", filtered_paragraphs[_])


Question 4:
Paragraphs without stop words: ['ChatGPT', ',', 'which', 'stands', 'for', 'Chat', 'Generative', 'Pre', '-', 'trained', 'Transformer', ',', 'is', 'a', 'chatbot', 'developed', 'by', 'OpenAI', '.', 'ChatGPT', 'is', 'built', 'top', 'of', "OpenAI'", 'GPT-3.5', 'family', 'of', 'large', 'language', 'models', ',', 'and', 'is', 'fine', '-', 'tuned', 'with', 'both', 'supervised', 'and', 'reinforcement', 'learning', 'techniques', '.', 'ChatGPT', 'was', 'launched', 'a', 'prototype', 'in', 'November', '2022', ',', 'and', 'quickly', 'garnered', 'attention', 'for', 'its', 'detailed', 'responses', 'and', 'articulate', 'answers', 'across', 'many', 'domains', 'of', 'knowledge', '.', 'Its', 'uneven', 'factual', 'accuracy', 'was', 'identified', 'a', 'significant', 'drawback.[1', ']']
Paragraphs without stop words: ['ChatGPT', 'was', 'fine', '-', 'tuned', 'top', 'of', 'GPT-3.5', 'using', 'supervised', 'learning', 'well', 'reinforcement', 'learning.[2', ']', 'Both', 'approaches', 'used', 'human

In [22]:
# Question 5: Apply POS tagging to the first paragraph
pos_tags_first_paragraph = [token.pos_ for token in paragraph_tokens[0]]
print("\nQuestion 5:")
print("POS tags for the first paragraph:", pos_tags_first_paragraph)


Question 5:
POS tags for the first paragraph: ['VERB', 'PUNCT', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'ADJ', 'PROPN', 'NOUN', 'NOUN', 'PROPN', 'PUNCT', 'ADP', 'AUX', 'VERB', 'PROPN', 'ADP', 'ADV', 'PUNCT', 'VERB', 'ADP', 'NOUN', 'PRON', 'AUX', 'PRON', 'VERB', 'AUX', 'PROPN', 'PROPN', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'PUNCT', 'X', 'VERB', 'NOUN', 'PROPN', 'ADJ', 'PROPN', 'PROPN', 'X', 'X', 'VERB', 'ADJ', 'ADJ', 'PUNCT', 'NOUN', 'X', 'PROPN', 'PRON', 'AUX', 'VERB', 'X', 'PROPN', 'NUM', 'PUNCT', 'X', 'X', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'X', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'ADJ', 'ADP', 'NOUN', 'PUNCT', 'ADP', 'PROPN', 'NOUN', 'ADJ', 'AUX', 'ADJ', 'PRON', 'VERB', 'VERB', 'NOUN', 'PUNCT']


In [23]:
# Question 6: Apply Bag of Words (BOW) to each paragraph
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([' '.join(tokens) for tokens in filtered_paragraphs])
print("\nQuestion 6:")
print("Bag of Words matrix:")
print(bow_matrix.toarray())
"""

"""


Question 6:
Bag of Words matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 2 1 1]
 [0 1 1 ... 2 0 0]]


Applying Bag of Words (BOW) to each paragraph involves representing the text data in a numerical format that machine learning algorithms can understand. BOW is a common technique used in natural language processing and text analysis.

Here's a step-by-step explanation of how BOW works:

1. **Tokenization:**
   Break each paragraph into individual words or tokens. This process involves separating the text into meaningful units, such as words.

2. **Vocabulary Creation:**
   Create a vocabulary, which is a unique set of all the words in the entire collection of paragraphs. Each word in the vocabulary is assigned a unique index.

3. **Vectorization:**
   For each paragraph, create a vector (array) of numerical values based on the words in the paragraph and their frequency in the vocabulary. The length of the vector is equal to the size of the vocabulary.

   - If a word from the vocabulary is present in the paragraph, the corresponding element in the vector is set to the frequency of that word in the paragraph.
   - If a word is not present, the corresponding element is set to zero.

   This process results in a numerical representation of each paragraph in the form of a vector. Each element in the vector represents the frequency of a particular word from the vocabulary in the corresponding paragraph.

Here's a simplified example:

Consider the vocabulary: ["apple", "orange", "banana"]

Paragraph 1: "I like apples and bananas."
Vector for Paragraph 1: [1, 0, 1]

Paragraph 2: "I like oranges."
Vector for Paragraph 2: [0, 1, 0]

This way, you've converted text data into a format that machine learning algorithms can use for tasks like classification or clustering. The BOW representation doesn't capture word order or semantics but focuses on the occurrence of words in a document.

In [24]:
# Question 7: Calculate TF-IDF for each word
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in filtered_paragraphs])
print("\nQuestion 7:")
print("TF-IDF matrix:")
print(tfidf_matrix.toarray())


Question 7:
TF-IDF matrix:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.11872813 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.10092691 0.06153972 0.06153972]
 [0.         0.04684914 0.04684914 ... 0.07683394 0.         0.        ]]


In [25]:
# Question 8: Load GloVe embeddings into a dictionary
glove_embeddings = {}
with open("D:/NLP/td3/glove.6B.50d (1).txt", "r", encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Get the first 10 and next 10 words of the first paragraph
first_10_words = filtered_paragraphs[0][:10]
next_10_words = filtered_paragraphs[0][10:20]

# Get the indices of the words in the TF-IDF matrix
indices_first_10 = [tfidf_vectorizer.vocabulary_[word] for word in first_10_words if word in tfidf_vectorizer.vocabulary_]
indices_next_10 = [tfidf_vectorizer.vocabulary_[word] for word in next_10_words if word in tfidf_vectorizer.vocabulary_]

# Get TF-IDF scores for the first 10 and next 10 words
tfidf_scores_first_10 = tfidf_matrix[0, indices_first_10].toarray()[0]
tfidf_scores_next_10 = tfidf_matrix[0, indices_next_10].toarray()[0]

# Get GloVe embeddings for the first 10 and next 10 words from TF-IDF matrix
glove_embeddings_first_10 = np.array([glove_embeddings.get(word, np.zeros(50)) for word in tfidf_vectorizer.get_feature_names_out()[indices_first_10]])
glove_embeddings_next_10 = np.array([glove_embeddings.get(word, np.zeros(50)) for word in tfidf_vectorizer.get_feature_names_out()[indices_next_10]])

# Calculate TF-IDF weighted GloVe vectors for each word
weighted_vectors_first_10 = glove_embeddings_first_10 * tfidf_scores_first_10[:, np.newaxis]
weighted_vectors_next_10 = glove_embeddings_next_10 * tfidf_scores_next_10[:, np.newaxis]

# Calculate cosine similarity between average vectors
avg_vector_first_10 = np.mean(weighted_vectors_first_10, axis=0)
avg_vector_next_10 = np.mean(weighted_vectors_next_10, axis=0)

similarity = cosine_similarity(avg_vector_first_10.reshape(1, -1), avg_vector_next_10.reshape(1, -1))

print("\nQuestion 9:")
print("Cosine similarity between first 10 and next 10 words based on TF-IDF-weighted GloVe vectors:", similarity[0][0])



Question 9:
Cosine similarity between first 10 and next 10 words based on TF-IDF-weighted GloVe vectors: 0.8737339172893369


**EXAM 0**

In [32]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

import spacy

In [34]:
# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Question 1: Trouver le nombre total de mots avec NLTK
with open('./td2/reaganomics.txt', 'r', encoding='latin-1') as file:
    text = file.read()

tokens = word_tokenize(text)
num_words = len(tokens)
print("Nombre total de mots:", num_words)


Nombre total de mots: 6181


In [36]:
# Question 2: Identifier et afficher les 10 premières entités nommées avec spaCy
doc = nlp(text)
for i, ent in enumerate(doc.ents[:10]):
    print(f"Entité nommée {i+1}: {ent.text}, Type: {ent.label_}")


Entité nommée 1: REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics

Reaganomics, Type: ORG
Entité nommée 2: Ronald] Reagan, Type: PERSON
Entité nommée 3: Paul Harvey)[1, Type: PERSON
Entité nommée 4: U.S., Type: GPE
Entité nommée 5: Ronald Reagan, Type: PERSON
Entité nommée 6: the 1980s, Type: DATE
Entité nommée 7: four, Type: CARDINAL
Entité nommée 8: Reagan, Type: PERSON
Entité nommée 9: Reaganomics, Type: PERSON
Entité nommée 10: the decades, Type: DATE


In [39]:
# Question 3: Appliquer la lemmatisation avec spaCy pour les 10 premiers tokens avec index
lemmatized_tokens_with_index = [(i, token.lemma_) for i, token in enumerate(doc[:10])]
for index, (i, token_lemma) in enumerate(lemmatized_tokens_with_index):
    print(f"{index + 1}. Token {i}: {token_lemma}")


1. Token 0: REAGANOMICS
2. Token 1: 

3. Token 2: https://en.wikipedia.org/wiki/Reaganomics
4. Token 3: 


5. Token 4: Reaganomics
6. Token 5: (
7. Token 6: a
8. Token 7: portmanteau
9. Token 8: of
10. Token 9: [


In [40]:
# Question 4: Afficher les 10 premiers mots et vecteurs avec spaCy
for i, token in enumerate(doc[:10]):
    print(f"Mot {i + 1}: {token.text}, Vecteur: {token.vector}")

Mot 1: REAGANOMICS, Vecteur: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Mot 2: 
, Vecteur: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [49]:
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)
print("Score de sentiment:", sentiment_scores)


Score de sentiment: {'neg': 0.07, 'neu': 0.839, 'pos': 0.091, 'compound': 0.9982}


In [50]:
# Question 6: Analyse de similarité de document avec spaCy
paragraphs = [sent.text for sent in doc.sents]
similarity_score = nlp(paragraphs[0]).similarity(nlp(paragraphs[1]))
print("Score de similarité entre les deux premiers paragraphes:", similarity_score)

Score de similarité entre les deux premiers paragraphes: 0.7662508234165553
