# Partie 1 : Démarrage

In [1]:
import pandas as pd
import sys
from datetime import datetime
from Corpus import Document, Author, DocumentFactory, Corpus, SearchEngine
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

#  chemin du répertoire contenant les modules
sys.path.append('../V3')

# 1.1 Charger le jeu de données
df = pd.read_csv('discours_US.csv', delimiter='\t')

# 1.2 Vérifier la distribution des auteurs
print(df['speaker'].value_counts())

# 1.3 Créer un objet Corpus et ajouter les documents
corpus = Corpus("Discours US")

# Ajouter les documents au corpus
for index, row in df.iterrows():
    doc = DocumentFactory.create_document(row)
    corpus.add(doc)

# le nombre de documents dans le corpus
print(f"Nombre de documents dans le corpus : {corpus.ndoc}")

# 1.4 Tester les fonctions search et concorde
print("Résultats de la recherche pour le mot 'freedom':")
print(corpus.search("freedom"))

print("\nConcordances pour le mot 'freedom':")
print(corpus.concorde("freedom", context_size=30))

speaker
CLINTON    93
TRUMP      71
Name: count, dtype: int64
Nombre de documents dans le corpus : 164
Résultats de la recherche pour le mot 'freedom':
[Document(titre=Remarks at Texas Southern University in Houston, auteur=CLINTON, date=2015/06/04, texte=Wow! Thank you so very much. I...), Document(titre=Remarks in a Campaign \Kickoff\" Speech in New York City", auteur=CLINTON, date=2015/06/13, texte=Thank you! Oh, thank you all! ...), Document(titre=Remarks and a Question and Answer Session at the Brookings Institution in Washington, DC, auteur=CLINTON, date=2015/09/09, texte=Thank you so much, Strobe. Tha...), Document(titre=Remarks and a Question and Answer Session at the Council on Foreign Relations in New York City, auteur=CLINTON, date=2015/11/19, texte=Thank you. Thank you very much...), Document(titre=Interview with Jake tapper of CNN's \State of the Union\"", auteur=CLINTON, date=2016/01/17, texte=We're joined right off the bat...), Document(titre=Remarks at the AIPAC Policy 

# Partie 2 :  moteur de recherche

In [2]:
from Corpus import SearchEngine

# 2.1 Importer et initialiser la classe SearchEngine
search_engine = SearchEngine(corpus)

# 2.2 Tester la fonction search
print("Résultats de la recherche pour le mot 'freedom' avec le moteur de recherche:")
print(search_engine.search("freedom", top_n=10))

Résultats de la recherche pour le mot 'freedom' avec le moteur de recherche:
[Document(titre=Remarks at Texas Southern University in Houston, auteur=CLINTON, date=2015/06/04, texte=Wow! Thank you so very much. I...), Document(titre=Remarks in a Campaign \Kickoff\" Speech in New York City", auteur=CLINTON, date=2015/06/13, texte=Thank you! Oh, thank you all! ...), Document(titre=Remarks and a Question and Answer Session at the Brookings Institution in Washington, DC, auteur=CLINTON, date=2015/09/09, texte=Thank you so much, Strobe. Tha...), Document(titre=Remarks and a Question and Answer Session at the Council on Foreign Relations in New York City, auteur=CLINTON, date=2015/11/19, texte=Thank you. Thank you very much...), Document(titre=Interview with Jake tapper of CNN's \State of the Union\"", auteur=CLINTON, date=2016/01/17, texte=We're joined right off the bat...), Document(titre=Remarks at the AIPAC Policy Conference in Washington, DC, auteur=TRUMP, date=2016/03/21, texte=Good eve

# Partie 3 : Petite interface

In [4]:
import ipywidgets as widgets
from IPython.display import display

# Créer les widgets
label = widgets.Label(value="Moteur de recherche de discours US")
text_input = widgets.Text(description="Mots-clés:")
int_slider = widgets.IntSlider(description="Nombre de documents:", min=1, max=20, value=10)

# Afficher les widgets
vbox = widgets.VBox([label, text_input, int_slider])
display(vbox)

#  un objet Output pour afficher les résultats
output = widgets.Output()
display(output)

# bouton pour déclencher la recherche
button = widgets.Button(description="Rechercher")
display(button)

#  la fonction de recherche
def on_button_clicked(b):
    with output:
        clear_output()
        query = text_input.value
        top_n = int_slider.value
        results = search_engine.search(query, top_n=top_n)
        for result in results:
            print(result)

button.on_click(on_button_clicked)

# Tester l'interface
#  le nombre de documents à retourner

VBox(children=(Label(value='Moteur de recherche de discours US'), Text(value='', description='Mots-clés:'), In…

Output()

Button(description='Rechercher', style=ButtonStyle())

# Analyse comparative et temporelle

In [6]:
#  fonction pour comparer deux corpus
def compare_corpora(corpus1, corpus2, query):
    results1 = corpus1.search(query)
    results2 = corpus2.search(query)
    common = set([doc.texte for doc in results1]) & set([doc.texte for doc in results2])
    specific1 = set([doc.texte for doc in results1]) - common
    specific2 = set([doc.texte for doc in results2]) - common
    return common, specific1, specific2

# Exemple d'utilisation
corpus1 = Corpus('Corpus 1')
corpus2 = Corpus('Corpus 2')
# Ajout des documents aux corpus1 et corpus2
# 
common, specific1, specific2 = compare_corpora(corpus1, corpus2, 'president')
print("Common:", common)
print("Specifique a corpus1:", specific1)
print("Specifique a corpus2:", specific2)

#  fonction pour analyser l'évolution temporelle d'un mot
def temporal_analysis(corpus, query):
    results = corpus.search(query)
    date_counts = {}
    for doc in results:
        date = doc.date.split('-')[0]  #        if date not in date_counts:
            date_counts[date] = 0
        date_counts[date] += 1
    return date_counts

# Exemple d'utilisation
date_counts = temporal_analysis(corpus, 'president')
print(date_counts)

Common: set()
Specific to corpus1: set()
Specific to corpus2: set()
{'2015/04/12': 1, '2015/04/14': 1, '2015/04/20': 1, '2015/05/05': 1, '2015/05/18': 1, '2015/05/19': 1, '2015/05/20': 1, '2015/05/27': 1, '2015/06/04': 1, '2015/06/13': 1, '2015/06/15': 1, '2015/06/16': 1, '2015/07/12': 1, '2015/07/13': 1, '2015/09/09': 1, '2015/09/17': 1, '2015/09/20': 1, '2015/09/27': 1, '2015/10/04': 1, '2015/10/16': 1, '2015/10/18': 1, '2015/11/19': 1, '2015/12/01': 1, '2015/12/06': 1, '2015/12/15': 1, '2015/12/29': 1, '2016/01/05': 1, '2016/01/10': 1, '2016/01/13': 1, '2016/01/17': 2, '2016/01/24': 1, '2016/01/26': 1, '2016/01/27': 1, '2016/01/31': 1, '2016/02/01': 2, '2016/02/02': 2, '2016/02/03': 1, '2016/02/07': 2, '2016/02/08': 1, '2016/03/21': 2, '2016/06/02': 1, '2016/06/13': 1, '2016/06/22': 1, '2016/07/16': 1, '2016/07/21': 1, '2016/07/27': 1, '2016/07/28': 1, '2016/08/05': 1, '2016/08/08': 1, '2016/08/09': 1, '2016/08/15': 1, '2016/08/19': 1, '2016/08/24': 1, '2016/08/31': 1, '2016/09/01':