# Partie 1 : Démarrage

In [1]:
import pandas as pd
from Corpus import Corpus, Document
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load the tab-separated speeches file
df = pd.read_csv('c:/Users/bilar/Desktop/Projet-Python/V3/discours_US.csv', delimiter='\t')
print(df.head())

# Number of rows per speaker
author_distribution = df['speaker'].value_counts()
print(author_distribution)

# Create the Corpus object
corpus = Corpus('Discours US')

# Add one Document per sentence (speeches are split on '. ').
# Rows without text are dropped first: pandas loads empty cells as NaN
# (a float), which has no .split() and would abort the whole loop.
speeches = df.dropna(subset=['text'])[['text', 'speaker', 'date']]
for text, author, date in speeches.itertuples(index=False, name=None):
    for sentence in text.split('. '):
        # Splitting can leave blank fragments (e.g. text ending in '. ');
        # do not store them as empty documents.
        if not sentence.strip():
            continue
        corpus.add(Document(sentence, author, date))

# 1.4 Try out the search and concorde functions
print([doc.text for doc in corpus.search('when')])
print(corpus.concorde('when', context_size=30))

# Partie 2 : Utilisation de votre moteur de recherche

In [2]:
from SearchEngine import SearchEngine

# 2.1 Build the search engine on top of the corpus
search_engine = SearchEngine(corpus)

# 2.2 Run a sample query
results = search_engine.search('freedom', top_n=10)

# If the result exposes `toarray` (e.g. a sparse matrix), densify it before printing
results = results.toarray() if hasattr(results, 'toarray') else results

print(results)

# Partie 3 : Petite interface

In [3]:
import ipywidgets as widgets
from IPython.display import display

# Build the input widgets: a title, a keyword box and a result-count slider
label = widgets.Label(value="Moteur de recherche")
text_input = widgets.Text(description="Mots clés:")
int_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=10,
    description="Nombre de documents:",
)

# Show the widgets individually
display(label, text_input, int_slider)

# Show the same widgets stacked in a vertical container
vbox = widgets.VBox(children=[label, text_input, int_slider])
display(vbox)

# Output area that will receive the search results
output = widgets.Output()
display(output)

# Button that triggers the search
button = widgets.Button(description="Rechercher")
display(button)

# Click handler for the search button
def on_button_click(b):
    """Run the search for the current widget values and print the hits.

    Clears the output area, queries ``search_engine`` with the keyword box
    content and the slider value as ``top_n``, then prints the ``text`` of
    each result inside the output area.

    Args:
        b: The object passed by the button on click (unused).
    """
    with output:
        output.clear_output()
        hits = search_engine.search(text_input.value, top_n=int_slider.value)
        for hit in hits:
            print(hit.text)

# Wire the click handler to the button
button.on_click(on_button_click)

# Assemble and show the complete interface
ui = widgets.VBox(children=[label, text_input, int_slider, button, output])
display(ui)

# Extra filter inputs: author and date
author_filter = widgets.Text(description="Auteur:")
date_filter = widgets.Text(description="Date:")

# Click handler that also applies the author and date filters
def on_button_click_with_filters(b):
    """Run a filtered search for the current widget values and print the hits.

    Clears the output area, calls ``search_engine.search_with_filters`` with
    the keyword box content, the author and date filter values and the slider
    value as ``top_n``, then prints the ``text`` of each result.

    Args:
        b: The object passed by the button on click (unused).
    """
    with output:
        output.clear_output()
        hits = search_engine.search_with_filters(
            text_input.value,
            author=author_filter.value,
            date=date_filter.value,
            top_n=int_slider.value,
        )
        for hit in hits:
            print(hit.text)

# Replace the unfiltered handler with the filtered one. on_click() adds
# callbacks cumulatively, so without the removal every click would run the
# unfiltered search first and then the filtered one.
button.on_click(on_button_click, remove=True)
button.on_click(on_button_click_with_filters)

# Show the interface including the filter inputs
ui_with_filters = widgets.VBox([label, text_input, int_slider, author_filter, date_filter, button, output])
display(ui_with_filters)

# Partie 4 : Analyse comparative et temporelle

In [4]:
# Compare the search results of two corpora
def compare_corpora(corpus1, corpus2, query):
    """Compare the texts matched by ``query`` in two corpora.

    Each corpus is searched once and its results are traversed once, so the
    function also works when ``search`` returns a one-shot iterator.

    Args:
        corpus1: Object with a ``search(query)`` method yielding items that
            have a ``text`` attribute.
        corpus2: Same kind of object as ``corpus1``.
        query: Search query passed unchanged to both ``search`` calls.

    Returns:
        A tuple ``(common, specific1, specific2)`` of sets of texts: those
        found in both corpora, only in ``corpus1``, and only in ``corpus2``.
    """
    texts1 = {doc.text for doc in corpus1.search(query)}
    texts2 = {doc.text for doc in corpus2.search(query)}
    common = texts1 & texts2
    return common, texts1 - common, texts2 - common

# Example usage
corpus1 = Corpus('Corpus 1')
corpus2 = Corpus('Corpus 2')
# Add documents to corpus1 and corpus2
# ...
common, specific1, specific2 = compare_corpora(corpus1, corpus2, 'president')
for heading, texts in (
    ("Common:", common),
    ("Specific to corpus1:", specific1),
    ("Specific to corpus2:", specific2),
):
    print(heading, texts)

# Track how often a word is matched over time
def temporal_analysis(corpus, query):
    """Count the search hits for ``query`` per date prefix.

    The key for each hit is the part of its ``date`` string before the first
    ``'-'`` (the whole string if it contains no ``'-'``).

    Args:
        corpus: Object with a ``search(query)`` method yielding items that
            have a string ``date`` attribute.
        query: Search query passed unchanged to ``corpus.search``.

    Returns:
        A dict mapping each date prefix to its number of hits, in order of
        first appearance.
    """
    per_year = {}
    for hit in corpus.search(query):
        year = hit.date.split('-', 1)[0]  # leading component, e.g. the year
        per_year[year] = per_year.get(year, 0) + 1
    return per_year

# Example usage: per-date hit counts for 'president' in the speeches corpus
date_counts = temporal_analysis(corpus=corpus, query='president')
print(date_counts)