# Exploration comparative des corpus
Ce notebook présente un outil permettant d'explorer deux corpus de documents avec une approche comparative et temporelle.

In [1]:
from collections import Counter
import math
from datetime import datetime

class Document:
    """A single text with its date, its speaker and a token frequency table.

    Attributes:
        text: The raw text of the document.
        date: The date as supplied by the caller (stored untouched).
        speaker: The speaker as supplied by the caller (stored untouched).
        word_count: ``Counter`` mapping each lower-cased, whitespace-separated
            token of ``text`` to its number of occurrences.
    """

    def __init__(self, text, date, speaker):
        self.text, self.date, self.speaker = text, date, speaker
        # Computed once at construction; not refreshed if ``text`` changes.
        self.word_count = self._get_word_count()

    def _get_word_count(self):
        """Return token frequencies of the lower-cased text.

        Tokens are split on whitespace only, so punctuation stays attached
        to the neighbouring word (``"end."`` and ``"end"`` are distinct).
        """
        return Counter(self.text.lower().split())

class Corpus:
    """A named, ordered collection of documents.

    The methods read three attributes on each stored document: ``text``
    (a string), ``date`` (a ``'%Y-%m-%d'`` string) and ``word_count`` (a
    ``Counter`` keyed by lower-cased tokens).
    """

    def __init__(self, name):
        self.name = name
        self.documents = []

    def add(self, document):
        """Append *document* to the corpus."""
        self.documents.append(document)

    def search(self, query):
        """Return the documents whose text contains *query*, ignoring case."""
        needle = query.lower()
        return [doc for doc in self.documents if needle in doc.text.lower()]

    @staticmethod
    def _as_datetime(value):
        """Normalise a date bound to a ``datetime`` (or ``None`` if unset)."""
        if not value:
            return None
        if isinstance(value, datetime):
            return value
        if isinstance(value, str):
            return datetime.strptime(value, '%Y-%m-%d')
        # A plain ``date`` cannot be compared with a ``datetime`` (TypeError),
        # so promote it to midnight of that day.
        return datetime.combine(value, datetime.min.time())

    def get_documents_by_date(self, start_date=None, end_date=None):
        """Return the documents dated within ``[start_date, end_date]``.

        Args:
            start_date: Inclusive lower bound, or ``None`` for no lower bound.
            end_date: Inclusive upper bound, or ``None`` for no upper bound.
                Each bound may be a ``datetime``, a ``date`` or a
                ``'%Y-%m-%d'`` string.

        Returns:
            The matching documents in corpus order. When neither bound is
            given, the corpus's own document list is returned.

        Raises:
            ValueError: If a document date (or a string bound) does not match
                ``'%Y-%m-%d'``.
        """
        lower = self._as_datetime(start_date)
        upper = self._as_datetime(end_date)
        if lower is None and upper is None:
            return self.documents

        selected = []
        for doc in self.documents:
            doc_date = datetime.strptime(doc.date, '%Y-%m-%d')
            if lower is not None and doc_date < lower:
                continue
            if upper is not None and doc_date > upper:
                continue
            selected.append(doc)
        return selected

    def get_tf_idf(self, word):
        """Score *word* in every document with a smoothed TF-IDF.

        ``tf`` is the word's share of the document's tokens and ``idf`` is
        ``log(N / (df + 1))``, where ``N`` is the corpus size and ``df`` the
        number of documents containing the word. Because of the ``+ 1``
        smoothing, the score is negative for a word present in every document.

        Args:
            word: The term to score; matched case-insensitively because
                ``word_count`` keys are lower-cased.

        Returns:
            A list of ``(document, score)`` tuples in corpus order. Documents
            without any token get a score of ``0.0``.
        """
        if not self.documents:
            return []

        term = word.lower()
        doc_freq = sum(1 for doc in self.documents if term in doc.word_count)
        # Identical for every document, so compute it once.
        idf = math.log(len(self.documents) / (doc_freq + 1))

        tf_idf_scores = []
        for doc in self.documents:
            total_tokens = sum(doc.word_count.values())
            # Guard against empty documents (no tokens -> no division).
            tf = doc.word_count[term] / total_tokens if total_tokens else 0.0
            tf_idf_scores.append((doc, tf * idf))
        return tf_idf_scores

class SearchEngine:
    """Keyword search over one corpus, plus comparison with a second one.

    The wrapped corpus only needs a ``search(query)`` method returning a list
    of objects that have a ``text`` attribute.
    """

    def __init__(self, corpus):
        self.corpus = corpus

    def search(self, query, top_n=10):
        """Return at most *top_n* documents of the corpus matching *query*."""
        return self.corpus.search(query)[:top_n]

    def compare_corpora(self, other_corpus, query):
        """Compare the texts matching *query* in this corpus and another.

        Documents are compared by their ``text``, not by identity.

        Returns:
            A tuple ``(common, specific_self, specific_other)`` of sets of
            texts: those matched in both corpora, only in this engine's
            corpus, and only in *other_corpus*.
        """
        texts_here = {doc.text for doc in self.corpus.search(query)}
        texts_there = {doc.text for doc in other_corpus.search(query)}
        shared = texts_here & texts_there
        return shared, texts_here - shared, texts_there - shared


# Interface utilisateur
Nous définissons maintenant deux interfaces interactives : la première recherche des mots-clés dans les deux corpus, la seconde filtre les documents d'un corpus par période et par mot pour l'analyse temporelle.

In [2]:
import ipywidgets as widgets
from IPython.display import display, clear_output

def search_interface(corpus1, corpus2):
    """Display a keyword-search form that queries two corpora side by side.

    The form shows a title, a keyword text field, a slider for the number of
    documents to print, an output area and a "Rechercher" button. Clicking
    the button clears the output area and prints, for each corpus, the total
    number of matches followed by the text of the first matches.

    Args:
        corpus1: First corpus; must provide ``search(query)``.
        corpus2: Second corpus; must provide ``search(query)``.
    """
    title = widgets.Label(value="Moteur de recherche de discours")
    query_input = widgets.Text(description="Mots-clés:")
    top_n_slider = widgets.IntSlider(description="Nombre de documents:", min=1, max=20, value=10)
    display(widgets.VBox([title, query_input, top_n_slider]))

    output = widgets.Output()
    display(output)

    def on_button_clicked(b):
        # Everything printed inside the context lands in the output widget.
        with output:
            clear_output()
            keywords = query_input.value
            limit = top_n_slider.value

            hits_first = corpus1.search(keywords)
            hits_second = corpus2.search(keywords)

            # The header reports all matches; only ``limit`` texts are shown.
            print(f"Résultats dans le corpus 1 ({len(hits_first)}):")
            for hit in hits_first[:limit]:
                print(hit.text)

            print(f"\nRésultats dans le corpus 2 ({len(hits_second)}):")
            for hit in hits_second[:limit]:
                print(hit.text)

    search_button = widgets.Button(description="Rechercher")
    search_button.on_click(on_button_clicked)
    display(search_button)

def temporal_analysis_interface(corpus):
    """Display a form listing a corpus's documents by period and word.

    The form shows a start-date picker, an end-date picker, a word field, an
    output area and an "Analyser" button. Clicking the button clears the
    output area, asks the corpus for the documents in the chosen period and
    prints the text of those containing the word (case-insensitive substring
    match).

    Args:
        corpus: Corpus to explore; must provide
            ``get_documents_by_date(start_date=..., end_date=...)``.
    """
    date_input = widgets.DatePicker(description="Date de début:")
    date_end_input = widgets.DatePicker(description="Date de fin:")
    word_input = widgets.Text(description="Mot:")

    vbox = widgets.VBox([date_input, date_end_input, word_input])
    display(vbox)

    output = widgets.Output()
    display(output)

    def _to_datetime(picked):
        # DatePicker yields a plain ``date`` (or None when unset), while the
        # corpus parses document dates into ``datetime`` objects; comparing
        # the two types raises TypeError, so promote the date to midnight.
        if picked is None or isinstance(picked, datetime):
            return picked
        return datetime.combine(picked, datetime.min.time())

    def on_button_clicked(b):
        # Everything printed inside the context lands in the output widget.
        with output:
            clear_output()
            start_date = _to_datetime(date_input.value)
            end_date = _to_datetime(date_end_input.value)
            word = word_input.value
            filtered_docs = corpus.get_documents_by_date(start_date=start_date, end_date=end_date)

            needle = word.lower()
            word_docs = [doc for doc in filtered_docs if needle in doc.text.lower()]

            print(f"Documents contenant le mot '{word}':")
            for doc in word_docs:
                print(doc.text)

    button = widgets.Button(description="Analyser")
    button.on_click(on_button_clicked)
    display(button)


# Exemple d'utilisation
Voici un exemple qui crée deux corpus, y ajoute des documents, lance une recherche par mot-clé puis compare les résultats entre les deux corpus.

In [3]:
# Build the first example corpus with two documents.
corpus1 = Corpus(name='Corpus 1')
doc1 = Document(
    text="The president spoke about freedom and democracy.",
    date="2022-01-01",
    speaker="Speaker 1",
)
corpus1.add(doc1)
doc2 = Document(
    text="Innovation and technology are key to progress.",
    date="2023-01-01",
    speaker="Speaker 2",
)
corpus1.add(doc2)

# Build the second example corpus with a single document.
corpus2 = Corpus(name='Corpus 2')
doc3 = Document(
    text="The economy is changing rapidly, impacting all sectors.",
    date="2022-05-01",
    speaker="Speaker 3",
)
corpus2.add(doc3)

# Try out keyword search, then the comparison between the two corpora.
search_engine = SearchEngine(corpus=corpus1)
print(search_engine.search(query="freedom", top_n=2))
common, specific_self, specific_other = search_engine.compare_corpora(
    other_corpus=corpus2, query="technology"
)
print("Common Documents:", common)
print("Specific to Corpus 1:", specific_self)
print("Specific to Corpus 2:", specific_other)
