In [13]:
# Chargement minimal du CSV
import pandas as pd
from IPython.display import display
import csv

# Chargement du fichier CSV
df = pd.read_csv('./data/corpus_data.csv', sep='\t')

print(f"Fichier chargé : {len(df)} lignes.")
print("Colonnes :", df.columns.tolist())
display(df.head(3))


Fichier chargé : 462 lignes.
Colonnes : ['id', 'titre', 'auteur', 'date', 'url', 'texte', 'type', 'nb_comments', 'co_auteurs']


Unnamed: 0,id,titre,auteur,date,url,texte,type,nb_comments,co_auteurs
0,0,Software Engineering Podcasts & Conference Tal...,TechTalksWeekly,2025-12-04T08:24:20Z,https://redd.it/1pdtt4r,Hi [r/SoftwareEngineering](https://www.reddit....,Reddit,0,[]
1,1,Use case diagram generalization,Humble_Ad_7053,2025-12-04T11:06:34Z,https://redd.it/1pdwagz,It is not clear in UML 2.5.1 that generalizati...,Reddit,5,[]
2,2,How are you measuring developer velocity witho...,Black_0ut,2025-11-30T07:33:26Z,https://redd.it/1pabfba,Our leadership keeps asking for better visibil...,Reddit,41,[]


In [14]:
import ast
import pandas as pd
from models.Corpus import Corpus
from models.Document import RedditDocument, ArxivDocument

mon_corpus = Corpus(nom='Corpus de Software Engineering')

# Reconstruction des documents depuis le CSV
required_cols = {'titre','auteur','date','url','texte','type','nb_comments','co_auteurs'}
missing = sorted(list(required_cols - set(df.columns)))
if missing:
    raise ValueError('Colonnes manquantes dans le CSV: ' + ', '.join(missing))

for _, row in df.iterrows():
    doc_type = str(row.get('type', '')).strip()
    titre = str(row.get('titre', '')).strip()
    auteur = str(row.get('auteur', '')).strip()
    date = str(row.get('date', '')).strip()
    url = str(row.get('url', '')).strip()
    texte = str(row.get('texte', '')).strip()

    if doc_type.lower() == 'reddit':
        nb = row.get('nb_comments', 0)
        if pd.isna(nb):
            nb_comments = 0
        else:
            try:
                nb_comments = int(nb)
            except Exception:
                nb_comments = 0
        doc = RedditDocument(titre, auteur, date, url, texte, nb_comments)
    else:
        co = row.get('co_auteurs', '[]')
        if pd.isna(co) or str(co).strip() == '':
            co_auteurs = []
        else:
            s = str(co).strip()
            if s.startswith('['):
                try:
                    parsed = ast.literal_eval(s)
                    co_auteurs = parsed if isinstance(parsed, list) else []
                except Exception:
                    co_auteurs = []
            else:
                co_auteurs = [s]
        doc = ArxivDocument(titre, auteur, date, url, texte, co_auteurs=co_auteurs)

    mon_corpus.add_document(doc)

print(mon_corpus)
print('Docs:', len(mon_corpus.get_documents()))
print('Auteurs:', len(mon_corpus.get_authors()))


<Corpus(nom='Corpus de Software Engineering', docs=1848, auteurs=403, id_max=1847)>
Docs: 1848
Auteurs: 403


In [15]:
from models.SearchEngine import SearchEngine
from IPython.display import display

# Initialisation du moteur de recherche
engine = SearchEngine(mon_corpus)

# Tests rapides
print('\n=== Test 1 : "design" ===')
res1 = engine.search('design', n_results=5)
display(res1)

print('\n=== Test 2 : "testing" ===')
res2 = engine.search('testing', n_results=5)
display(res2)


-> Vocabulaire créé : 7723 mots.

=== Test 1 : "design" ===


                                                                 

Unnamed: 0,Document,Score,Auteur,Date,URL,Type
0,Is there any term in software engineering more...,0.4299,bkovitz,2025-01-15T18:02:58Z,https://redd.it/1i222pp,Reddit
1,Is there any term in software engineering more...,0.4299,bkovitz,2025-01-15T18:02:58Z,https://redd.it/1i222pp,Reddit
2,Is there any term in software engineering more...,0.4299,bkovitz,2025-01-15T18:02:58Z,https://redd.it/1i222pp,Reddit
3,Is there any term in software engineering more...,0.4299,bkovitz,2025-01-15T18:02:58Z,https://redd.it/1i222pp,Reddit
4,Qualitative analysis of the relationship betwe...,0.3931,Asif Imran,2023-10-22T23:21:13Z,http://arxiv.org/abs/2310.14449v1,Arxiv



=== Test 2 : "testing" ===


                                                                

Unnamed: 0,Document,Score,Auteur,Date,URL,Type
0,Testing Research Software: A Survey,0.4945,Nasir U. Eisty,2022-05-31T17:40:03Z,http://arxiv.org/abs/2205.15982v1,Arxiv
1,Testing Research Software: A Survey,0.4945,Nasir U. Eisty,2022-05-31T17:40:03Z,http://arxiv.org/abs/2205.15982v1,Arxiv
2,Testing Research Software: A Survey,0.4945,Nasir U. Eisty,2022-05-31T17:40:03Z,http://arxiv.org/abs/2205.15982v1,Arxiv
3,Testing Research Software: A Survey,0.4945,Nasir U. Eisty,2022-05-31T17:40:03Z,http://arxiv.org/abs/2205.15982v1,Arxiv
4,Software Testing with Large Language Models: S...,0.3375,Junjie Wang,2023-07-14T08:26:12Z,http://arxiv.org/abs/2307.07221v3,Arxiv


In [16]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Interface Graphique
style = {'description_width': 'initial'}

lbl_titre = widgets.HTML("<h2>Moteur de recherche Software Engineering</h2>")

# Champ texte pour les mots clés
txt_requete = widgets.Text(
    description='Mots clés :',
    placeholder='ex: javascript web',
    style=style,
)

# Slider pour le nombre de résultats
slider_nb = widgets.IntSlider(
    value=10,
    min=1,
    max=50,
    step=1,
    description="Nombre d'articles :",
    style=style,
)

# Bouton de recherche
btn_search = widgets.Button(
    description='Rechercher',
    button_style='info',
    icon='search',
)

# Zone d'affichage des résultats
out_resultat = widgets.Output()

# Fonction clique_bouton
def clique_bouton(b):
    mots = txt_requete.value
    n = slider_nb.value
    
    with out_resultat:
        clear_output() # On efface le résultat précédent
        
        if not mots.strip():
            print("Veuillez entrer des mots-clés.")
            return
            
        print(f"Recherche en cours pour : '{mots}' ({n} résultats)...")
        
        df_res = engine.search(mots, n_results=n)
        
        if df_res.empty:
            print("Aucun résultat trouvé.")
        else:
            display(df_res)

btn_search.on_click(clique_bouton)

# Mise en page
ui = widgets.VBox([
    lbl_titre,
    txt_requete,
    widgets.HBox([slider_nb, btn_search]),
    out_resultat,
])

# Affichage final
display(ui)


VBox(children=(HTML(value='<h2>Moteur de recherche Software Engineering</h2>'), Text(value='', description='Mo…