<a href="https://colab.research.google.com/github/NabilNkhili/Doc_Structur-s/blob/main/TP1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Python example that builds an inverted index and an incidence matrix.

# Sample document collection (similar to the one used in the lab sheet).
# Ids are "D0".."D9", assigned in the order the titles are listed below.
documents = {
    f"D{position}": title
    for position, title in enumerate(
        (
            "Citizen Kane",
            "Casablanca",
            "The Godfather The Godfather",
            "Gone with the Wind",
            "Lawrence of Arabia",
            "The Wizard of Oz The Wizard of Oz",
            "The Graduate",
            "On the Waterfront",
            "Schindler's List",
            "Singin' in the Rain",
        )
    )
}

# Inverted index construction
from collections import defaultdict
import pandas as pd

def build_inverted_index(docs):
    """Build an inverted index mapping each term to the documents containing it.

    Each document's text is lower-cased and split on whitespace. No
    punctuation stripping or stemming is applied, so a token such as
    "schindler's" is kept as a single term.

    Args:
        docs: Mapping of document id to document text.

    Returns:
        A ``defaultdict(list)`` mapping each term to the list of ids of the
        documents that contain it, in the iteration order of ``docs``. A
        document id is listed at most once per term. Because the result is
        a ``defaultdict``, indexing it with an unknown term inserts an empty
        posting list for that term.
    """
    inverted_index = defaultdict(list)
    for doc_id, content in docs.items():
        # De-duplicate the terms of one document up front (dict.fromkeys keeps
        # first-occurrence order) rather than scanning the posting list on
        # every occurrence, which costs time linear in the posting-list length.
        for term in dict.fromkeys(content.lower().split()):
            inverted_index[term].append(doc_id)
    return inverted_index

# Build the inverted index for the sample collection.
inverted_index = build_inverted_index(documents)

# Print a header, then one "term: posting list" line per indexed term.
print("Index inversé :")
for term, postings in inverted_index.items():
    print(term, postings, sep=": ")

# Term-document incidence matrix construction
def build_incidence_matrix(docs, inverted_index):
    """Return the term-document incidence matrix as a pandas DataFrame.

    Args:
        docs: Mapping whose keys are the document ids; they become the
            columns, in the mapping's iteration order.
        inverted_index: Mapping of term to the list of ids of the documents
            containing it; its keys, sorted, become the rows.

    Returns:
        A DataFrame initialised to 0 in which cell (term, doc_id) is set to
        1 for every doc_id found in that term's posting list.
    """
    vocabulary = sorted(inverted_index)
    incidence = pd.DataFrame(0, index=vocabulary, columns=docs.keys())
    # Flatten the index into (term, doc_id) pairs and flag each one.
    occurrences = (
        (word, doc) for word in vocabulary for doc in inverted_index[word]
    )
    for word, doc in occurrences:
        incidence.at[word, doc] = 1
    return incidence


# Build the incidence matrix from the collection and its inverted index.
incidence_matrix = build_incidence_matrix(documents, inverted_index)

# Print the header and the matrix, each on its own line.
print("\nMatrice d'incidence :", incidence_matrix, sep="\n")

# Simple boolean retrieval example (AND)
def boolean_and(query_terms, inverted_index):
    """Return the sorted ids of the documents that contain every query term.

    Args:
        query_terms: Sequence of terms to intersect. Terms are looked up
            exactly as given; no case folding is done here.
        inverted_index: Mapping of term to a list of document ids.

    Returns:
        Sorted list of the document ids that appear in the posting list of
        every query term. The list is empty when any term is absent from
        the index or when ``query_terms`` is empty.
    """
    if not query_terms:
        return []
    # Use .get() for every term, the first one included: indexing a
    # defaultdict with an unknown term would insert an empty posting list
    # into the index, and a plain dict would raise KeyError.
    posting_sets = [set(inverted_index.get(term, ())) for term in query_terms]
    return sorted(set.intersection(*posting_sets))

# Sample AND query: "the AND wizard"
query = ["the", "wizard"]
result = boolean_and(query_terms=query, inverted_index=inverted_index)
print(
    f"\nDocuments correspondant à la requête AND {query} : {result}"
)


Index inversé :
citizen: ['D0']
kane: ['D0']
casablanca: ['D1']
the: ['D2', 'D3', 'D5', 'D6', 'D7', 'D9']
godfather: ['D2']
gone: ['D3']
with: ['D3']
wind: ['D3']
lawrence: ['D4']
of: ['D4', 'D5']
arabia: ['D4']
wizard: ['D5']
oz: ['D5']
graduate: ['D6']
on: ['D7']
waterfront: ['D7']
schindler's: ['D8']
list: ['D8']
singin': ['D9']
in: ['D9']
rain: ['D9']

Matrice d'incidence :
             D0  D1  D2  D3  D4  D5  D6  D7  D8  D9
arabia        0   0   0   0   1   0   0   0   0   0
casablanca    0   1   0   0   0   0   0   0   0   0
citizen       1   0   0   0   0   0   0   0   0   0
godfather     0   0   1   0   0   0   0   0   0   0
gone          0   0   0   1   0   0   0   0   0   0
graduate      0   0   0   0   0   0   1   0   0   0
in            0   0   0   0   0   0   0   0   0   1
kane          1   0   0   0   0   0   0   0   0   0
lawrence      0   0   0   0   1   0   0   0   0   0
list          0   0   0   0   0   0   0   0   1   0
of            0   0   0   0   1   1   0   0   0