<a href="https://colab.research.google.com/github/NabilNkhili/Doc_Structur-s/blob/main/TP3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Indexer une collection de 9,804 documents sans utiliser de stop-words ni stemming.

-Temps total d'indexation.

-Nombre total de tokens (termes individuels) et de tokens distincts.

-Longueur moyenne des tokens distincts.

-Nombre total et distinct de termes après la normalisation.

-Longueur moyenne des documents.-

In [5]:
import time
from collections import defaultdict

def tokenize(content):
    return [word.lower() for word in content.split() if word.isalpha()]

def index_collection(file_path, chunk_size=1024 * 1024):
    index = defaultdict(list)
    total_tokens = 0
    total_chars = 0
    start_time = time.time()

    with open(file_path, 'r', encoding='utf-8') as file:
        while chunk := file.read(chunk_size):
            for doc_id, line in enumerate(chunk.splitlines(), start=1):
                terms = tokenize(line)
                total_tokens += len(terms)
                total_chars += sum(len(term) for term in terms)
                for term in terms:
                    if doc_id not in index[term]:
                        index[term].append(doc_id)

    total_time = time.time() - start_time
    return index, total_time, total_tokens, total_chars

file_path = "Text_Only_Ascii_Coll_NoSem"
index, total_time, total_tokens, total_chars = index_collection(file_path)
print(f"Temps d'indexation : {total_time}s")
print(f"Nombre total de tokens : {total_tokens}")
print(f"Nombre total de caractères : {total_chars}")
print(f"Nombre de termes distincts : {len(index)}")


Temps d'indexation : 3217.2785229682922s
Nombre total de tokens : 8871666
Nombre total de caractères : 46134689
Nombre de termes distincts : 163253


In [6]:
from nltk.stem import PorterStemmer

def remove_stop_words(index, stop_words_file):
    with open(stop_words_file, 'r', encoding='utf-8') as file:
        stop_words = set(word.strip() for word in file)
    return {term: postings for term, postings in index.items() if term not in stop_words}

def apply_stemming(index):
    stemmer = PorterStemmer()
    stemmed_index = defaultdict(list)
    for term, postings in index.items():
        stemmed_term = stemmer.stem(term)
        stemmed_index[stemmed_term].extend(postings)
    return stemmed_index

stop_words_file = "stop-words-english4.txt"
filtered_index = remove_stop_words(index, stop_words_file)
print(f"Index après suppression des mots vides : {len(filtered_index)} termes distincts.")

stemmed_index = apply_stemming(filtered_index)
print(f"Index après stemming : {len(stemmed_index)} termes distincts.")


Index après suppression des mots vides : 162641 termes distincts.
Index après stemming : 129837 termes distincts.


In [7]:
import math

def compute_smart_ltn(index, document_lengths):
    weights = defaultdict(dict)
    for term, postings in index.items():
        df = len(postings)
        idf = math.log10(len(document_lengths) / df) if df > 0 else 0
        for doc_id in postings:
            tf = index[term].count(doc_id)
            weight = (1 + math.log10(tf)) * idf if tf > 0 else 0
            weights[term][doc_id] = weight
    return weights

def compute_query_scores(query, weights, document_lengths):
    query_terms = tokenize(query)
    scores = defaultdict(float)
    for term in query_terms:
        if term in weights:
            for doc_id, weight in weights[term].items():
                scores[doc_id] += weight
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]

document_lengths = {doc_id: 100 for doc_id in range(1, 10001)}
query = "web ranking scoring algorithm"
ltn_weights = compute_smart_ltn(stemmed_index, document_lengths)
top_10 = compute_query_scores(query, ltn_weights, document_lengths)
print("Top-10 documents avec SMART ltn :", top_10)


Top-10 documents avec SMART ltn : [(22952, 0.15069561188507175), (10803, 0.15069561188507175), (63837, 0.15069561188507175), (10822, 0.15069561188507175), (417, 0.11582793047608728), (5594, 0.11582793047608728), (7376, 0.11582793047608728), (14389, 0.11582793047608728), (14393, 0.11582793047608728), (14397, 0.11582793047608728)]


In [8]:
def compute_smart_ltc(index, document_lengths):
    weights = compute_smart_ltn(index, document_lengths)
    for doc_id in document_lengths.keys():
        norm = math.sqrt(sum((weights[term][doc_id] ** 2 for term in weights if doc_id in weights[term])))
        for term in weights:
            if doc_id in weights[term]:
                weights[term][doc_id] /= norm
    return weights

ltc_weights = compute_smart_ltc(stemmed_index, document_lengths)
top_10 = compute_query_scores(query, ltc_weights, document_lengths)
print("Top-10 documents avec SMART ltc :", top_10)


Top-10 documents avec SMART ltc : [(22952, 0.15069561188507175), (10803, 0.15069561188507175), (63837, 0.15069561188507175), (10822, 0.15069561188507175), (14389, 0.11582793047608728), (14393, 0.11582793047608728), (14397, 0.11582793047608728), (14406, 0.11582793047608728), (14778, 0.11582793047608728), (19344, 0.11582793047608728)]


In [None]:
def compute_bm25(index, document_lengths, avg_doc_length, k1=1.2, b=0.75):
    weights = defaultdict(dict)
    for term, postings in index.items():
        df = len(postings)  # Document frequency
        idf = math.log10((len(document_lengths) - df + 0.5) / (df + 0.5) + 1)
        for doc_id in postings:
            if doc_id not in document_lengths:
                document_lengths[doc_id] = 100  # Valeur par défaut
            tf = index[term].count(doc_id)
            doc_length = document_lengths[doc_id]
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_length / avg_doc_length))
            weights[term][doc_id] = idf * (numerator / denominator)
    return weights

# Assurez-vous que document_lengths contient toutes les entrées
max_doc_id = max(doc_id for term in index for doc_id in index[term])
document_lengths = {doc_id: 100 for doc_id in range(1, max_doc_id + 1)}
avg_doc_length = sum(document_lengths.values()) / len(document_lengths)

bm25_weights = compute_bm25(stemmed_index, document_lengths, avg_doc_length)
top_10 = compute_query_scores(query, bm25_weights, document_lengths)
print("Top-10 documents avec BM25 :", top_10)
