In [1]:
import collections
import os
from collections import Counter

import nltk

In [2]:
def indexSplitPorter(doc):
    """Index a document using whitespace splitting and Porter stemming.

    The text is split on whitespace, English stop words are dropped
    (case-insensitively), the remaining terms are stemmed with
    ``nltk.PorterStemmer`` and the stems are counted.

    Args:
        doc: Raw document text.

    Returns:
        A list of ``(stem, frequency)`` tuples in order of first appearance.
        The list is empty when no term survives stop-word removal.
    """
    tokens = doc.split()
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    stemmer = nltk.PorterStemmer()
    stem_counts = Counter(stemmer.stem(token) for token in kept_tokens)
    # No max() over the counts here: its result was never used and it raised
    # ValueError for documents without any indexable term.
    return list(stem_counts.items())

In [3]:
def indexSplitLancester(doc):
    """Index a document using whitespace splitting and Lancaster stemming.

    The text is split on whitespace, English stop words are dropped
    (case-insensitively), the remaining terms are stemmed with
    ``nltk.LancasterStemmer`` and the stems are counted.

    Args:
        doc: Raw document text.

    Returns:
        A list of ``(stem, frequency)`` tuples in order of first appearance.
        The list is empty when no term survives stop-word removal.
    """
    tokens = doc.split()
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    stemmer = nltk.LancasterStemmer()
    stem_counts = Counter(stemmer.stem(token) for token in kept_tokens)
    # No max() over the counts here: its result was never used and it raised
    # ValueError for documents without any indexable term.
    return list(stem_counts.items())

In [4]:
def indexTokenPorter(doc):
    """Index a document using regex tokenisation and Porter stemming.

    The text is tokenised with ``nltk.RegexpTokenizer``, English stop words
    are dropped (case-insensitively), the remaining tokens are stemmed with
    ``nltk.PorterStemmer`` and the stems are counted.

    Args:
        doc: Raw document text.

    Returns:
        A list of ``(stem, frequency)`` tuples in order of first appearance.
        The list is empty when no token survives stop-word removal.
    """
    # Raw string: the pattern is unchanged, but the backslashes are no longer
    # parsed as (invalid) string escapes, which newer Python versions warn about.
    tokenizer = nltk.RegexpTokenizer(r'(?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*')
    tokens = tokenizer.tokenize(doc)
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    stemmer = nltk.PorterStemmer()
    stem_counts = Counter(stemmer.stem(token) for token in kept_tokens)
    # No max() over the counts here: its result was never used and it raised
    # ValueError for documents without any indexable term.
    return list(stem_counts.items())

In [5]:
def indexTokenLancester(doc):
    """Index a document using regex tokenisation and Lancaster stemming.

    The text is tokenised with ``nltk.RegexpTokenizer``, English stop words
    are dropped (case-insensitively), the remaining tokens are stemmed with
    ``nltk.LancasterStemmer`` and the stems are counted.

    Args:
        doc: Raw document text.

    Returns:
        A list of ``(stem, frequency)`` tuples in order of first appearance.
        The list is empty when no token survives stop-word removal.
    """
    # Raw string: the pattern is unchanged, but the backslashes are no longer
    # parsed as (invalid) string escapes, which newer Python versions warn about.
    tokenizer = nltk.RegexpTokenizer(r'(?:[A-Za-z]\.)+|[A-Za-z]+[\-@]\d+(?:\.\d+)?|\d+[A-Za-z]+|\d+(?:[\.\,]\d+)?%?|\w+(?:[\-/]\w+)*')
    tokens = tokenizer.tokenize(doc)
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    stemmer = nltk.LancasterStemmer()
    stem_counts = Counter(stemmer.stem(token) for token in kept_tokens)
    # No max() over the counts here: its result was never used and it raised
    # ValueError for documents without any indexable term.
    return list(stem_counts.items())

In [6]:
def createDescriptionFile(numdoc,doc,methode):
    """Append one document's descriptor entries to ``Resultats/Descripteur<methode>.txt``.

    Each entry is written as ``<numdoc> <term> <frequency>`` on its own line.
    The file is opened in append mode, so earlier content is kept.

    Args:
        numdoc: Document identifier written at the start of every line.
        doc: Document text handed to the indexing function.
        methode: Name of a module-level indexing function, looked up in
            ``globals()``, that returns ``(term, frequency)`` pairs.

    Raises:
        ValueError: If ``methode`` does not name a callable global.
    """
    output_file = f"Resultats/Descripteur{methode}.txt"
    method_function = globals().get(methode)
    # callable() rather than truthiness: a non-function global with the same
    # name must be rejected here instead of failing later with a TypeError.
    if not callable(method_function):
        raise ValueError("Invalid method name")
    Terms = method_function(doc)
    # Create the output directory on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'a') as output:
        for term in Terms:
            output.write(f'{numdoc} {term[0]} {term[1]}\n')

In [7]:
def createInverseFile(numdoc,doc,methode):
    """Append one document's inverted-index entries to ``Resultats/Inverse<methode>.txt``.

    Each entry is written as ``<term> <frequency> <numdoc> `` on its own line.
    The file is opened in append mode, so earlier content is kept.

    Args:
        numdoc: Document identifier written at the end of every line.
        doc: Document text handed to the indexing function.
        methode: Name of a module-level indexing function, looked up in
            ``globals()``, that returns ``(term, frequency)`` pairs.

    Raises:
        ValueError: If ``methode`` does not name a callable global.
    """
    output_file = f"Resultats/Inverse{methode}.txt"
    method_function = globals().get(methode)
    # callable() rather than truthiness: a non-function global with the same
    # name must be rejected here instead of failing later with a TypeError.
    if not callable(method_function):
        raise ValueError("Invalid method name")
    Terms = method_function(doc)
    # Create the output directory on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'a') as output:
        for term in Terms:
            output.write(f'{term[0]} {term[1]} {numdoc} \n')

In [8]:
import pandas as pd
def get_document_content(path):
    """Load the comma-separated document collection at ``path`` into a DataFrame.

    Whitespace that directly follows a delimiter is skipped while parsing.
    """
    return pd.read_csv(path, sep=',', skipinitialspace=True)

In [9]:
# Build the descriptor and inverted files for every document and every indexing method.
Methodes = ["indexSplitPorter","indexSplitLancester","indexTokenPorter","indexTokenLancester"]
documents = get_document_content("./Collection/documents.csv")
N = len(documents)

# createDescriptionFile / createInverseFile open their files in append mode,
# so start each run from empty files. Otherwise re-running this cell would
# duplicate every entry.
os.makedirs("Resultats", exist_ok=True)
for methode in Methodes:
    for prefix in ("Descripteur", "Inverse"):
        with open(f"Resultats/{prefix}{methode}.txt", "w"):
            pass

for i in range(N):
    row = documents.iloc[i]
    # The indexed text is the title followed by the body.
    document_content = str(row['title']) + ' ' + str(row['text'])
    num = row['doc_num']
    for methode in Methodes:
        createDescriptionFile(num,document_content,methode)
        createInverseFile(num,document_content,methode)

In [10]:
from collections import defaultdict
import math

def createIndexesPondere(index_file, num_docs):
    """Rewrite a descriptor file in place, appending a weight to every entry.

    Input lines are read as ``<numdoc> <term> <frequency> ...``; only the first
    three fields are used and lines with fewer than three fields are dropped.
    Each output line is ``<numdoc> <term> <frequency> <weight>`` where::

        weight = (frequency / max_frequency_in_document)
                 * log10(num_docs / document_frequency(term) + 1)

    formatted with four decimals. ``document_frequency(term)`` is the number of
    lines on which the term occurs.

    Args:
        index_file: Path of the descriptor file; it is overwritten.
        num_docs: Document count used in the log term of the weight.

    Raises:
        ValueError: If a document number or frequency field is not an integer.
    """
    with open(index_file, 'r') as file:
        lines = file.readlines()

    # Parse every usable line once; both passes below work on these records.
    entries = []
    for line in lines:
        parts = line.split()
        if len(parts) >= 3:
            numdoc, term, term_freq = parts[:3]
            entries.append((int(numdoc), term, int(term_freq)))

    # First pass: collect the statistics needed by the weighting formula.
    term_frequency = defaultdict(int)
    document_frequency = defaultdict(int)
    max_term_frequency = defaultdict(int)
    for numdoc, term, term_freq in entries:
        term_frequency[(numdoc, term)] += term_freq
        document_frequency[term] += 1
        max_term_frequency[numdoc] = max(max_term_frequency[numdoc], term_frequency[(numdoc, term)])

    # Second pass: compute the weight of each entry and format the output line.
    modified_lines = []
    for numdoc, term, term_freq in entries:
        max_freq_in_doc = max_term_frequency[numdoc]
        weight = (term_freq / max_freq_in_doc) * math.log10(num_docs / document_frequency[term] + 1)
        modified_lines.append(f"{numdoc} {term} {term_freq} {weight:.4f}\n")

    with open(index_file, 'w') as output_file:
        output_file.writelines(modified_lines)
def createInversePondere(index_file, num_docs):
    """Rewrite an inverted file in place, appending a weight to every entry.

    Input lines are read as ``<term> <frequency> <numdoc> ...``; only the first
    three fields are used and lines with fewer than three fields are dropped.
    Each output line is ``<term> <frequency> <numdoc> <weight>`` where::

        weight = (frequency / max_frequency_in_document)
                 * log10(num_docs / document_frequency(term) + 1)

    formatted with four decimals. ``document_frequency(term)`` is the number of
    lines on which the term occurs.

    Args:
        index_file: Path of the inverted file; it is overwritten.
        num_docs: Document count used in the log term of the weight.

    Raises:
        ValueError: If a document number or frequency field is not an integer.
    """
    with open(index_file, 'r') as file:
        lines = file.readlines()

    # Parse every usable line once; both passes below work on these records.
    entries = []
    for line in lines:
        parts = line.split()
        if len(parts) >= 3:
            term, term_freq, numdoc = parts[:3]
            entries.append((term, int(term_freq), int(numdoc)))

    # First pass: collect the statistics needed by the weighting formula.
    term_frequency = defaultdict(int)
    document_frequency = defaultdict(int)
    max_term_frequency = defaultdict(int)
    for term, term_freq, numdoc in entries:
        term_frequency[(numdoc, term)] += term_freq
        document_frequency[term] += 1
        max_term_frequency[numdoc] = max(max_term_frequency[numdoc], term_frequency[(numdoc, term)])

    # Second pass: compute the weight of each entry and format the output line.
    modified_lines = []
    for term, term_freq, numdoc in entries:
        max_freq_in_doc = max_term_frequency[numdoc]
        weight = (term_freq / max_freq_in_doc) * math.log10(num_docs / document_frequency[term] + 1)
        modified_lines.append(f"{term} {term_freq} {numdoc} {weight:.4f}\n")

    with open(index_file, 'w') as output_file:
        output_file.writelines(modified_lines)

# Document count handed to the weighting functions.
# NOTE(review): presumably the size of the collection - confirm it matches len(documents).
NUM_DOCS = 5999

# Descriptor files, one per indexing method.
index_files = [
    'Resultats/DescripteurindexSplitLancester.txt',
    'Resultats/DescripteurindexSplitPorter.txt',
    'Resultats/DescripteurindexTokenLancester.txt',
    'Resultats/DescripteurindexTokenPorter.txt',
]

# Inverted files, one per indexing method.
inverse_files = [
    'Resultats/InverseindexSplitLancester.txt',
    'Resultats/InverseindexSplitPorter.txt',
    'Resultats/InverseindexTokenLancester.txt',
    'Resultats/InverseindexTokenPorter.txt',
]

# Add the weight column to every descriptor file, then to every inverted file.
for index in index_files:
    createIndexesPondere(index, NUM_DOCS)
for index in inverse_files:
    createInversePondere(index, NUM_DOCS)