In [1]:
import os
import re
import PyPDF2
import docx
from datetime import datetime
import getpass  # Module for getting the username
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import shutil
import numpy as np
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# File Manager qui permet de lire, classifier les fichiers ainsi que les indexer

In [6]:
class FileManager:
    def __init__(self):
        self.inverse_index = {}
        self.time_index = {}
        self.owner_index = {}
        self.stem = PorterStemmer()
        self.files_postings = {}
        self.categories_postings = {}
        self.totalDocs = 0
        # Load the list of stop words
        self.stopword = stopwords.words("english")
        self.get_file_postings()
        self.get_categories_postings()
        self.index_files_by_time()
        self.index_files_by_owner()
        self.fileCategory = {}
        self.classify_files()
        print(self.fileCategory)

    def classify_files(self):
        for root, dirs, files in os.walk("docs"):
            for file in files:
                self.fileCategory[file] = os.path.basename(os.path.dirname(root))

    def index_files_by_time(self):
        # read the files in the docs folder
        for root, dirs, files in os.walk("docs"):
            for file in files:
                file_path = os.path.join(root, file)
                self.index_file_by_date(file_path)

    def index_files_by_owner(self):
        for root, dirs, files in os.walk("docs"):
            for file in files:
                file_path = os.path.join(root, file)
                self.index_file_by_owner(file_path)

    def upload_file(self, file_path):
        full_text = ""
        if file_path.endswith(".pdf"):
            with open(file_path, "rb") as file:
                if file_path.endswith(".pdf"):
                    pdfReader = PyPDF2.PdfFileReader(file)
                    for pageNum in range(pdfReader.numPages):
                        pageObj = pdfReader.getPage(pageNum)
                        text = pageObj.extractText()
                        full_text += text
        elif file_path.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as file:
                full_text = file.read()
        elif file_path.endswith(".docx"):
            document = docx.Document(file_path)
            for paragraph in document.paragraphs:
                full_text += paragraph.text
        return full_text

    def get_categories_postings(self):
        # read the files in the docs folder
        folders = os.listdir("docs")
        for folder in folders:
            self.categories_postings[folder] = {}
            for root, dirs, files in os.walk(f"docs/{folder}"):
                for file in files:
                    file_path = os.path.join(root, file)
                    text = self.upload_file(file_path)
                    tokens = self.tokenize(text)
                    for token in tokens:
                        if token not in self.categories_postings[folder]:
                            self.categories_postings[folder][token] = 1
                        else:
                            self.categories_postings[folder][token] += 1
        return self.categories_postings

    def get_file_postings(self):
        # read the files in the docs folder
        for root, dirs, files in os.walk("docs"):
            for file in files:
                self.totalDocs += 1
                file_path = os.path.join(root, file)
                text = self.upload_file(file_path)
                tokens = self.tokenize(text)
                file_path = file_path.split("\\")[-1]
                self.files_postings[file_path] = {}
                for token in tokens:
                    if token not in self.files_postings[file_path]:
                        self.files_postings[file_path][token] = 1
                    else:
                        self.files_postings[file_path][token] += 1
        print(self.files_postings)

    def index_file_by_date(self, file_path):
        # Get the creation time of the file
        timestamp = os.path.getctime(file_path)
        file_time = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
        # Indexing the file by date and time
        if file_time not in self.time_index:
            self.time_index[file_time] = set()
        self.time_index[file_time].add(file_path)

    def index_file_by_owner(self, file_path):
        # Get the owner of the file
        uid = os.stat(file_path).st_uid
        # Get the owner's username
        owner = getpass.getuser()
        # Indexing the file by owner
        if owner not in self.owner_index:
            self.owner_index[owner] = set()
        self.owner_index[owner].add(file_path)
        return owner

    def inverse_index_files(self):
        # read the files in the docs folder
        for root, dirs, files in os.walk("docs"):
            for file in files:
                file_path = os.path.join(root, file)
                text = self.upload_file(file_path)
                tokens = self.tokenize(text)
                # inverse index the file
                for token in tokens:
                    if token not in self.inverse_index:
                        self.inverse_index[token] = set()
                    self.inverse_index[token].add(file_path.split("\\")[-1])
        return self.inverse_index

    def search_keyword(self, keyword):
        keyword = keyword.lower()
        token = self.stem.stem(keyword)
        scores = {}
        idf = 0
        # calculate idf
        for keyowrd in self.inverse_index:
            if keyowrd == token:
                idf = np.log(self.totalDocs / (self.inverse_index[keyowrd]))
                break
        for file in self.files_postings:
            if token in self.files_postings[file]:
                tf = self.files_postings[file][token]
                scores[file] = tf * idf
        sorted_list = []
        # construst a list of the files sorted by score
        for file in scores:
            sorted_list.append([file, scores[file]])
        sorted_list = sorted(sorted_list, key=lambda x: x[1], reverse=True)
        new_list = [item[0] for item in sorted_list[:2]]
        return new_list

    def tokenize(self, text):
        # Tokenize the text
        tokens = word_tokenize(text)
        # Convert the tokens into lower case
        tokens = [token.lower() for token in tokens]
        # Remove the stop words and empty spaces
        tokens = [
            token for token in tokens if token not in self.stopword and len(token) > 0
        ]
        # Remove the punctuation
        tokens = [re.sub(r"[^a-zA-Z0-9]", "", token) for token in tokens]
        # stem the tokens
        # lemmatizer = WordNetLemmatizer()
        tokens = [stemmer.stem(token) for token in tokens]
        return tokens

    def word_occ_positions(self, tokens):
        word_position = {}
        for index, word in enumerate(tokens):
            if word not in word_position:
                word_position[word] = []
            word_position[word].append(index)
        return word_position

    def search_keyword_association(self, keywords):
        # Get the files that contain the keywords
        fileScores = {}

        tokens = self.tokenize(keywords)
        word_occ_positions = self.word_occ_positions(tokens)
        for root, dirs, files in os.walk("docs"):
            for file in files:
                fileScores[file] = 0
                self.fileCategory[file] = os.path.basename(os.path.dirname(root))
        for file in self.files_postings:
            for keyword in word_occ_positions:
                if keyword in self.files_postings[file]:
                    fileScores[file] += (
                        len(word_occ_positions[keyword])
                        * self.files_postings[file][keyword]
                    )
        sorted_list = []
        # construst a list of the files sorted by score
        for file in fileScores:
            sorted_list.append([file, fileScores[file]])
        sorted_list = sorted(sorted_list, key=lambda x: x[1], reverse=True)
        new_list = [item[0] for item in sorted_list[:2]]
        return new_list

    def classify_file(self, file_path):
        uploaded_file = self.upload_file(file_path)
        tokens = self.tokenize(uploaded_file)
        word_occ_positions = self.word_occ_positions(tokens)
        max_section = self.evaluate_text(word_occ_positions)
        
        file = os.path.basename(file_path)
        shutil.copy2(file_path, f"docs/{max_section}/{file.split('.')[-1]}/{file}")

    def evaluate_text(self, word_occ_positions):
        # formule de classification : score = (tf=n) * idf=(no) * 1
        scores = {}
        max_score = 0
        max_section = ""
        for section in self.categories_postings:
            scores[section] = 0
            for keyword in self.categories_postings[section]:
                if keyword in word_occ_positions:
                    scores[section] += (
                        len(word_occ_positions[keyword])
                        * self.categories_postings[section][keyword]
                    )
            if scores[section] > max_score:
                max_score = scores[section]
                max_section = section
        # print(scores)
        return max_section

In [7]:

# Exemple d'utilisation
file_manager = FileManager()


with open("time_index.txt", "w") as file:
    for time in file_manager.time_index:
        file.write(f"{time} {file_manager.time_index[time]}\n")

# Write owner_index into a file
with open("owner_index.txt", "w") as file:
    for owner in file_manager.owner_index:
        file.write(f"{owner} {file_manager.owner_index[owner]}\n")



{'cookingDoc.txt': {'word': 107, 'cook': 749, 'recip': 214, 'ingredi': 107, 'cuisin': 214, 'chef': 107, 'food': 963, 'flavor': 107, 'season': 107, 'spice': 107, 'herb': 107, 'bake': 107, 'grill': 107, 'roast': 107, 'fri': 107, 'saut': 107, 'simmer': 107, 'boil': 107, 'steam': 107, 'chop': 107, 'slice': 107, 'dice': 107, 'mix': 107, 'blend': 107, 'whisk': 107, 'stir': 107, 'knead': 107, 'measur': 107, 'tast': 214, 'prepar': 107, 'marin': 107, 'sear': 107, 'broil': 107, 'barbecu': 107, 'oven': 107, 'stove': 107, 'pan': 107, 'pot': 107, 'knife': 107, 'cut': 107, 'board': 107, 'utensil': 214, 'cookwar': 107, 'cookbook': 107, 'menu': 214, 'appet': 107, 'main': 107, 'cours': 107, 'dessert': 107, 'salad': 107, 'soup': 107, 'sauc': 107, 'marinad': 107, 'garnish': 107, 'present': 214, 'plate': 107, 'techniqu': 107, 'culinari': 321, 'homemad': 107, 'gourmet': 107, 'healthi': 107, 'nutriti': 107, 'comfort': 107, 'intern': 107, 'region': 107, 'dish': 107, 'show': 107, 'network': 107, 'foodi': 107,

# Classification de nouveaux documents dans notre arborescene

In [8]:
# cette fonction determine la categorie du fichier et le classifie dans l'arborecence des dossiers

file = input("Enter the file name: ")
file_manager.classify_file(file)


# Evaluation du système de classification

In [9]:
import csv

def read_texts_from_csv(csv_file):
    texts = []
    labels = []
    with open(csv_file, 'r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        for row in csv_reader:
            text = row['Text']
            label = row['Category'].lower()
            labels.append(label)
            texts.append(text)
    return texts, labels


csv_file_path = 'dataset.csv'

# Read texts from the CSV dataset
texts_without_labels,labels = read_texts_from_csv(csv_file_path)

predictions = []
for text in texts_without_labels:
    tokens = file_manager.tokenize(text)
    word_occ_positions = file_manager.word_occ_positions(tokens)
    max_section = file_manager.evaluate_text(word_occ_positions)
    predictions.append(max_section)

score = 0
# Step 6: Evaluate the accuracy of the model
for i in range(0,len(labels)):
    if labels[i] == predictions[i]:
        score+=1
score /= len(labels) 
print(f'notre precision de classification {score}')
# draw confusion matrix 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
array = confusion_matrix(labels, predictions, labels=['sports', 'cooking', 'travel'])
df_cm = pd.DataFrame(array, index = ['sports', 'cooking', 'travel'],
                  columns = ['pred_sports', 'pred_cooking', 'pred_travel'])

df_cm

notre precision de classification 0.9333333333333333


Unnamed: 0,pred_sports,pred_cooking,pred_travel
sports,13,0,2
cooking,0,14,0
travel,0,1,15


# Recherche de Fichiers 

In [13]:
def search_by_keyword():
    keyword = input("Enter the keyword: ")
    word =stemmer.stem(keyword.lower())
    results = file_manager.search_keyword(word)
    if len(results) == 0:
        print("No results found")
    else:
        print(f"Found {len(results)} results:")
        for result in results:
            print(result)

search_by_keyword()

Found 2 results:
supposed_travel.txt
travelDoc.txt


In [14]:
def search_by_keywords_association():
    keywords = input("Enter the keywords :")
    results = file_manager.search_keyword_association(keywords)
    if len(results) == 0:
        print("No results found")
    else:
        print(f"Found {len(results)} results:")
        for result in results:
            print(result)

search_by_keywords_association()

Found 2 results:
travelDoc.txt
file2.txt


In [15]:
def search_in_content():
    keyword = input("Enter the keyword: ")
    results = []
    for root, dirs, files in os.walk('docs'):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                file_content = file_manager.upload_file(file_path)
                if keyword.lower() in file_content.lower():
                        results.append(file_path.split("\\")[-1])
            except Exception as e:
                print(f"Erreur lors de la lecture du fichier {file_path}: {e}")
    return results
search_in_content()

['modern_travel.docx',
 'file2.txt',
 'supposed_travel.txt',
 'supposé_travel.txt',
 'travelDoc.txt']

# Evaluation des critères

In [17]:
csv_file_path = "dataset_for_query.csv"

# Read texts from the CSV dataset
texts_without_labels, labels = read_texts_from_csv(csv_file_path)

score = 0
predictions = []
i = 0
for text in texts_without_labels:
    max_sections = file_manager.search_keyword_association(text)
    local_score = 0
    for section in max_sections:
        if file_manager.fileCategory[section] == labels[i]:
            local_score += 1
    local_score = local_score / len(max_sections)
    if local_score >= 0.5:
        score += 1
    predictions.append(file_manager.fileCategory[max_sections[0]])
    i += 1

score = score / len(labels)
print(f"notre precision de recherche {score}")
# draw confusion matrix
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

array = confusion_matrix(labels, predictions, labels=["sports", "cooking", "travel"])
df_cm = pd.DataFrame(
    array,
    index=["sports", "cooking", "travel"],
    columns=["pred_sports", "pred_cooking", "pred_travel"],
)

df_cm

notre precision de recherche 0.85


Unnamed: 0,pred_sports,pred_cooking,pred_travel
sports,5,0,2
cooking,0,5,1
travel,1,0,6
