In [204]:
import os
import re
import PyPDF2
import docx
from datetime import datetime
import getpass  # Module for getting the username
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import shutil
import csv

## Préparation de datasets de références qui servira comme vocabultaire pour chaque catégorie

In [205]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def read_csv(file_path):
    words = []
    with open(file_path, 'r', newline='') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        for row in csv_reader:
            word =lemmatizer.lemmatize(row['Word'].lower())
            words.append(word)
    return words

sports_words = read_csv('datasets/sports.csv')
cooking_words = read_csv('datasets/cooking.csv')
travel_words = read_csv('datasets/travel.csv')

## enrichissement des datasets par le vocabulaire de nltk

In [206]:
syns = wordnet.synsets("sports")
sports = []
for s in syns:
    for l in s.lemmas():
        sports.append(stemmer.stem(l.name()))
sports_dataset = set(sports)
sports_dataset.update(set(sports_words))

syns = wordnet.synsets("cooking")
cooking = []
for s in syns:
    for l in s.lemmas():
        cooking.append(stemmer.stem(l.name()))
cooking_dataset = set(cooking)
cooking_dataset.update(set(cooking_words))
syns = wordnet.synsets("travel")
travel = []
for s in syns:
    for l in s.lemmas():
        travel.append(stemmer.stem(l.name()))
travel_dataset = set(travel)
travel_dataset.update(set(travel_words))



# File Manager qui permet de lire, classifier les fichier ainsi que les indexer

In [207]:

class FileManager:
    def __init__(self):
        self.inverse_index = {}
        self.time_index = {}
        self.owner_index = {}
        self.stem = PorterStemmer()

    def upload_file(self, file_path):
        full_text = ""
        if file_path.endswith(".pdf"):
            with open(file_path, "rb") as file:
                if file_path.endswith(".pdf"):
                    pdfReader = PyPDF2.PdfFileReader(file)
                    for pageNum in range(pdfReader.numPages):
                        pageObj = pdfReader.getPage(pageNum)
                        text = pageObj.extractText()
                        full_text += text
        elif file_path.endswith(".txt"):
            with open(file_path, "r",encoding="utf-8") as file:
                full_text = file.read()
        elif file_path.endswith(".docx"):
            document = docx.Document(file_path)
            for paragraph in document.paragraphs:
                full_text += paragraph.text
        return full_text

    def index_file_by_date(self, file_path):
        # Get the creation time of the file
        timestamp = os.path.getctime(file_path)
        file_time = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

        # Indexing the file by date and time
        if file_time not in self.time_index:
            self.time_index[file_time] = set()
        self.time_index[file_time].add(file_path)

    def index_file_by_owner(self, file_path):
        # Get the owner of the file
        uid = os.stat(file_path).st_uid
        # Get the owner's username
        owner = getpass.getuser()
        # Indexing the file by owner
        if owner not in self.owner_index:
            self.owner_index[owner] = set()
        self.owner_index[owner].add(file_path)
        return owner

    def inverse_index_files(self):
        # read the files in the docs folder
        for root, dirs, files in os.walk("docs"):
            for file in files:
                file_path = os.path.join(root, file)
                text = self.upload_file(file_path)
                tokens = self.tokenize(text)
                # inverse index the file
                for token in tokens:
                    if token not in self.inverse_index:
                        self.inverse_index[token] = set()
                    self.inverse_index[token].add(file_path.split("\\")[-1])
        return self.inverse_index

    def search_keyword(self, keyword):
        if keyword.lower() in self.inverse_index:
            return self.inverse_index[keyword.lower()]
        else:
            return []

    def tokenize(self, text):
        # Load the list of stop words
        stopword = stopwords.words("english")
        # Tokenize the text
        tokens = word_tokenize(text)
        # Convert the tokens into lower case
        tokens = [token.lower() for token in tokens]
        # Remove the stop words and empty spaces
        tokens = [token for token in tokens if token not in stopword and token != ""]

        # Remove the punctuation
        tokens = [re.sub(r"[^a-zA-Z0-9]", "", token) for token in tokens]
        # stem the tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [stemmer.stem(token) for token in tokens]
        return tokens

    def word_occ_positions(self, tokens):
        word_position = {}
        for index, word in enumerate(tokens):
            if word not in word_position:
                word_position[word] = []
            word_position[word].append(index)
        return word_position

    def search_keyword_association(self, keywords):
        # Get the files that contain the keywords
        files = []
        for keyword in keywords:
            key = self.search_keyword(stemmer.stem(keyword))
            for k in key:
                files.append(k)

        # Get the intersection of the files
        print(files)
        files = set(files)
        return files

    def classify_file(self, file_path):
        uploaded_file = self.upload_file(file_path)
        tokens = self.tokenize(uploaded_file)
        word_occ_positions = self.word_occ_positions(tokens)
        max_section = self.evaluate_text(word_occ_positions)
        # if (max_section ==''):
        #     rand = ['sports','cooking','travel']
        #     max_section = rand[random.randint(0,2)]
        file = os.path.basename(file_path)
        shutil.copy2(file_path, f"docs/{max_section}/{file.split('.')[1]}/{file}")
    
    def evaluate_text(self,word_occ_positions):
        scores = {}
        sections = {
            "sports": sports_dataset,
            "cooking": cooking_dataset,
            "travel": travel_dataset,
        }
        max_score = 0
        max_section = ''
        for section in sections:
            scores[section] = 0
            for keyword in sections[section]:
                if keyword in word_occ_positions:
                    scores[section] += len(word_occ_positions[keyword])
    
            if scores[section] > max_score:
                max_score = scores[section]
                max_section = section
        return max_section
# Exemple d'utilisation
file_manager = FileManager()

test = file_manager.upload_file("gaming.txt")
index = file_manager.index_file_by_date("test.txt")
owner = file_manager.index_file_by_owner("file3.txt")

tokens = file_manager.tokenize(test)

print(tokens)
print(owner)



['titl', '', '', 'evolut', 'game', '', 'pixel', 'virtual', 'realiti', '', 'introduct', '', 'realm', 'entertain', '', 'industri', 'experienc', 'rapid', 'transform', 'growth', 'game', 'industri', '', 'humbl', 'begin', 'pixel', 'charact', 'arcad', 'screen', 'immers', 'world', 'virtual', 'realiti', '', 'game', 'evolv', 'multibilliondollar', 'global', 'phenomenon', '', 'articl', 'delv', 'fascin', 'journey', 'game', '', 'trace', 'evolut', 'decad', '', 'birth', 'game', '', 'stori', 'begin', 'earli', '1970', 'birth', 'arcad', 'game', '', 'game', 'like', 'pong', 'space', 'invad', 'introduc', 'world', 'addict', 'allur', 'electron', 'entertain', '', 'pixel', 'graphic', 'simpl', 'gameplay', 'earli', 'titl', 'set', 'foundat', 'come', '', 'rise', 'consol', '', 'technolog', 'advanc', '', 'game', 'experi', '', '1980', 'wit', 'rise', 'home', 'game', 'consol', '', 'icon', 'nintendo', 'entertain', 'system', '', 'ne', '', 'lead', 'way', '', 'suddenli', '', 'game', 'nt', 'confin', 'arcad', '', 'becam', 'co

In [208]:
# cette fonction determine la categorie du fichier et le classifie dans l'arborecence des dossiers
file_manager.classify_file("file2.txt")

# Evaluation du système de classification

In [219]:
import csv

def read_texts_from_csv(csv_file):
    texts = []
    labels = []
    with open(csv_file, 'r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        for row in csv_reader:
            text = row['Text']
            label = row['Category'].lower()
            labels.append(label)
            texts.append(text)
    return texts, labels

# Replace 'your_dataset.csv' with the actual filename of your CSV dataset
csv_file_path = 'dataset.csv'

# Read texts from the CSV dataset
texts_without_labels,labels = read_texts_from_csv(csv_file_path)

predictions = []
for text in texts_without_labels:
    tokens = file_manager.tokenize(text)
    # print(tokens)
    word_occ_positions = file_manager.word_occ_positions(tokens)
    max_section = file_manager.evaluate_text(word_occ_positions)
    predictions.append(max_section)

score = 0
# Step 6: Evaluate the accuracy of the model
for i in range(0,len(labels)):
    if labels[i] == predictions[i]:
        score+=1
score /= len(labels) 
print(score)
# draw confusion matrix 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
array = confusion_matrix(labels, predictions, labels=['sports', 'cooking', 'travel'])
df_cm = pd.DataFrame(array, index = ['sports', 'cooking', 'travel'],
                  columns = ['pred_sports', 'pred_cooking', 'pred_travel'])

df_cm

0.5777777777777777


Unnamed: 0,pred_sports,pred_cooking,pred_travel
sports,11,0,1
cooking,0,7,0
travel,1,3,8


# Recherche de Fichier 

In [220]:
file_manager.inverse_index_files()


{'titl': {'file2.txt',
  'file3_554.txt',
  'travel2299.txt',
  'voyage.txt',
  'voyager1.txt'},
 '': {'file2.txt',
  'file3_554.txt',
  'modern_travel.docx',
  'tigerwood__55.txt',
  'travel2299.txt',
  'voyage.txt',
  'voyager1.txt'},
 'art': {'file2.txt', 'file3_554.txt', 'travel2299.txt', 'voyager1.txt'},
 'joy': {'file2.txt', 'file3_554.txt', 'travel2299.txt', 'voyager1.txt'},
 'cook': {'file3_554.txt'},
 'culinari': {'file3_554.txt'},
 'journey': {'file2.txt',
  'file3_554.txt',
  'travel2299.txt',
  'voyage.txt',
  'voyager1.txt'},
 'flavor': {'file3_554.txt', 'voyage.txt'},
 'creation': {'file3_554.txt'},
 'introduct': {'file2.txt', 'file3_554.txt', 'travel2299.txt', 'voyager1.txt'},
 'often': {'file2.txt', 'file3_554.txt', 'travel2299.txt', 'voyager1.txt'},
 'regard': {'file3_554.txt'},
 'scienc': {'file3_554.txt'},
 'univers': {'file3_554.txt'},
 'languag': {'file2.txt',
  'file3_554.txt',
  'travel2299.txt',
  'voyage.txt',
  'voyager1.txt'},
 'transcend': {'file2.txt',
  'f

In [221]:
def search_by_keyword():
    keyword = input("Enter the keyword: ")
    word =stemmer.stem(keyword.lower())
    results = file_manager.search_keyword(word)
    if len(results) == 0:
        print("No results found")
    else:
        print(f"Found {len(results)} results:")
        for result in results:
            print(result)

search_by_keyword()

Found 5 results:
modern_travel.docx
voyager1.txt
file2.txt
voyage.txt
travel2299.txt


In [223]:
def search_by_keywords_association():
    keywords = input("Enter the keywords separated by a comma: ")
    keywords = keywords.split(",")
    results = file_manager.search_keyword_association(keywords)
    if len(results) == 0:
        print("No results found")
    else:
        print(f"Found {len(results)} results:")
        for result in results:
            print(result)

search_by_keywords_association()

['modern_travel.docx', 'voyager1.txt', 'file2.txt', 'voyage.txt', 'travel2299.txt']
Found 5 results:
modern_travel.docx
voyager1.txt
file2.txt
voyage.txt
travel2299.txt


In [214]:
def search_in_content():
    keyword = input("Enter the keyword: ")
    results = []
    for root, dirs, files in os.walk('docs'):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                file_content = file_manager.upload_file(file_path)
                if keyword.lower() in file_content.lower():
                        results.append(file_path.split("\\")[-1])
            except Exception as e:
                print(f"Erreur lors de la lecture du fichier {file_path}: {e}")
    return results
search_in_content()

['file3_554.txt',
 'tigerwood__55.txt',
 'modern_travel.docx',
 'file2.txt',
 'travel2299.txt',
 'voyage.txt',
 'voyager1.txt']