In [1]:
import os
import json
import nltk
import string
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import KFold
from nltk.corpus import stopwords





In [4]:
def parcourir_repertoire(repertoire):
    dfs = []
    for dossier, sous_repertoires, fichiers in os.walk(repertoire):
        for fichier in fichiers:
            if fichier.endswith('.json'):
                chemin_fichier = os.path.join(dossier, fichier)
                
                with open(chemin_fichier, 'r') as f:
                    contenu = json.load(f)
                    # Récupération de la description et de la catégorie
                    description = contenu.get('description', None)
                    categorie = os.path.basename(os.path.dirname(chemin_fichier))
                    
                    if description is not None:
                        # Création d'un DataFrame avec description et categorie comme colonnes
                        df = pd.DataFrame({'Description': [description], 'Categorie': [categorie]})
                    
                        dfs.append(df)             
    return dfs


In [5]:
# Chemin du répertoire à parcourir
repertoire_a_explorer = './brevets_alternants'
dfs = parcourir_repertoire(repertoire_a_explorer)


[                                         Description Categorie
0  [[0001]    The present invention relates to a ...    CHIMIE]


In [13]:
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # Convertir la liste en une chaîne de caractères et supprimer les formats spécifiques, les caractères indésirables et la ponctuation
    cleaned_text = ' '.join(text)
    cleaned_text = re.sub(r'\[\d+\]|[\n\"]| \d+\.\s*|[%s]' % re.escape(''.join(['.', ',', '!', '?', ';', ':', '-', '(', ')'])), '', cleaned_text)
    
    # Convertir en minuscules
    cleaned_text = cleaned_text.lower()
    
    # Supprimer les espaces en début et fin de chaîne et réduire les espaces consécutifs à un seul espace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # Supprimer les stopwords
    stop_words = set(stopwords.words('english'))
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word not in stop_words)
    
    # Tokenization
    tokens = word_tokenize(cleaned_text)
     # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    
    return lemmatized_tokens


[nltk_data] Downloading package stopwords to /home/catech/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/catech/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Nettoyer chaque cellule de chaque DataFrame dans la liste dfs
for df in dfs:
    # Afficher la colonne 'Description' avant le nettoyage
    print("Colonne 'Description' avant nettoyage :")
    for desc in df['Description']:
        print(desc)
    
    # Nettoyer le DataFrame
    df_cleaned = df.applymap(clean_text)

   # Afficher la colonne 'Description' après le nettoyage
    print("Colonne 'Description' après nettoyage :")
    for desc in df_cleaned['Description']:
        print(desc)

Colonne 'Description' avant nettoyage :
['[0001]    The present invention relates to a method of making a support material composition, to a support material composition, and to the use of the support material composition as a nitrogen oxide storage component applicable in catalysts for treating exhaust gases, for example from lean-burn engines.', 'BACKGROUND:', '[0002]    To abate the NOx content in the exhaust gas of lean-burn gasoline or diesel engines, designated NOx after-treatment systems are required. This is because the reduction of NOx to N2in a three-way catalyst operating under the prevailing oxidizing conditions is not possible. Therefore, a special exhaust gas after-treatment catalyst, has been developed containing a material that is able to store NOx, e.g. as a nitrate for example under lean conditions. By applying short stoichiometric or rich operation conditions the stored NOx can then be converted to nitrogen and the storage material regenerated. This catalyst is commo

  df_cleaned = df.applymap(clean_text)


Colonne 'Description' après nettoyage :
['present', 'invention', 'relates', 'method', 'making', 'support', 'material', 'composition', 'support', 'material', 'composition', 'use', 'support', 'material', 'composition', 'nitrogen', 'oxide', 'storage', 'component', 'applicable', 'catalyst', 'treating', 'exhaust', 'gas', 'example', 'leanburn', 'engine', 'background', 'abate', 'nox', 'content', 'exhaust', 'gas', 'leanburn', 'gasoline', 'diesel', 'engine', 'designated', 'nox', 'aftertreatment', 'system', 'required', 'reduction', 'nox', 'n2in', 'threeway', 'catalyst', 'operating', 'prevailing', 'oxidizing', 'condition', 'possible', 'therefore', 'special', 'exhaust', 'gas', 'aftertreatment', 'catalyst', 'developed', 'containing', 'material', 'able', 'store', 'nox', 'eg', 'nitrate', 'example', 'lean', 'condition', 'applying', 'short', 'stoichiometric', 'rich', 'operation', 'condition', 'stored', 'nox', 'converted', 'nitrogen', 'storage', 'material', 'regenerated', 'catalyst', 'commonly', 'called