Processing des données avec stemming, suppression de stop-words et selection via DF/IDF

In [5]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from tqdm.gui import tqdm as tqdm_gui

Import des data via deserialize

In [6]:
def xmlToDf(xmlFile):
    # Read XML file
    df = pd.read_xml(xmlFile)
    #replace None to empty string in commentaire column
    df["commentaire"] = df["commentaire"].apply(checkIfWordInComment)
    return df


def checkIfWordInComment(comment):
    if comment is None:
        return ""
    return comment

df_dev_fast = xmlToDf("data/dev.xml")

In [7]:
# def deserializeDf(path):
#     with open(path, 'rb') as f:
#         return pickle.load(f)
    
# # df_dev = deserializeDf('data/df_dev.pkl')
# # df_idf = deserializeDf('data/df_idf.pkl')
# df_dev_fast = deserializeDf('data/df_dev_fast.pkl')
# #df_idf_dev_fast = deserializeDf('data/df_idf_dev_fast.pkl')

In [8]:

import nltk
from nltk.corpus import stopwords
import spacy

nltk.download('stopwords')
stopWords = set(stopwords.words('french'))
spacy.prefer_gpu()
# nlp = spacy.load("fr_dep_news_trf") # less efficient but more accurate
nlp = spacy.load("fr_core_news_sm") # more efficient but less accurate

stop_words = [" ", "l'", "l’", "la", "le", "les", "d’", "d'", "de", "du", "des", "une", "un",
                "ce", "ces", "je", "moi", "mon", "me", "mes", "tu", "toi", "ton", "te", "tes", 
                "il", "lui", "son", "se", "ses", "nous", "notre", "nos", "vous", "votre", "vos",
                "ils", "leur", "leurs", "n'", "ne", "tout", "être", "avoir", "deja", "déjà",
                "ou" ,"où", "qu’", "qu'", "que", "qui", "quelle", "quel", "quelles", "quels", 
                ".", ",", "...", "sur", "telle", "tel", "telles", "tels", "laquelle", "lequel",
                "laquelles", "lequels", "simplement", "comment", "quoi", "dont", "donc", "tant",
                "jamais", "rarement", "parfois", "souvent", "toujours", "avec", "pour", "ici",
                ":", "(", ")", "[", "]", "\"", "y", "et", "par", "fois", "peu", "on", "cela",
                "mais", "dans", "en", "à", "au", "même", "là", "-", "si", "comme", "aussi",
                "car", "parce", "quand"]

stopWords = list(stopWords)
stopWords.extend(stop_words)
tmp = ' '.join(stopWords)
tmp = nlp(tmp)
test = [X.lemma_ for X in tmp]
stopWords = list(dict.fromkeys(test))

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wiakx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import spacy
import nltk
import multiprocessing as mp
import concurrent.futures


spacy.prefer_gpu()
nlp = spacy.load("fr_core_news_sm") # more efficient but less accurate
# nlp = spacy.load("fr_dep_news_trf") # less efficient but more accurate


def getLemWord(comment):
    if comment is None:
        return ""
    doc = nlp(comment)
    tokens = [X.lemma_ for X in doc]
    clean_words = []
    for token in tokens:
        if token in stopWords :
            tokens.remove(token)
    return tokens



tqdm.pandas(desc="Lemma words")
# df_dev_fast['lemma_word'] = df_dev_fast['commentaire'].progress_apply(lambda x: getLemWord(x))

comments = df_dev_fast['commentaire'].tolist()
print('on commence')
with concurrent.futures.ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
    results = list(tqdm(executor.map(getLemWord, comments), total=len(comments)))


df_dev_fast['lemma_word'] = results

Lemma words: 100%|██████████| 100400/100400 [45:27<00:00, 36.82it/s]   


on commence


100%|██████████| 100400/100400 [1:29:55<00:00, 18.61it/s]   


Serialize df into file (pour save)

In [10]:
def serializeDf(df, path):
    with open(path, 'wb') as f:
        pickle.dump(df, f)
        
serializeDf(df_dev_fast, 'data/df_dev_fast.pkl')
#serializeDf(df_idf_dev_fast, 'data/df_idf_dev_fast.pkl')

In [11]:
def removeCommaInDF():
    for index, row in df_dev_fast.iterrows():
        if row['lemma_word'].count(',') > 0:
            row['lemma_word'].remove(',')

removeCommaInDF()

In [12]:
def listToString(s):  
    str1 = " " 
    return (str1.join(s))

lemma_word_list = df_dev_fast['lemma_word'].tolist()

with concurrent.futures.ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
    results = list(tqdm(executor.map(listToString, lemma_word_list), total=len(lemma_word_list)))

df_dev_fast['lemma_word_string'] = results

100%|██████████| 100400/100400 [00:00<00:00, 255291.00it/s]


In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

#instantiate CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in your docs 

word_count_vector=cv.fit_transform(df_dev_fast['lemma_word_string'])
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)
# print idf values 
df_idf_dev_fast = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_lemma_weights"]) 
# sort ascending 
df_idf_dev_fast.sort_values(by=['idf_lemma_weights'])

Unnamed: 0,idf_lemma_weights
le,1.199701
film,1.283584
un,1.390312
ce,1.658640
être,1.796108
...,...
majs,11.823780
majuscules,11.823780
makabé,11.823780
majorem,11.823780


In [14]:
def serializeDf(df, path):
    with open(path, 'wb') as f:
        pickle.dump(df, f)
        
serializeDf(df_dev_fast, 'data/df_dev_fast.pkl')
serializeDf(df_idf_dev_fast, 'data/df_idf_dev_fast.pkl')

In [15]:
from multiprocessing import Pool
from tqdm.contrib.concurrent import process_map

def lemmaWordLow(lemma_word):
    if lemma_word is None:
        return []
    clean_words = []
    for token in lemma_word:
        if token in df_idf_dev_fast.index and df_idf_dev_fast.loc[token]['idf_lemma_weights'] < 10:
            clean_words.append(token)
    return clean_words

def lemmaWordSuperLow(lemma_word):
    if lemma_word is None:
        return []
    clean_words = []
    for token in lemma_word:
        if token in df_idf_dev_fast.index and df_idf_dev_fast.loc[token]['idf_lemma_weights'] < 9:
            clean_words.append(token)
    return clean_words

def removeUselessWords():
    df_dev_fast['lemma_word_low'] = df_dev_fast['lemma_word'].apply(lambda x: lemmaWordLow(x))

def removeSuperUselessWords():
    tqdm.pandas(desc="Deletings useless words")
    df_dev_fast['lemma_word_super_low'] = df_dev_fast['lemma_word'].progress_apply(lambda x: lemmaWordSuperLow(x))

removeUselessWords()
# removeSuperUselessWords()

In [None]:
def serializeDf(df, path):
    with open(path, 'wb') as f:
        pickle.dump(df, f)
        
serializeDf(df_dev_fast, 'data/df_dev_fast.pkl')
serializeDf(df_idf_dev_fast, 'data/df_idf_dev_fast.pkl')