In [1]:
import pandas as pd
import numpy as np
import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw FP2 export (path is relative to the kernel's working directory).
raw_csv_path = 'Desktop/test/FP2.csv'
FP = pd.read_csv(raw_csv_path)
FP.shape  # displayed as (rows, columns)

(19075, 28)

In [3]:
# Turn whitespace-only cells into NaN so they count as missing values.
FP = FP.replace(r'^\s+$', np.nan, regex=True)
# FP.isnull().sum()  # number of NaN per column in the data set

# Drop rows without a 'name', renumber the index, and keep only the columns of interest.
columns_of_interest = ['rcn', 'name', 'role', 'country', 'title', 'objective', 'startDate', 'endDate']
FP = FP.dropna(subset=['name']).reset_index(drop=True)[columns_of_interest]

In [4]:
# Derive 'namekey' from 'name' in one pass:
#   lowercase -> NFKD-decompose and drop non-ASCII bytes (removes accents) -> trim outer whitespace.
normalized_names = (
    FP['name']
    .apply(str.lower)
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.strip()
)
FP['namekey'] = normalized_names

In [5]:
import re #Remove numbers
# Keep only runs of word characters, re-joined with single spaces (this drops punctuation).
FP.loc[:, "namekey"] = FP.namekey.apply(lambda x: " ".join(re.findall(r'\w+', x)))
# Remove digits. `regex=True` is passed explicitly: pandas >= 2.0 defaults to regex=False,
# in which case '\d+' would be searched as a literal string and no digit would be removed.
FP.loc[:, "namekey"] = FP.namekey.str.replace(r'\d+', '', regex=True)

In [6]:
import nltk #tokenize namekey
from nltk.tokenize import word_tokenize
# Split every normalized name into a list of word tokens.
FP['namekey'] = FP['namekey'].apply(nltk.word_tokenize)

In [7]:
from nltk.stem.snowball import SnowballStemmer #Stemming is the process of reducing words to their stem
# English Snowball stemmer (open question from the author: use other languages too?).
stemmer = SnowballStemmer(language="english")
# Replace each token list by the list of its stems.
FP['namekey'] = FP['namekey'].apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [8]:
# French Snowball stemmer.
stemmer1 = SnowballStemmer(language="french")
# Stem every token with the French stemmer. The previous version created `stemmer1`
# but called the English `stemmer` again, so the French rules were never applied.
FP['namekey'] = FP['namekey'].apply(lambda x: [stemmer1.stem(y) for y in x])

In [9]:
# German Snowball stemmer.
stemmer2 = SnowballStemmer(language="german")
# Stem every token with the German stemmer. The previous version created `stemmer2`
# but called the English `stemmer` again, so the German rules were never applied.
FP['namekey'] = FP['namekey'].apply(lambda x: [stemmer2.stem(y) for y in x])

In [10]:
# Italian Snowball stemmer.
stemmer3 = SnowballStemmer(language="italian")
# Stem every token with the Italian stemmer. The previous version created `stemmer3`
# but called the English `stemmer` again, so the Italian rules were never applied.
FP['namekey'] = FP['namekey'].apply(lambda x: [stemmer3.stem(y) for y in x])

In [11]:
nltk.download('stopwords')  # fetch the stop-word corpus used below
from nltk.corpus import stopwords

# Languages whose stop words are pooled; the order matches the order of concatenation.
stopword_languages = (
    'english',
    'french',
    'german',
    'italian',
    'greek',
    'danish',
    'dutch',
    'finnish',
    'portuguese',
    'spanish',
)
# One flat list holding the stop words of every language above.
stopwords_list = [
    stop_word
    for language in stopword_languages
    for stop_word in stopwords.words(language)
]
def remove_stopWords(s):
    '''Return `s` with every whitespace-separated word found in `stopwords_list` removed.

    The remaining words are re-joined with single spaces; the result is an empty
    string when every word is a stop word.
    '''
    kept_words = []
    for candidate in s.split():
        if candidate in stopwords_list:
            continue
        kept_words.append(candidate)
    return ' '.join(kept_words)
# Remove stop words from each token list. `remove_stopWords` returns '' for a token that
# is a stop word, so empty results are filtered out instead of being kept as blank tokens
# (blank tokens would otherwise survive into the final key as stray spaces).
FP.loc[:, "namekey"] = FP.namekey.apply(
    lambda tokens: [kept for kept in map(remove_stopWords, tokens) if kept]
)

[nltk_data] Downloading package stopwords to /Users/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Lemmatizing: download the NLTK resources requested by this step.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# Two WordNet lemmatizer instances under the names the rest of the notebook may use.
lemmatizer = WordNetLemmatizer()
wordnet_lemmatizer = WordNetLemmatizer()


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a POS tag to the matching WordNet POS constant, based on its first letter.

    Tags starting with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag returns None.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag from raising; it simply maps to None.
    return first_letter_to_pos.get(nltk_tag[:1])

def lemmatize_sentence(sentence):
    """Tokenize `sentence`, POS-tag it, and return the lemmatized tokens joined by spaces.

    Tokens whose tag has no WordNet equivalent are kept unchanged.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wordnet_pos is not None:
            # A usable WordNet tag exists: lemmatize with it.
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            # No usable tag: keep the token as is.
            lemmas.append(token)
    return " ".join(lemmas)

# Lemmatize every token of every row (each token is passed to lemmatize_sentence on its own).
lemmatized_tokens = FP['namekey'].apply(lambda tokens: list(map(lemmatize_sentence, tokens)))
FP.loc[:, "namekey"] = lemmatized_tokens

[nltk_data] Downloading package punkt to /Users/leo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/leo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
from nltk.tokenize.treebank import TreebankWordDetokenizer #detokenise "namekey" column
# Turn each token list back into a single string, using one shared detokenizer instance.
detokenizer = TreebankWordDetokenizer()
FP['namekey'] = FP['namekey'].apply(detokenizer.detokenize)

In [14]:
# Last check: find rows whose key is unusable after pre-processing. At this point the
# column holds detokenized strings, so a name that lost all of its tokens presumably shows
# up as an empty string rather than NaN; both cases are treated as missing.
missing_key = FP['namekey'].isna() | FP['namekey'].astype(str).str.strip().eq('')
print(missing_key.sum())  # number of rows falling back to the original name
# For those rows, fall back to the corresponding original 'name'.
FP.loc[missing_key, 'namekey'] = FP.loc[missing_key, 'name']

0


In [21]:
def sort_pandas(l):
    """Sort the list `l` in place (ascending) and return that same list.

    Used to put the words of a key into a canonical order.
    """
    # Slice assignment keeps the caller's list object and only replaces its contents.
    l[:] = sorted(l)
    return l
# Put the words of each key in alphabetical order so that word order no longer matters.
# `split()` without an argument is used instead of `split(" ")`: it collapses runs of
# whitespace and never yields empty strings, which would otherwise sort first and leave
# leading blanks in the key.
FP.loc[:, "namekey"] = FP.namekey.apply(lambda x: " ".join(sort_pandas(x.split())))

In [None]:
# Write the pre-processed frame to CSV, overwriting any existing file and omitting the index.
output_csv_path = "Desktop/test/FP_pre_txt_process.csv"
FP.to_csv(output_csv_path, mode='w', index=False)