In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
np.set_printoptions(precision=2, linewidth=80)
from nltk import FreqDist

# Gensim
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
import re
#from bs4 import BeautifulSoup
import unicodedata

from spacy.lang.fr.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import fr_core_news_md #import spacy french stemmer

In [2]:
# Tokenization
def tokenize_text(corpus):
    """Tokenize every document of ``corpus`` with ``word_tokenize``.

    Args:
        corpus: iterable of document strings.

    Returns:
        list: one token list per document, in corpus order.
    """
    return [word_tokenize(document) for document in corpus]

# Stop-word removal
def remove_stopwords(corpus):
    """Drop French stop words and very short tokens from every document.

    Each document is split on whitespace. A token is kept when it is not in
    spaCy's French ``STOP_WORDS`` and is longer than two characters; the
    surviving tokens are joined back with single spaces.

    The membership test is case-sensitive, so documents are expected to be
    lowercased beforehand.

    Args:
        corpus: iterable of document strings.

    Returns:
        list[str]: one filtered string per input document, in the same order.
    """
    # Build the lookup once (it does not change between documents) and as a
    # set so that each membership test is O(1) instead of a list scan.
    french_sw = frozenset(STOP_WORDS)

    filtered_corpus = []
    for document in corpus:
        kept_tokens = [
            token for token in document.split()
            if token not in french_sw and len(token) > 2
        ]
        # Rebuild the document text, tokens separated by a single space.
        filtered_corpus.append(' '.join(kept_tokens))
    return filtered_corpus

def make_bigrams(texts):
    """Build the list of bigrams of every document.

    Each document is tokenized with ``word_tokenize`` and consecutive token
    pairs are produced with ``ngrams(..., n=2)``.

    Args:
        texts: iterable of document strings.

    Returns:
        list: one list of bigrams per document, in input order.
    """
    return [list(ngrams(word_tokenize(document), n=2)) for document in texts]

def unique_bigrams(texts):
    """Return the distinct bigrams of every document.

    Duplicates are removed while keeping the order of first appearance, so
    the result is identical from one run to the next (converting through a
    ``set`` yields an order that depends on string hash randomisation).

    Args:
        texts: iterable of document strings, forwarded to ``make_bigrams``.

    Returns:
        list: one list of distinct bigrams per document, in input order.
    """
    # dict.fromkeys de-duplicates hashable items and preserves insertion order.
    return [list(dict.fromkeys(bigrams)) for bigrams in make_bigrams(texts)]

def count_unique_bigrams(texts):
    """Count the distinct bigrams of every document.

    Args:
        texts: iterable of document strings, forwarded to ``unique_bigrams``.

    Returns:
        list[int]: number of distinct bigrams per document, in input order.
    """
    return [len(distinct_bigrams) for distinct_bigrams in unique_bigrams(texts)]

def tf_function(text,word):
    """Term frequency of ``word`` in ``text``.

    Args:
        text: document string, tokenized with ``word_tokenize``.
        word: token whose occurrences are counted (exact match).

    Returns:
        float: occurrences of ``word`` divided by the number of tokens in
        ``text``; ``0.0`` when the text yields no token.
    """
    tokens = word_tokenize(text)  # tokenize once, reused for both counts
    if not tokens:
        # An empty document would otherwise raise ZeroDivisionError.
        return 0.0
    return tokens.count(word) / len(tokens)

def idf_function(texts,word):
    """Smoothed inverse document frequency of ``word`` over ``texts``.

    Computes ``ln((N + 1) / (df + 1)) + 1`` where ``N`` is the number of
    documents and ``df`` the number of documents whose tokens contain
    ``word``.

    Args:
        texts: sized collection of document strings.
        word: token to look up (exact match against ``word_tokenize`` output).

    Returns:
        The IDF value as computed by ``np.log`` (a NumPy float).
    """
    document_frequency = sum(1 for text in texts if word in word_tokenize(text))
    return np.log((len(texts) + 1) / (document_frequency + 1)) + 1

def tf_idf_function(texts,text,word):
    """TF-IDF weight of ``word`` in ``text`` relative to the corpus ``texts``.

    Returns the product of ``tf_function(text, word)`` and
    ``idf_function(texts, word)``.
    """
    term_frequency = tf_function(text, word)
    inverse_document_frequency = idf_function(texts, word)
    return term_frequency * inverse_document_frequency

# Strip accents: reduce text to its ASCII-only form
def convert_accents(text):
    """Return ``text`` as ASCII ``bytes`` with accents removed.

    The string is NFKD-normalized and then encoded to ASCII with
    ``errors='ignore'``, so every character that has no ASCII form after
    normalization is dropped. The result is ``bytes``; callers that need a
    ``str`` must decode it.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore')

# Lowercase the documents and strip their accents
def lower_text(corpus):
    """Lowercase every document and remove its accents.

    Each item is converted with ``str()``, lowercased, passed through
    ``convert_accents`` and decoded back to ``str``.

    Args:
        corpus: iterable of documents (any object accepted by ``str()``).

    Returns:
        list[str]: the normalized documents, in input order.
    """
    return [
        convert_accents(str(document).lower()).decode("utf-8")
        for document in corpus
    ]

# Remove special characters
def remove_characters(corpus,keep_apostrophes=True):
    """Strip URLs and unwanted characters from every document.

    Every document is stripped of surrounding whitespace and of anything
    starting with ``http``/``https`` up to the next whitespace.

    Args:
        corpus: iterable of document strings.
        keep_apostrophes: when True (default), the elided articles ``l'`` and
            ``d'`` (any case, straight or typographic apostrophe) are removed
            at the start of a word, then the characters
            ``? | $ & * - ! % @ ( ) ~`` and digits are removed. When False,
            everything except ASCII letters and spaces is removed.

    Returns:
        list[str]: the cleaned documents, in input order.
    """
    url_pattern = re.compile(r'(https|http)\S*\s?')
    # \b anchors the article at a word start so that words such as
    # "aujourd'hui" are left intact; \u2019 is the typographic apostrophe.
    elision_pattern = re.compile(r"\b[ld]['\u2019]", re.IGNORECASE)
    # Inside a character class "|" is a literal and an unescaped "-" between
    # two characters is a range, so the hyphen has to be escaped to be removed.
    symbol_pattern = re.compile(r'[?|$&*\-!%@()~\d]')
    non_letter_pattern = re.compile(r'[^a-zA-Z ]')

    filtered_corpus = []
    for doc in corpus:
        doc = url_pattern.sub('', doc.strip())
        if keep_apostrophes:
            doc = elision_pattern.sub('', doc)
            doc = symbol_pattern.sub('', doc)
        else:
            doc = non_letter_pattern.sub('', doc)
        filtered_corpus.append(doc)
    return filtered_corpus

# Collect the words that appear in many documents
def get_mostCommonWords(corpus,max_freq=100):
    """Return the words found in more than ``max_freq`` documents.

    A word is counted at most once per document (documents are split on
    whitespace and de-duplicated before counting).

    Args:
        corpus: iterable of document strings.
        max_freq: document-count threshold; only words whose count is
            strictly greater are returned.

    Returns:
        list[str]: the words above the threshold.
    """
    document_counts = {}  # word -> number of documents containing it
    for document in corpus:
        for term in set(document.split()):
            document_counts[term] = document_counts.get(term, 0) + 1

    return [term for term, count in document_counts.items() if count > max_freq]

# Lemmatization
def lemm_tokens(corpus):
    """Lemmatize every document, keeping only selected parts of speech.

    The ``fr_core_news_md`` spaCy model is loaded once per call. For each
    document, the lemmas of the tokens tagged NOUN, ADJ, VERB or ADV are
    joined with single spaces.

    Args:
        corpus: iterable of document strings.

    Returns:
        list[str]: one lemmatized string per document, in input order.
    """
    nlp = fr_core_news_md.load()
    allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    return [
        ' '.join(
            token.lemma_ for token in nlp(document)
            if token.pos_ in allowed_postags
        )
        for document in corpus
    ]
# Drop empty or very short documents
def remove_shortDocument(corpus,min_length=3):
    """Keep only the documents with more than ``min_length`` words.

    Words are obtained by splitting each document on whitespace.

    Args:
        corpus: iterable of document strings.
        min_length: a document is kept when its word count is strictly
            greater than this value.

    Returns:
        tuple: ``(kept_documents, kept_indices)`` where ``kept_indices`` holds
        the positions of the kept documents in the original corpus.
    """
    kept = [
        (position, text)
        for position, text in enumerate(corpus)
        if len(text.split()) > min_length
    ]
    kept_documents = [text for _, text in kept]
    kept_indices = [position for position, _ in kept]
    return kept_documents, kept_indices

In [3]:
def preprocessing(corpus):
    """Run the full cleaning pipeline on ``corpus``.

    Steps, applied in this order: ``remove_characters``, ``lower_text``,
    ``remove_stopwords``, ``lemm_tokens``. Each step receives the output of
    the previous one.

    Returns:
        The output of the last step (one cleaned string per document).
    """
    pipeline = (remove_characters, lower_text, remove_stopwords, lemm_tokens)
    for step in pipeline:
        corpus = step(corpus)
    return corpus

In [4]:
# Load the articles; "data.csv" is resolved relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the CSV
# carries a saved index -- confirm whether it should be read with index_col=0.
dataset = pd.read_csv("data.csv",sep=",")

In [5]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,text,date,link
0,La Confédération générale des entreprises du M...,2022-06-09,https://www.challenge.ma/maroc-emirats-arabes-...
1,"Selon Bank Al-Maghrib, l’encours du crédit ban...",2022-06-03,https://www.challenge.ma/banques-les-prets-acc...
2,"Dans le cadre des Rencontres du Livre Blanc, l...",2022-06-03,https://www.challenge.ma/la-cgem-met-le-cap-su...
3,Pour la première fois depuis la création des I...,2022-06-02,https://www.challenge.ma/le-maroc-pays-africai...
4,La Fédération Nationale des Promoteurs Immobil...,2022-06-11,https://lematin.ma/express/2022/2eme-edition-s...


In [6]:
# Collect the "text" column as a plain Python list: one document per row.
corpus = dataset.text.values.tolist()
# Report how many documents were loaded.
print("Taille du corpus = "+str(len(corpus))+" Documents")

Taille du corpus = 24 Documents


In [7]:
# Clean the raw documents with the preprocessing pipeline.
# NOTE(review): this rebinds `corpus`, so re-running the cell feeds the already-cleaned
# text back through the pipeline; re-run the cell that builds `corpus` first.
corpus = preprocessing(corpus)

## TF-IDF

In [15]:
# Build TF-IDF features over the preprocessed corpus, restricted to a fixed vocabulary.
corpus_lemmatized=tokenize_text(corpus) 
id2word = corpora.Dictionary(corpus_lemmatized)
vocabulaire=id2word.token2id #get vocabulary dict where keys are terms and values are indices in the feature matrix

# NOTE(review): the vocabulary is built from single word_tokenize tokens, so
# ngram_range=(1,2) presumably adds no bigram columns; scikit-learn also documents
# min_df / max_df as ignored when a vocabulary is supplied -- confirm this is intended.
tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0,sublinear_tf=True,lowercase=True,ngram_range=(1,2),vocabulary=vocabulaire)
tfidf_train_features = tfidf.fit_transform(corpus)

In [9]:
# Column labels of the TF-IDF matrix. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the installed version
# provides it and fall back to the old accessor otherwise.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

In [19]:
# Column labels of the TF-IDF matrix. get_feature_names() was removed in
# scikit-learn 1.2, so use get_feature_names_out() when the installed version
# provides it and fall back to the old accessor otherwise.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
# Densify the sparse matrix and label its columns with the vocabulary terms
# (one row per document).
tfidf_train_features_array = tfidf_train_features.toarray()
tf_idf_frame = pd.DataFrame(tfidf_train_features_array,columns=feature_names)

In [20]:
# Bigrams of every preprocessed document (one list of token pairs per document).
l_bigrams = make_bigrams(corpus)    