In [2]:
#importation des bibiotheque necessaire
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
np.set_printoptions(precision=2, linewidth=80)
from nltk import FreqDist

# Gensim
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel

import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams
import re
#from bs4 import BeautifulSoup
import unicodedata

from spacy.lang.fr.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import fr_core_news_md #import spacy french stemmer

In [20]:
#Tokenization
def tokenize_text(corpus):
    tokensCorpus=[]
    for doc in corpus:
        doc_tokens = word_tokenize(doc)
        tokensCorpus.append(doc_tokens)
    return tokensCorpus

# removing stopwords
def remove_stopwords(corpus):
    filtered_corpus=[]
    for tokens in corpus:
        #french_sw = stopwords.words('french') 
        french_sw=list(STOP_WORDS) #get french stopwords
        filtered_tokens = [token for token in tokens.split() if token not in french_sw and len(token)>2]
        filtred_text=' '.join(filtered_tokens) #reforme le text du documents separé par espace
        filtered_corpus.append(filtred_text)
    return filtered_corpus

def make_bigrams(texts):
    list_bigrams=[]
    for doc in texts :
        # Tokenize
        tokens = word_tokenize(doc)
        # bigrams 
        bigrams = [w for w in ngrams(tokens,n=2)]
        list_bigrams.append(bigrams)
    return list_bigrams

def unique_bigrams(texts):
    list_bigrams = make_bigrams(texts)
    list_unique_bigrams=[]
    for list_bigram in list_bigrams :
        list_unique_bigrams.append(list(set(list_bigram)))
    return list_unique_bigrams

def count_unique_bigrams(texts):
    list_unique_bigrams = unique_bigrams(texts)
    list_count_unique_bigrams = []
    for list_unique_bigram in list_unique_bigrams :
        list_count_unique_bigrams.append(len(list_unique_bigram))
    return list_count_unique_bigrams

def tf_function(bigrams,bigram):
    n=len(bigrams) #nombre de bigrams uniques dans le texte
    m= bigrams.count(bigram) # nombre d'occurence du bigram en parametre dans la liste de bigrams
    return  m/(n+1)

def idf_function(whole_bigram,bigram):
    list_idf = []
    s = 0
    for bigrams in whole_bigram:  
        if bigram in bigrams:
            s += 1
    return np.log((len(whole_bigram)/(s+1))+1)

def tf_idf_function(whole_bigram,bigrams,bigram):
    return tf_function(bigrams,bigram) * idf_function(whole_bigram,bigram)

#output French accents correctly
def convert_accents(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

#convertisse les documents en minuscule
def lower_text(corpus):
    LowerCorpus=[]
    for doc in corpus:
        lowerDoc=str(doc).lower() #convertissent le texte en minuscules
        lowerDoc=convert_accents(lowerDoc).decode("utf-8") #supprimes les accents
        LowerCorpus.append(lowerDoc)
    return LowerCorpus

#supprimes caracteres speciaux
def remove_characters(corpus,keep_apostrophes=True):
    filtered_corpus=[]
    for doc in corpus:
        doc = doc.strip()
        if keep_apostrophes:
            doc =re.sub('(https|http)\S*\s?', '',doc) #supprimes les urls
            doc =re.sub("l\'","",doc)
            doc =re.sub("d\'","",doc)
            PATTERN = r'[?|$|&|*|-|!|%|@|(|)|~|\d]'
            filtered_doc = re.sub(PATTERN, r'', doc)
            filtered_corpus.append(filtered_doc)
        else:
            PATTERN = r'[^a-zA-Z ]'
            #supprimes les urls
            doc =re.sub('(https|http)\S*\s?', '',doc) #supprimes les urls
            filtered_doc = re.sub(PATTERN, r'', doc)
        
            filtered_corpus.append(filtered_doc)
    return filtered_corpus

#recuperer les mots qui apparaissent dans plusieurs documents
def get_mostCommonWords(corpus,max_freq=100):
    vocabulaire=dict() #dictionnaire qui va contenir le nombre d'occurence des mots dans les documents
    for doc in corpus:
        for word in set(doc.split()): #recupere les mots unique de chaque documents
            if word in vocabulaire:
                vocabulaire[word]+=1
            else:
                vocabulaire[word]=1
    
    #recupere les dont le nombre d'occurences dans les documents > max_freq
    mostCommonsWord=[word for word,value in vocabulaire.items() if value>max_freq ]
        
    return mostCommonsWord

#lemmatisation
def lemm_tokens(corpus):
    
    nlp = fr_core_news_md.load() #initialisation du model "fr_core_news_md" de spacy
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
    corpus_lemms=[]
    
    for document in corpus:
        doc = nlp(document)
        lemms=[token.lemma_ for token in doc if token.pos_ in allowed_postags] #recupere les lemms des tokens
        text=' '.join(lemms) #reforme le text du documents separé par espace
        corpus_lemms.append(text)
            
    return corpus_lemms
#fonction qui supprimes les documents vides ou tres courte
def remove_shortDocument(corpus,min_length=3):
    filtred_corpus=[]
    idx_doc=[]
    for idx,doc in enumerate(corpus):
        
        if len(doc.split())>min_length:
            filtred_corpus.append(doc)
            idx_doc.append(idx)
        
    
    return filtred_corpus,idx_doc

In [5]:
def preprocessing(corpus):
    
    corpus=remove_characters(corpus)
    corpus=lower_text(corpus)
    corpus=remove_stopwords(corpus)
    corpus=lemm_tokens(corpus)
    
    return corpus

In [6]:
df1 = pd.read_csv(r"..\Scraping\Raw_data\1.csv", sep=",")
df2 = pd.read_csv(r"..\Scraping\Raw_data\2.csv", sep=",")
df3 = pd.read_csv(r"..\Scraping\Raw_data\3.csv", sep=",")
dataset = pd.concat([df1, df2, df3])
dataset.head()

Unnamed: 0,text,date,link
0,La Confédération générale des entreprises du M...,2022-06-09,https://www.challenge.ma/maroc-emirats-arabes-...
1,"Selon Bank Al-Maghrib, l’encours du crédit ban...",2022-06-03,https://www.challenge.ma/banques-les-prets-acc...
2,"Dans le cadre des Rencontres du Livre Blanc, l...",2022-06-03,https://www.challenge.ma/la-cgem-met-le-cap-su...
3,Pour la première fois depuis la création des I...,2022-06-02,https://www.challenge.ma/le-maroc-pays-africai...
4,Les levées des capitaux se sont établies à plu...,2022-06-01,https://www.challenge.ma/marche-des-capitaux-l...


In [7]:
dataset.head()

Unnamed: 0,text,date,link
0,La Confédération générale des entreprises du M...,2022-06-09,https://www.challenge.ma/maroc-emirats-arabes-...
1,"Selon Bank Al-Maghrib, l’encours du crédit ban...",2022-06-03,https://www.challenge.ma/banques-les-prets-acc...
2,"Dans le cadre des Rencontres du Livre Blanc, l...",2022-06-03,https://www.challenge.ma/la-cgem-met-le-cap-su...
3,Pour la première fois depuis la création des I...,2022-06-02,https://www.challenge.ma/le-maroc-pays-africai...
4,Les levées des capitaux se sont établies à plu...,2022-06-01,https://www.challenge.ma/marche-des-capitaux-l...


In [8]:
corpus = dataset.text.values.tolist()
print("Taille du corpus = "+str(len(corpus))+" Documents")

Taille du corpus = 74 Documents


In [9]:
corpus = preprocessing(corpus)

## TF-IDF

In [17]:
# build TFIDF features on train reviews with a specifique vocabulary
corpus_lemmatized=tokenize_text(corpus) 
id2word = corpora.Dictionary(corpus_lemmatized)
vocabulaire=id2word.token2id #get vocabulary dict where keys are terms and values are indices in the feature matrix

tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0,sublinear_tf=True,lowercase=True,ngram_range=(1,2),vocabulary=vocabulaire)
tfidf_train_features = tfidf.fit_transform(corpus)

In [18]:
feature_names = tfidf.get_feature_names()

In [19]:
feature_names = tfidf.get_feature_names()
tfidf_train_features_array = tfidf_train_features.toarray()
tf_idf_frame = pd.DataFrame(tfidf_train_features_array,columns=feature_names)

In [21]:
tf_idf_frame.head()

Unnamed: 0,-,abdelmajid,abdullah,actuel,affaire,affirmer,ajoutee,ajouter,albare,algerienn,...,migrate,outsourcia,reagissant,relais,sp,spe,sujette,suspensif,trajectoire,usuel
0,0.0,0.068419,0.115843,0.039628,0.052164,0.054862,0.046165,0.041909,0.068419,0.068419,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## bigrams processing

In [10]:
list_bigrams = make_bigrams(corpus)

In [14]:
l=[]
for bigrams in list_bigrams :
    l.append(len(bigrams))
len(l)

74

In [37]:
def data_frame_bigrams(list_bigrams):
    """Create DataFrame with list of bigrams to be used in association rule learning with R"""
    
    # Create a list of column names
    columns_bigrams_df = []
    for bigrams in list_bigrams :
        for bigram in bigrams :
            if bigram not in columns_bigrams_df :
                columns_bigrams_df.append(bigram)
    
    # Create dataframe with bigrams
    data_frame_bigrams = []
    for bigrams in list_bigrams :
        l_bigrams=[0 for i in range(len(columns_bigrams_df))]
        for bigram in bigrams :
            l_bigrams[columns_bigrams_df.index(bigram)]=1
        data_frame_bigrams.append(l_bigrams)
    
    df = pd.DataFrame(data_frame_bigrams,columns=columns_bigrams_df)

    return df

def  data_frame_tf_idf_bigrams(list_bigrams):
    """Create TF-IDF DataFrame with list of bigrams that contains the columns TF and IDF of each bigram"""
    
    # Create a list of column names
    
    unique_bigrams_columns = []
    intermediate = []
    list_tf_idf_bigrams = []
    for bigrams in list_bigrams :
        for bigram in bigrams :
            if bigram not in unique_bigrams_columns :
                unique_bigrams_columns.extend(bigram)
                intermediate.extend([bigram,bigram])
                list_tf_idf_bigrams.extend(["TF","IDF"])

    # list of column names ==> 2 indexes : bigram and TF/IDF
    columns_bigrams_df = [intermediate,list_tf_idf_bigrams] 

    index=["C1","C2","C3","C4","C5","C6","C7","C8","C9","C10","C11","C12","C13","C14",
    "C15","C16","C17","C18","C19","C20","C21","C22","C23","C24","C25","C26","C27","C28","C29","C30","C31","C32",
    "C33","C34","C35","C36","C37","C38","C39","C40","C41","C42","C43","C44","C45","C46","C47","C48","C49","C50",
    "C51","C52","C53","C54","C55","C56","C57","C58","C59","C60","C61","C62","C63","C64","C65","C66","C67","C68","C69",
    "C70","C71","C72","C73","C74"]
    
    # Create tf-idf dataframe with bigrams
    data_frame_bigrams = []

    for bigrams in list_bigrams :
        l_bigrams=[]
        for bigram in unique_bigrams_columns :
            l_bigrams.extend([tf_function(bigrams,bigram),idf_function(list_bigrams,bigram)])
        data_frame_bigrams.append(l_bigrams)
    

    df = pd.DataFrame(data_frame_bigrams,columns=columns_bigrams_df,index=index)

    return df    

In [22]:
df_bigrams = data_frame_bigrams(list_bigrams)

In [39]:
tf_idf_bigrams = data_frame_tf_idf_bigrams(list_bigrams)

In [36]:
l1 = ['a', 'a', 'b', 'b']
l2 = ["TF", "IDF", "TF", "IDF"]
l=[l1,l2]
pd.DataFrame(np.random.rand(2, 4),index=["data1" , "data2"],columns=l)

Unnamed: 0_level_0,a,a,b,b
Unnamed: 0_level_1,TF,IDF,TF,IDF
data1,0.180393,0.343482,0.441857,0.616847
data2,0.381456,0.79481,0.965859,0.682141
