In [1]:
import pandas as pd
import os
import glob
from unidecode import unidecode
from datetime import datetime

In [2]:
STOPWORDS = pd.read_csv('stopwords.csv')
STOPWORDS = list(STOPWORDS.Mots)
STOPWORDS_AJOUT = ['script','offset','wikipédia','sujets','sujet','géographie','drapeau','monde','langue','superficie','Europe','Asie','Afrique','Amérique','Océanie','articles','wikicode','modifier','cookies', 'pays','publier','publié','publie','publiés','page','journal','connecter', 'presse', 'médias', 'article','janvier','février','mars','avril','mai','juin','juillet','août','septembre','octobre','novembre','décembre']
STOPWORDS.extend(STOPWORDS_AJOUT)
STOPWORDS_SS_ACCENTS = [unidecode(x) for x in STOPWORDS]
STOPWORDS_SS_ACCENTS = list(set(STOPWORDS_SS_ACCENTS))

SELECTWORDS = pd.read_csv('Lexique382.csv', encoding='cp1252', sep=';')
SELECTWORDS = list(SELECTWORDS.mot)
SELECTWORDS_SS_ACCENTS = [unidecode(str(x)) for x in SELECTWORDS]
SELECTWORDS_SS_ACCENTS = [x for x in SELECTWORDS_SS_ACCENTS if x not in STOPWORDS_SS_ACCENTS]
SELECTWORDS_SS_ACCENTS_SET = set(SELECTWORDS_SS_ACCENTS)

# Constitution d'une base si besoin de grouper des fichiers

In [24]:
def get_merged_csv(f_list, **kwargs):
    return pd.concat([pd.read_csv(f, **kwargs) for f in f_list], ignore_index=True)

base = get_merged_csv(glob.glob('base_résultats_1-2005_12-2005*.csv'), index_col=None)
base2 = get_merged_csv(glob.glob('lignes_en_cours*.csv'), index_col=None)

for df in [base, base2]:
    df.columns = ['Index','Date_debut','Date_fin','Pays', 'Rang','Liste_URL', 'Contenu_page']

In [30]:
base_complete = pd.concat([base,base2], axis = 0)
base_complete.columns = ['Index','Date_debut','Date_fin','Pays', 'Rang','Liste_URL', 'Contenu_page']

In [34]:
base_complete.shape

(7553, 7)

In [35]:
base_complete.Date_debut.unique()

array(['2005-01-01'], dtype=object)

In [36]:
base_complete.to_csv('base_résultats_1-2005_12-2005.csv')

# appel de la base sinon

In [79]:
base_complete = pd.read_csv('base_résultats_1-2005_12-2005.csv')

In [37]:
base_complete.drop_duplicates()
#base_complete[(base_complete.apply(lambda x: any(pd.isna(x)), 1))]
base_complete.dropna()

Unnamed: 0,Index,Date_debut,Date_fin,Pays,Rang,Liste_URL,Contenu_page
0,0,2005-01-01,2005-12-31,Afrique du Sud,0,http://perspective.usherbrooke.ca/bilan/servle...,btlac a btlac a visited btlac a link btlac a a...
1,1,2005-01-01,2005-12-31,Afrique du Sud,1,https://www.cairn.info/revue-politique-africai...,please enable js and disable any ad blockervar...
2,2,2005-01-01,2005-12-31,Afrique du Sud,2,http://www.afriquedusud.fr/,cdata jquery function slider ...
3,3,2005-01-01,2005-12-31,Afrique du Sud,3,https://mappemonde-archive.mgm.fr/num6/article...,sommaire du numero ndeg les p...
4,4,2005-01-01,2005-12-31,Afrique du Sud,4,https://www.lemonde.fr/afrique/article/2005/05...,consulterle journal navigation le mon...
...,...,...,...,...,...,...,...
7548,1867,2005-01-01,2005-12-31,Zimbabwe,23,https://apps.who.int/medicinedocs/fr/d/Js6173f...,portail d information medicaments essentiels...
7549,1868,2005-01-01,2005-12-31,Zimbabwe,24,http://www.astrium.com/fiches-pays/botswana-bw...,fiches pays fiches pays...
7550,1869,2005-01-01,2005-12-31,Zimbabwe,25,https://www.marbreriedesyvelines.com/granit/gr...,materiaux granit ...
7551,1870,2005-01-01,2005-12-31,Zimbabwe,26,http://www.bruneaux.net/html/zimbabwe.htm,zimbabwe parcours traversee de...


## Fonctions de traitement du texte
- suppression des espaces en trop
- tokenisation
- sélection des mots et suppression des sites inexploitables (- de 30 mots retenus)
- lemmatisation

In [38]:
import re
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer


def supprespace(x):
    if type(x) is str:
        return re.sub(' ', ' ',(x.strip()))
    return x

def tokenize(s):
    return(supprespace(str(s)).split(" "))


def keep_goodwords(l):
    goodwords = [x for x in list(set(l).intersection(SELECTWORDS_SS_ACCENTS_SET)) if (len(x) > 3)]
    if len(goodwords)>30:
        return goodwords
    else:
        return "poubelle"
    

def lemmatize(l):
    lemmatizer = FrenchLefffLemmatizer()
    return ' '.join(list(map(lambda s: lemmatizer.lemmatize(str(s)), l)))

def bag_of_words (texte):
    bow = tokenize(texte)
    bow = keep_goodwords(bow)
    return bow

## Lemmatisation et export de la base en 2 étapes

In [39]:
avant = datetime.now()
base_complete['Mots_retenus'] = base_complete.Contenu_page.apply(bag_of_words)
base_complete.to_csv('base_avec_mots_retenus'+base_complete.Date_debut[0][:4]+'.csv')
apres = datetime.now()
print(apres-avant)

0:00:49.197463


In [40]:
avant = datetime.now()
base_complete['Mots_lemm'] = base_complete.Mots_retenus.apply(lemmatize)
base_complete.to_csv('base_lemmatisee'+base_complete.Date_debut[0][:4]+'.csv')
apres = datetime.now()
print(apres-avant)

13:32:26.744766


In [41]:
base_complete.Mots_lemm.head()

0    drogue manne province creees inegalites austra...
1                                      p o u b e l l e
2    exquis raison evenements majeure centaine brac...
3    relie dorsale province definitivement organise...
4    appareil cautionner trilogie petit entreprise ...
Name: Mots_lemm, dtype: object

In [43]:
base_complete.shape

(7553, 9)