Challenge Data

In [1]:
import pandas as pd, nltk, string, spacy
from nltk.corpus import stopwords

DIRECTORY = "challenge_dataset/"

X_train = pd.read_csv(DIRECTORY + "X_train.csv", sep=';').drop(columns=['Id'])
X_train

Unnamed: 0,Caption
0,de mourir avant l'heure\n de ne plus revoir me...
1,la maladie pour les autres et pour moi\n et le...
2,Comment vont s'en sortir ceux qui sont mal ou ...
3,"Inquiétude pour la santé de mes proches, pour ..."
4,"Bien entendu contracter la maladie,"
...,...
480,avoir une forme grave du civid
481,Inquiétude de ne pas retrouver une liberté d'a...
482,L'incertitude dans laquelle nous sommes. \n Ne...
483,"inquiétude normale face à une épidémie, craint..."


In [2]:
y_train = pd.read_csv(DIRECTORY + "y_train.csv", sep=';').drop(columns=['Id'])
y_train

Unnamed: 0,category_1,category_2,category_3,category_4
0,1,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,1,0,0,0
...,...,...,...,...
480,1,0,0,0
481,0,0,0,1
482,0,1,0,1
483,0,1,0,0


In [3]:
#https://maelfabien.github.io/machinelearning/NLPfr/#1-tokenisation

#python -m nltk.downloader all
nltk.download("stopwords")

def return_token(sentence):
    #print(sentence)
    doc=nltk.word_tokenize(sentence, language='french')
    return [X for X in doc]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ganae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
for line in X_train['Caption']:
    print(return_token(line))

['de', 'mourir', 'avant', "l'heure", 'de', 'ne', 'plus', 'revoir', 'mes', 'petits', 'enfants']
['la', 'maladie', 'pour', 'les', 'autres', 'et', 'pour', 'moi', 'et', 'les', 'conséquences', 'sur', 'la', 'jeune', 'génération', 'qui', 'vit', 'une', 'situation', 'tout', 'à', 'fait', 'insolite', ':', "l'arrêt", 'de', 'toute', 'vie', 'sociale', 'et', 'active']
['Comment', 'vont', "s'en", 'sortir', 'ceux', 'qui', 'sont', 'mal', 'ou', 'pas', 'du', 'tout', 'logées', '?']
['Inquiétude', 'pour', 'la', 'santé', 'de', 'mes', 'proches', ',', 'pour', 'les', 'plus', 'fragiles', '(', 'femmes', 'battues', ',', 'enfants', 'en', 'danger', ',', 'prisonniers', ',', 'sdf', ')', ',', 'pour', 'la', 'crise', 'économique', 'avérée', 'et', 'une', 'possible', 'crise', 'sociale', 'par', 'la', 'suite', '.']
['Bien', 'entendu', 'contracter', 'la', 'maladie', ',']
['La', 'multiplication', 'des', 'pandémies', 'pour', "l'avenir"]
['de', 'nombreuses', 'reprises', 'de', 'la', 'pandémie', 'au', 'cours', 'des', 'prochaines',

In [7]:
def treat_data(input_data):
    
    allStopwords = ["alors", "au", "aucuns", "aussi", "autre", "avant", "avec", "avoir", "bon", "car", "ce", "cela", "ces", "ceux", "chaque", "ci", "comme", "comment", "dans", "des", "du", "dedans", "dehors", "depuis", "devrait", "doit", "donc", "dos", "début", "elle", "elles", "en", "encore", "essai", "est", "et", "eu", "fait", "faites", "fois", "font", "hors", "ici", "il", "ils", "je", "juste", "la", "le", "les", "leur", "là", "ma", "maintenant", "mais", "mes", "mien", "moins", "mon", "mot", "même", "ni", "nommés", "notre", "nous", "ou", "où", "par", "parce", "pas", "peut", "peu", "plupart", "pour", "pourquoi", "quand", "que", "quel", "quelle", "quelles", "quels", "qui", "sa", "sans", "ses", "seulement", "si", "sien", "son", "sont", "sous", "soyez"    , "sujet", "sur", "ta", "tandis", "tellement", "tels", "tes", "ton", "tous", "tout", "trop", "très", "tu", "voient", "vont", "votre", "vous", "vu", "ça", "étaient", "état", "étions", "été", "être", "l'on"]
    for x in stopwords.words("french"):
        allStopwords.append(x)

    #remove duplicates and add punctuation
    allStopwords.extend(string.punctuation)
    allStopwords.append("...")
    allStopwords = list(dict.fromkeys(allStopwords))
    
    #tokenize captions
    captions = [return_token(line) for line in input_data['Caption']]
    
    #clean captions
    captions_cleaned = []
    for message in captions:
        captions_cleaned.append([word.lower() for word in message if word not in allStopwords])
    
    #lemmatized captions
    #DOWNLOAD MODEL with conda install -c conda-forge spacy-model-fr_core_news_md
    #                 or python -m spacy download fr_core_news_md
    nlp = spacy.load('fr_core_news_sm')
    
    captions_cleaned_stemmed_and_lemmatized_by_spacy = []

    for sentence in captions_cleaned:
        #nlp on sentences (create sentence back from list of words separated by " ")
        captions_cleaned_stemmed_and_lemmatized_by_spacy.append(nlp(' '.join(sentence)))
    
    captions_temp = []
    POS_captions = []
    for sentence in captions_cleaned_stemmed_and_lemmatized_by_spacy:
        captions_temp.append([token.lemma_ for token in sentence])
        POS_captions.append([token.pos_ for token in sentence])
    
    captions_cleaned_stemmed_and_lemmatized_by_spacy=captions_temp
    
    lemmes = pd.Series(captions_cleaned_stemmed_and_lemmatized_by_spacy, name='lemmes', dtype='object')
    pos = pd.Series(POS_captions, name='pos', dtype='object')
    
    return lemmes, pos

In [8]:
X_train_clean = treat_data(X_train)
X_train_clean

([['mourir', 'le', 'heure', 'plus', 'revoir', 'petit', 'enfant'],
  ['maladie',
   'autre',
   'conséquence',
   'jeune',
   'génération',
   'voir',
   'situation',
   'insolite',
   'le',
   'arrêt',
   'tout',
   'vie',
   'social',
   'actif'],
  ['comment', 'se', 'en', 'sortir', 'mal', 'loger'],
  ['inquiétude',
   'santé',
   'proche',
   'plus',
   'fragile',
   'femme',
   'battre',
   'enfant',
   'danger',
   'prisonnier',
   'sdf',
   'crise',
   'économique',
   'avérer',
   'possible',
   'crise',
   'social',
   'suite'],
  ['bien', 'entendre', 'contracter', 'maladie'],
  ['le', 'multiplication', 'pandémie', 'le', 'avenir'],
  ['nombreux',
   'reprise',
   'pandémie',
   'cours',
   'prochain',
   'année',
   'fermeture',
   'pays',
   'crise',
   'économique',
   'touche',
   'énorme',
   'partie',
   'population',
   'mondial'],
  ['effondrement', 'economir'],
  ['je',
   'craindre',
   'modification',
   'nature',
   'relation',
   'entrer',
   'personne',
   'vers',
 