In [26]:
import numpy as np
import pandas as pd
import re
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import FrenchStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [31]:
def get_dataset(data_path = "../data/", train = True):
    """
    Read the DEFT tweets and their labels into a DataFrame.

    The reference file ``{data_path}references/{train|test}.txt`` must hold one
    tab-separated ``id<TAB>label`` pair per line (blank lines are ignored).
    The text of each tweet is read from ``{data_path}tweets/{train|test}/{id}.txt``.
    Ids whose tweet file is missing or unreadable are dropped (best effort).

    data_path: (str) root data directory, including the trailing separator
    train: (bool) read the "train" split if True, the "test" split otherwise
    :return: DataFrame indexed by tweet id, with columns "label" and "tweet"
    """
    _set = "train" if train else "test"
    with open(f"{data_path}references/{_set}.txt", encoding = "utf8") as f:
        # Strip only the line terminator: slicing off the last character
        # unconditionally would truncate the final label when the file does
        # not end with a newline.
        labels = dict(line.rstrip('\n').split('\t') for line in f if line.strip())

    # Build a fresh mapping instead of mutating `labels` while walking it.
    records = {}
    for _id, label in labels.items():
        try:
            with open(f"{data_path}tweets/{_set}/{_id}.txt",
                      encoding = "utf8") as f:
                tweet = f.read()
        except (OSError, UnicodeDecodeError):
            # Missing or undecodable tweet file: skip this id, but let
            # unrelated errors (e.g. KeyboardInterrupt) propagate.
            continue
        # Drop a single trailing newline if, and only if, there is one.
        if tweet.endswith('\n'):
            tweet = tweet[:-1]
        records[_id] = {"label": label, "tweet": tweet}
    return pd.DataFrame.from_dict(records, orient = 'index')

In [3]:
# Load both DEFT splits, then hold out a stratified 20% of the training tweets
# as a validation set.
test = get_dataset(train = False)
train = get_dataset()
train, val = train_test_split(
    train,
    test_size = .2,
    stratify = train["label"],
    random_state = 42069,  # fixed seed so the split is reproducible
)

In [27]:
# Module-level French NLP resources read by clean_tweet: the Snowball stemmer
# and the spaCy pipeline "fr_core_news_sm" (used for lemmatisation).
stemmer = FrenchStemmer()
nlp = spacy.load("fr_core_news_sm")

In [35]:
def clean_tweet(text, **kwargs):
    """
    Clean tweet content. All **kwargs are bool; True enables the step.
    text: (str) tweet to be cleaned
    **kwargs:
        lower (default True): lower-case the text
        url (default True): remove http(s) URLs
        hashtag (default False): remove "#..." tokens entirely
        user (default False): remove "@..." tokens entirely
        symbol (default True): replace every character that is not a digit,
            an ASCII letter, a character in the range À-ÿ, a space, "+" or
            "_" with a space
        digit (default False): replace runs of digits with a space
        lemma (default True): tokenize with the module-level spaCy pipeline
            `nlp` and keep the lemmas; otherwise tokenize with NLTK's
            word_tokenize
        stopwords (default True): drop NLTK French stop words
        stem (default True): stem every token with the module-level `stemmer`
    :return: (str) cleaned content, tokens joined by single spaces
    """
    if kwargs.get('lower', True):
        text = text.lower()
    if kwargs.get('url', True):
        text = re.sub(r'http[s]?://\S+', '', text)
    if kwargs.get('hashtag', False):
        text = re.sub(r'#\S+', '', text)
    if kwargs.get('user', False):
        text = re.sub(r'@\S+', '', text)
    if kwargs.get('symbol', True):
        # "A-Za-z" rather than "A-z": the latter also spans [ \ ] ^ _ ` and
        # silently let those symbols through.
        text = re.sub(r'[^0-9A-Za-zÀ-ÿ +_]', ' ', text)
    if kwargs.get('digit', False):
        text = re.sub(r'\d+', ' ', text)

    # Collapse runs of spaces left by the substitutions and trim the ends so
    # stray leading/trailing whitespace does not reach the tokenizers.
    text = re.sub(' +', ' ', text).strip()

    if kwargs.get('lemma', True):
        tokens = (token.lemma_ for token in nlp(text))
    else:
        tokens = word_tokenize(text, language = 'french')
    if kwargs.get('stopwords', True):
        # Load the stop-word list once per call instead of once per token.
        french_stopwords = frozenset(stopwords.words('french'))
        tokens = (token for token in tokens if token not in french_stopwords)
    if kwargs.get('stem', True):
        tokens = map(stemmer.stem, tokens)
    return ' '.join(tokens)

In [55]:
# Work on copies so the train/val/test frames themselves are not modified by
# the cleaning applied to the *_basic frames.
train_basic, val_basic, test_basic = (split.copy() for split in (train, val, test))

In [56]:
# "Basic" cleaning: lemmatisation, stemming, lower-casing and stop-word
# filtering are switched off; every other clean_tweet option keeps its default.
basic_options = dict(lemma = False, stem = False, lower = False, stopwords = False)
for frame in (train_basic, val_basic, test_basic):
    frame["tweet"] = frame["tweet"].apply(clean_tweet, **basic_options)

In [57]:
# Preview the cleaned training split (the rendered output is truncated to the
# first and last rows).
train_basic

Unnamed: 0,label,tweet
520701162891137024,INFORMATION,Affichageenvironnemental sur les produits de c...
506710752598573057,INFORMATION,Dossier npaconseil M paiement m wallet un écos...
488740999301443585,OPINION,Un bel article qui montre que la culture est u...
520498601299300352,INFORMATION,Quelques exemples d actions d agendas 21 locau...
520220253251698688,OPINION,_PhMartin_ clamajakri L écologie fiscale punit...
...,...,...
489334928124153856,OPINION,Agriculture Des éleveurs s engagent pour la bi...
507328152616841216,INFORMATION,Développement durable et biotech 3ème rencontr...
487618646412230656,OPINION,Impressionnante petite élise d éoliennes Wow d...
519002774084677632,OPINION,Présidentielle au Brésil Rousseff et Neves au ...


In [60]:
# Persist the cleaned test split as a pickle file.
# NOTE(review): only test_basic is written in the visible cells — confirm that
# train_basic and val_basic are saved elsewhere.
test_basic.to_pickle("../data/datasets/test_basic.pkl")