In [1]:
import pickle
import spacy

from tqdm import tqdm

# Load the French spaCy pipeline 'fr_core_news_lg' once at module level;
# lemmatization() below calls this shared `nlp` object on every comment.
nlp = spacy.load('fr_core_news_lg')

In [2]:
# Tokens that remove_irrelevant_words() drops from every opinion: French
# function words (articles, pronouns, prepositions, conjunctions, frequent
# adverbs), bare punctuation marks and the single-space token.
# Both the straight (') and the typographic (’) apostrophe forms are listed.
# NOTE(review): "laquelles" / "lequels" look like misspellings of
# "lesquelles" / "lesquels" — confirm before changing them.
stop_words = [
    " ", "l'", "l’", "la", "le", "les", "d’", "d'",
    "de", "du", "des", "une", "un", "ce", "ces", "je",
    "moi", "mon", "me", "mes", "tu", "toi", "ton", "te",
    "tes", "il", "lui", "son", "se", "ses", "nous", "notre",
    "nos", "vous", "votre", "vos", "ils", "leur", "leurs", "n'",
    "ne", "tout", "être", "avoir", "deja", "déjà", "ou", "où",
    "qu’", "qu'", "que", "qui", "quelle", "quel", "quelles", "quels",
    ".", ",", "...", "sur", "telle", "tel", "telles", "tels",
    "laquelle", "lequel", "laquelles", "lequels", "simplement", "comment", "quoi", "dont",
    "donc", "tant", "jamais", "rarement", "parfois", "souvent", "toujours", "avec",
    "pour", "ici", ":", "(", ")", "[", "]", "\"",
    "y", "et", "par", "fois", "peu", "on", "cela", "mais",
    "dans", "en", "à", "au", "même", "là", "-", "si",
    "comme", "aussi", "car", "parce", "quand",
]

In [24]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    with open(path, "rb") as infile:
        return pickle.load(infile)


# Raw comment collections for the three splits.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
comments_train = _load_pickle("train.pickle")
comments_dev = _load_pickle("dev.pickle")
comments_test = _load_pickle("test.pickle")

In [59]:
def lemmatization(comments):
    """Lemmatize every comment with the module-level spaCy pipeline `nlp`.

    Args:
        comments: mapping whose values expose ``get_comment()``, which returns
            the raw text of one comment.

    Returns:
        A list with one entry per comment, in ``comments.values()`` order;
        each entry is the list of ``token.lemma_`` strings of that comment.
    """
    texts = (comment.get_comment() for comment in comments.values())
    # nlp.pipe processes the texts as a batched stream, which is the
    # recommended way to run spaCy over many documents and is considerably
    # faster than calling nlp(text) once per comment. The docs come back in
    # input order, so the result is aligned with comments.values().
    docs = nlp.pipe(texts)
    # The stream has no length, so the total is passed explicitly to keep the
    # progress bar and ETA.
    return [
        [token.lemma_ for token in doc]
        for doc in tqdm(docs, total=len(comments))
    ]
    

In [35]:
def _lemmatize_and_dump(comments, path):
    """Lemmatize `comments`, pickle the result to `path`, and return it."""
    lemmas = lemmatization(comments)
    with open(path, "wb") as outfile:
        pickle.dump(lemmas, outfile)
    return lemmas


# Each split is written to disk right after it is computed, so the
# lemmatized output can be reloaded later instead of being recomputed.
opinions_train = _lemmatize_and_dump(comments_train, "train_lemma.pickle")
opinions_dev = _lemmatize_and_dump(comments_dev, "dev_lemma.pickle")
opinions_test = _lemmatize_and_dump(comments_test, "test_lemma.pickle")

100%|██████████| 665962/665962 [2:02:34<00:00, 90.55it/s]   


In [101]:
# Reload the cached lemmatized splits from disk (train, dev, test order).
_lemma_splits = []
for _lemma_path in ("train_lemma.pickle", "dev_lemma.pickle", "test_lemma.pickle"):
    with open(_lemma_path, "rb") as infile:
        _lemma_splits.append(pickle.load(infile))

opinions_train, opinions_dev, opinions_test = _lemma_splits

In [102]:
def get_tf(opinions):
    """Count how many times each word occurs across all opinions.

    Args:
        opinions: iterable of opinions, each an iterable of words.

    Returns:
        dict mapping each word to its total number of occurrences, with keys
        in order of first appearance.
    """
    counts = {}
    for words in opinions:
        for term in words:
            # A missing key counts as zero, so first and later sightings
            # share one code path.
            counts[term] = counts.get(term, 0) + 1
    return counts

In [103]:
def remove_rare_words(tf, threshold=10):
    """Keep only the terms whose frequency is strictly above `threshold`.

    Args:
        tf: dict mapping term -> frequency (as produced by get_tf).
        threshold: terms with ``freq <= threshold`` are discarded. Defaults
            to 10, the cutoff that was previously hard-coded.

    Returns:
        A new dict with the surviving ``term -> freq`` pairs, in the same
        order as in `tf`. The input dict is not modified.
    """
    return {term: freq for term, freq in tf.items() if freq > threshold}

In [1]:
def remove_irrelevant_words(opinions, common):
    """Strip stop words and out-of-vocabulary words from every opinion, in place.

    Args:
        opinions: list of opinions, each a list of words. Every inner list is
            modified in place.
        common: container of the words to keep (membership is tested with
            ``in``); any word not in it is removed.

    Returns:
        The same `opinions` object that was passed in, after filtering.
    """
    # Hoisted out of the loop: set membership is O(1), whereas scanning the
    # module-level stop_words list is linear for every single token.
    stop_set = set(stop_words)
    for opinion in tqdm(opinions):
        # One filtering pass instead of repeated list.remove() calls, each of
        # which rescans the list. Slice assignment keeps the same list object,
        # so callers holding a reference still see the filtered content.
        opinion[:] = [
            word for word in opinion
            if word not in stop_set and word in common
        ]
    return opinions

In [105]:
# The vocabulary of "common" words is derived from the training split only
# and then applied unchanged to train, dev and test.
freq_train = get_tf(opinions_train)
common = remove_rare_words(freq_train)

opinions_train, opinions_dev, opinions_test = (
    remove_irrelevant_words(split_opinions, common)
    for split_opinions in (opinions_train, opinions_dev, opinions_test)
)

100%|██████████| 665962/665962 [00:58<00:00, 11395.21it/s]
100%|██████████| 100400/100400 [00:08<00:00, 11387.78it/s]


In [106]:
# Train followed by dev opinions, as a new list; used to build the word index.
opinions = [*opinions_train, *opinions_dev]

In [18]:
def create_dict_of_words(opinions):
    """Assign a distinct integer id to every distinct token.

    Ids start at 1 and are handed out in order of first appearance.

    Args:
        opinions: iterable of opinions, each an iterable of tokens.

    Returns:
        dict mapping token -> id.
    """
    vocabulary = {}
    for tokens in opinions:
        for tok in tokens:
            # For an unseen token the next id is the current size + 1;
            # setdefault leaves already-known tokens untouched.
            vocabulary.setdefault(tok, len(vocabulary) + 1)
    return vocabulary

In [19]:
# Global token -> integer id index built from `opinions`;
# comment_to_bow() reads it to turn tokens into feature ids.
word_to_id = create_dict_of_words(opinions)

49012


In [62]:
def comment_to_bow(comment, vocab=None):
    """Convert a tokenized comment into a sparse bag-of-words.

    Args:
        comment: iterable of tokens.
        vocab: optional mapping token -> id. When omitted, the module-level
            `word_to_id` is used, which matches the previous behavior.

    Returns:
        dict mapping id -> number of occurrences of the corresponding token
        in `comment`. Tokens missing from the vocabulary are ignored.
    """
    if vocab is None:
        # Fall back to the global index so existing one-argument calls work.
        vocab = word_to_id
    bow = {}
    for token in comment:
        if token in vocab:
            token_id = vocab[token]
            bow[token_id] = bow.get(token_id, 0) + 1
    return bow

In [32]:
# Bag-of-words for each split, pickled right after it is built.
comments_train_bow = [comment_to_bow(tokens) for tokens in tqdm(opinions_train)]
with open("train_bow.pickle", "wb") as outfile:
    pickle.dump(comments_train_bow, outfile)

comments_dev_bow = [comment_to_bow(tokens) for tokens in tqdm(opinions_dev)]
with open("dev_bow.pickle", "wb") as outfile:
    pickle.dump(comments_dev_bow, outfile)

comments_test_bow = [comment_to_bow(tokens) for tokens in tqdm(opinions_test)]
with open("test_bow.pickle", "wb") as outfile:
    pickle.dump(comments_test_bow, outfile)

100%|██████████| 665962/665962 [00:06<00:00, 100956.93it/s]
100%|██████████| 100400/100400 [00:00<00:00, 112875.56it/s]


In [37]:
def get_notes(comments):
    """Turn each comment's rating string into an integer class label.

    The rating returned by ``get_note()`` uses a decimal comma; it is parsed
    as a float and mapped with ``int((rating - 0.5) * 2)``, so "0,5" gives 0
    and "5,0" gives 9.

    Args:
        comments: mapping whose values expose ``get_note()`` returning the
            rating as a string.

    Returns:
        list of int labels, in ``comments.values()`` order.
    """
    def _to_label(raw_note):
        rating = float(raw_note.replace(",", "."))
        return int((rating - 0.5) * 2)

    return [_to_label(entry.get_note()) for entry in comments.values()]

In [38]:
# Integer labels for the two splits whose ratings are read here.
notes_train, notes_dev = map(get_notes, (comments_train, comments_dev))

In [55]:
def to_svm(comments, notes, corpus):
    """Write bag-of-words comments to ``<corpus>.svm``, one comment per line.

    Each line is the label followed by space-separated ``id:count`` pairs
    sorted by id (an svmlight/libsvm-style sparse line).

    Args:
        comments: sequence of dicts mapping feature id -> count.
        notes: sequence of labels; ``notes[i]`` is the label of
            ``comments[i]``.
        corpus: output file name without the ``.svm`` extension.

    Raises:
        IndexError: if `notes` is shorter than `comments`.
    """
    with open(f"{corpus}.svm", "w") as outfile:
        # tqdm wraps the sequence itself (not the enumerate object) so it can
        # read the length and display a real progress bar with an ETA.
        for i, comment in enumerate(tqdm(comments)):
            features = "".join(
                f" {key}:{value}" for key, value in sorted(comment.items())
            )
            outfile.write(str(notes[i]) + features + "\n")

In [56]:
# The test split is written with a constant placeholder label (7) for every
# comment.
_test_placeholder_notes = [7] * len(comments_test_bow)

to_svm(comments=comments_train_bow, notes=notes_train, corpus="train")
to_svm(comments=comments_dev_bow, notes=notes_dev, corpus="dev")
to_svm(comments=comments_test_bow, notes=_test_placeholder_notes, corpus="test")

665962it [00:13, 48287.74it/s]
100400it [00:02, 49602.86it/s]


In [75]:
def read_results():
    """Convert the predictions in ``out.txt`` into ``leaderboard.txt``.

    ``out.txt`` is read as one integer class label per line. Each label is
    mapped back to a rating with ``(label + 1) / 2`` (the inverse of the
    mapping in get_notes), formatted with a decimal comma, and written as
    ``<review_id> <rating>``.

    Prediction line i is paired with the i-th value of the module-level
    `comments_test`, the same ``.values()`` order in which the test split
    was lemmatized.

    Raises:
        IndexError: if ``out.txt`` has more lines than there are test comments.
        ValueError: if a line is not an integer.
    """
    # comments_test is a mapping; indexing it with a running counter only
    # works when its keys happen to be 0..n-1 in order. Materializing
    # .values() gives the positional order used everywhere else.
    reviews = list(comments_test.values())
    with open("out.txt", "r") as results, open("leaderboard.txt", "w") as predictions:
        for index, line in enumerate(results):
            note = str((int(line) + 1) / 2).replace(".", ",")
            predictions.write(f"{reviews[index].get_review_id()} {note}\n")

In [76]:
# Reads out.txt (which must already exist) and writes leaderboard.txt.
read_results()