In [83]:
import pickle
from tqdm import tqdm

In [84]:
# Tokens that are discarded before the vocabulary is built: French function
# words, elided forms (with both straight and typographic apostrophes),
# punctuation, whitespace runs and single-letter fragments.
# NOTE(review): "laquelles" / "lequels" look like typos for "lesquelles" /
# "lesquels" -- left untouched here, confirm against the lemmatiser output.
stop_words = [" ", "l'", "l’", "la", "le", "les", "d’", "d'", "de", "du", "des", "une", "un",
                "ce", "ces", "je", "moi", "mon", "me", "mes", "tu", "toi", "ton", "te", "tes",
                "il", "lui", "son", "se", "ses", "nous", "notre", "nos", "vous", "votre", "vos",
                "ils", "leur", "leurs", "n'", "n’", "ne", "tout", "être", "avoir", "deja", "déjà",
                "ou", "où", "qu’", "qu'", "que", "qui", "quelle", "quel", "quelles", "quels",
                ".", ",", ";", "'", "sur", "telle", "tel", "telles", "tels", "laquelle", "lequel",
                "laquelles", "lequels", "simplement", "comment", "quoi", "dont", "donc", "tant",
                "jamais", "rarement", "parfois", "souvent", "toujours", "avec", "pour", "ici",
                ":", "(", ")", "[", "]", "\"", "y", "et", "par", "fois", "peu", "on", "cela",
                "mais", "dans", "en", "à", "au", "même", "là", "-", "si", "comme", "aussi",
                # A comma was missing between "s'" and "l": the two adjacent literals were
                # silently concatenated into "s'l", so "s'" was never filtered out.
                "car", "parce", "quand", "c’", "s’", "s'", "l", "d", "..", "...", "....", ".....",
                "\xa0", "  ", "   ", "    ", "     ", "      ", "       ", "        ", "…", "…",
                "j’", "-là", "-t", "a", "m’", "ca", "c", "l", "n", "s", "j", "x", "*", "–", "/",
                "celui", "celui-ci", "ci", "quell"
                ]

In [85]:
# Load the token lists for each split (presumably lemmatised, judging by the
# file names). Later cells iterate each opinion as a list of tokens.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by a trusted step of this project.
with open("train_lemma.pickle", "rb") as infile:
    opinions_train = pickle.load(infile)

with open("dev_lemma.pickle", "rb") as infile:
    opinions_dev = pickle.load(infile)

with open("test_lemma.pickle", "rb") as infile:
    opinions_test = pickle.load(infile)

In [86]:
# Train and dev opinions together form the corpus used to pick the vocabulary
# (test is deliberately left out). Assuming both are plain lists, `+` builds a
# new outer list but reuses the same inner token lists, so the in-place
# filtering done later on `opinions` also changes opinions_train / opinions_dev.
opinions = opinions_train + opinions_dev

In [87]:
def get_tf(opinions):
    """Count how often every token occurs across all opinions.

    Args:
        opinions: iterable of opinions, each one an iterable of hashable tokens.

    Returns:
        dict mapping each token to its total number of occurrences, with keys
        in order of first appearance.
    """
    counts = {}
    for tokens in opinions:
        for token in tokens:
            # Missing tokens start from zero, then every sighting adds one.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [88]:
def remove_rare_words(tf, threshold=10):
    """Keep only the terms that occur strictly more than `threshold` times.

    Args:
        tf: dict mapping term -> occurrence count.
        threshold: exclusive lower bound on the count; a term is kept only when
            its count is greater than this value. Defaults to 10, the cutoff
            that used to be hard-coded.

    Returns:
        A new dict with the surviving term -> count pairs, in the same order
        as in `tf`. The input dict is not modified.
    """
    return {term: freq for term, freq in tf.items() if freq > threshold}

In [89]:
def remove_irrelevant_words(opinions, common):
    """Strip stop words and uncommon terms from every opinion, in place.

    A token is kept only if it is not listed in the module-level `stop_words`
    and it is a key of `common`.

    NOTE: a later cell defines another function with this same name; once that
    cell has run, it shadows this definition.

    Args:
        opinions: iterable of token lists; each inner list is modified in place.
        common: container (here a dict) of the terms allowed to remain.

    Returns:
        The same `opinions` object that was passed in.
    """
    # A set makes each stop-word test O(1) instead of scanning the whole list.
    ignored = set(stop_words)
    for opinion in tqdm(opinions):
        # Slice assignment rewrites the existing list object, so every other
        # reference to it sees the filtered tokens. This yields the same result
        # as calling list.remove once per unwanted occurrence, without the
        # quadratic cost of repeated removals.
        opinion[:] = [word for word in opinion if word not in ignored and word in common]
    return opinions

In [90]:
# First pass over train+dev: count every token, keep the terms that pass
# remove_rare_words' frequency cutoff, then drop stop words and the remaining
# rare terms from each opinion. The filtering edits the token lists in place
# and returns the same outer list, so `opinions` is rebound to itself.
tf = get_tf(opinions)
common = remove_rare_words(tf)
opinions = remove_irrelevant_words(opinions, common)

100%|██████████| 766362/766362 [01:27<00:00, 8803.51it/s] 


In [91]:
# Second pass: recount the tokens that survived the first filter, then keep the
# 1024 most frequent terms (ties keep their first-seen order because the sort
# is stable). 1024 matches the vector length and model input size used below.
tf = get_tf(opinions)
relevant = dict(sorted(tf.items(), key=lambda pair: pair[1], reverse=True)[:1024])

In [92]:
def remove_irrelevant_words(opinions, relevant):
    """Drop every token that is not in `relevant`, modifying opinions in place.

    This definition replaces the earlier function of the same name, which also
    consulted `stop_words`; from this cell on only membership in `relevant`
    decides whether a token stays.

    Args:
        opinions: iterable of token lists; each inner list is modified in place.
        relevant: container (here a dict) of the terms allowed to remain.

    Returns:
        The same `opinions` object that was passed in.
    """
    for opinion in tqdm(opinions):
        # Slice assignment keeps the same list object (other references see the
        # change) and filters in a single pass. This avoids the quadratic cost
        # of calling list.remove once per unwanted token.
        opinion[:] = [word for word in opinion if word in relevant]
    return opinions

In [93]:
# Restrict every split to the retained vocabulary. remove_irrelevant_words
# edits the token lists in place and returns its argument, so each assignment
# rebinds the name to the same (now filtered) object.
opinions_train = remove_irrelevant_words(opinions_train, relevant)
opinions_dev = remove_irrelevant_words(opinions_dev, relevant)
opinions_test = remove_irrelevant_words(opinions_test, relevant)

100%|██████████| 665962/665962 [00:05<00:00, 115871.24it/s]
100%|██████████| 100400/100400 [00:00<00:00, 116621.12it/s]
100%|██████████| 85847/85847 [00:02<00:00, 29533.59it/s]


In [94]:
def create_dict_of_words(opinions):
    """Give every distinct token a consecutive integer id.

    Ids start at 0 and are handed out in order of first appearance while
    walking the opinions from first to last.

    Args:
        opinions: iterable of opinions, each one an iterable of hashable tokens.

    Returns:
        dict mapping token -> integer id.
    """
    vocabulary = {}
    for tokens in opinions:
        for token in tokens:
            # The next free id always equals the current vocabulary size;
            # setdefault leaves already-known tokens untouched.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [95]:
# Build the token -> column index mapping from the train+dev corpus.
# `opinions` was built by concatenating the train and dev lists, so (assuming
# plain lists) it shares their inner token lists and already reflects the
# filtering applied to those splits above.
word_to_id = create_dict_of_words(opinions)
# Sanity check: the vectors and the model below are sized for 1024 entries.
print(len(word_to_id))

1024


In [96]:
def comment_to_vec(opinions):
    """Encode each opinion as a 1024-slot bag-of-words count vector.

    Slot positions come from the module-level `word_to_id` mapping; a token
    missing from that mapping raises KeyError.

    Args:
        opinions: iterable of opinions, each one an iterable of tokens.

    Returns:
        list with one 1024-element list of integer counts per opinion.
    """
    encoded = []
    for tokens in tqdm(opinions):
        counts = [0 for _ in range(1024)]
        # Translate tokens to their column index, then tally occurrences.
        for slot in (word_to_id[token] for token in tokens):
            counts[slot] += 1
        encoded.append(counts)
    return encoded

In [97]:
# Turn every split into a list of fixed-length count vectors.
vectors_train = comment_to_vec(opinions_train)
vectors_dev = comment_to_vec(opinions_dev)
vectors_test = comment_to_vec(opinions_test)

100%|██████████| 665962/665962 [00:22<00:00, 29398.30it/s]
100%|██████████| 100400/100400 [00:01<00:00, 98642.53it/s]
100%|██████████| 85847/85847 [00:00<00:00, 121198.35it/s]


In [98]:
# Load the comment objects for each split. Later cells call .values() on the
# train/dev containers and use get_note() / get_review_id() on their elements.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open("train.pickle", "rb") as infile:
    comments_train = pickle.load(infile)

with open("dev.pickle", "rb") as infile:
    comments_dev = pickle.load(infile)

with open("test.pickle", "rb") as infile:
    comments_test = pickle.load(infile)

In [99]:
def get_notes(comments):
    """Convert every comment's rating into an integer class index.

    Each rating string (decimal comma allowed, e.g. "3,5") is parsed as a
    float and mapped with (rating - 0.5) * 2, truncated to int, so 0.5 -> 0,
    1.0 -> 1, ..., 5.0 -> 9.

    Args:
        comments: mapping whose values expose get_note() returning the rating
            as a string.

    Returns:
        list of int class indices, in the mapping's iteration order.
    """
    return [
        int((float(comment.get_note().replace(",", ".")) - 0.5) * 2)
        for comment in comments.values()
    ]

In [101]:
# Integer class labels for the two labelled splits (no labels are derived for
# the test split).
notes_train = get_notes(comments_train)
notes_dev = get_notes(comments_dev)

In [128]:
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.utils import to_categorical

import numpy as np

In [110]:
# Convert the lists of count vectors into 2-D arrays for Keras.
# The notebook imports NumPy as `np`; the bare name `numpy` is never bound, so
# the previous `numpy.array(...)` calls failed with NameError on a fresh kernel.
data_train = np.array(vectors_train)
data_dev = np.array(vectors_dev)
data_test = np.array(vectors_test)

In [120]:
# Feed-forward classifier: 1024 bag-of-words inputs -> 512 -> 256 -> 10-way
# softmax, trained with categorical cross-entropy and Adam.
model_10 = Sequential()
model_10.add(Dense(512, input_dim=1024, activation='relu'))
model_10.add(Dense(256, activation='relu'))
model_10.add(Dense(10, activation='softmax'))
model_10.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

# One-hot encode the integer class labels into 10 columns to match the softmax
# output (assumes every label lies in 0..9 -- verify against the rating scale).
result_train = to_categorical(notes_train, 10)
result_dev = to_categorical(notes_dev, 10)

In [121]:
# Train for 2 epochs, using the dev split as validation data.
model_10.fit(data_train, result_train, epochs=2, verbose=1, validation_data=(data_dev, result_dev))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x29348165670>

In [122]:
# Model outputs for the test vectors; read_prediction takes the argmax of each
# row to recover the predicted class.
prediction = model_10.predict(data_test)



In [133]:
def read_prediction(prediction):
    """Write one "<review id> <rating>" line per prediction to leaderboard.txt.

    The winning class index k of each row is turned back into a rating with
    (k + 1) / 2 and written with a decimal comma (e.g. "3,5").

    The review id is looked up as comments_test[position] in the module-level
    `comments_test`.
    NOTE(review): other cells treat the comment containers as mappings
    (.values()), so this presumably relies on integer keys 0..n-1 that line up
    with the prediction order -- confirm.

    Args:
        prediction: iterable of per-class score rows, one per test comment.
    """
    with open("leaderboard.txt", "w") as score:
        for position, scores in enumerate(prediction):
            rating = float((np.argmax(scores) + 1) / 2)
            rating_text = str(rating).replace(".", ",")
            review_id = comments_test[position].get_review_id()
            score.write(f"{review_id} {rating_text}\n")

In [134]:
# Write the test-set predictions to leaderboard.txt.
read_prediction(prediction)