In [425]:
import pandas as pd
import numpy as np
import re
import string
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.lang.pt.stop_words import STOP_WORDS

In [426]:
nlp = spacy.load('de_core_news_md')

In [427]:
df = pd.read_csv('data/reviews-clean.csv', sep=';')

In [428]:
REGX_USERNAME = r"@[A-Za-z0-9$-_@.&+]+"
REGX_URL = r"https?://[A-Za-z0-9./]+"
def preprocessing(text):
    text = text.lower()

    text = re.sub(REGX_USERNAME, ' ', text)
    text = re.sub(REGX_URL, ' ', text)

    emojis = {
        ':)': 'positive emotionen',
        ':(': 'negative emotionen'
    }

    for e in emojis:
        text = text.replace(e, emojis[e])

    tokens = [token.text for token in nlp(text)]

    tokens = [t for t in tokens if
              t not in STOP_WORDS and
              t not in string.punctuation and
              len(t) > 3]

    tokens = [t for t in tokens if not t.isdigit()]

    return " ".join(tokens)
df["caption_clean"] = df["caption"].apply(preprocessing)
df[['caption', 'caption_clean', 'rating']].head()

Unnamed: 0,caption,caption_clean,rating
0,Die mit Abstand leckerste Pizza in Deutschland...,abstand leckerste pizza deutschland selbst wen...,5.0
1,Good place to dine in! Sadly no chicken pizzas...,good place dine sadly chicken pizzas available...,4.0
2,This is the must try pizza in Frankfurt.,this must pizza frankfurt,5.0
3,5/5 Pizza place in Frankfurt,pizza place frankfurt,5.0
4,Leckere Pizzen und coole Atmosphäre. 👍🏼 (,leckere pizzen coole atmosphäre,5.0


In [429]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [430]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Add the labels to the text classifier
textcat_multilabel.add_label('1')
textcat_multilabel.add_label('2')
textcat_multilabel.add_label('3')
textcat_multilabel.add_label('4')
textcat_multilabel.add_label('5')

1

In [431]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption_clean'][:split]
    y_train = df['rating'][:split]
    X_test = df['caption_clean'][split:]
    y_test = df['rating'][split:]

    return (X_train, y_train), (X_test, y_test)

In [432]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_train]))
test_data = list(zip(X_test, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_test]))

In [433]:
train_data[:2]

[('abstand leckerste pizza deutschland selbst wenn halb abends mitnehmen bestellt positive emotionen einfach niederknien',
  {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}),
 ('good place dine sadly chicken pizzas available keep mind waiting time when visit',
  {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})]

### Training

In [None]:
nlp = spacy.blank("de")

In [434]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(100): # Iterations
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer, losses=losses)

#### Genauigkeit

In [435]:
# Evaluate the model
correct = 0
total = 0

for index, row in enumerate(test_data):
    text = row[0]
    rating = row[1]
    rating = list(rating['cats'].keys())[list(rating['cats'].values()).index(True)]

    doc = nlp(text)
    scores = textcat_multilabel.predict([doc])
    if round(doc.cats[rating]):
        correct += 1
    total += 1
accuracy = correct / total
print(f"Accuracy: {round(accuracy, 2)}%")

Accuracy: 0.55%


In [436]:
textcat_multilabel.predict([nlp('')])

array([[0., 0., 0., 0., 0.]])