In [437]:
import pandas as pd
import numpy as np
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [438]:
nlp = spacy.load('de_core_news_md')

In [447]:
df = pd.read_csv('data/reviews-clean.csv', sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,caption,rating,timestamp,caption_clean
0,0,Die mit Abstand leckerste Pizza in Deutschland...,5.0,2023-02-15 22:10:07.146586,abstand leckerste pizza deutschland selbst wen...
1,4,Good place to dine in! Sadly no chicken pizzas...,4.0,2023-02-14 00:10:07.146703,good place dine sadly chicken pizzas available...
2,5,This is the must try pizza in Frankfurt.,5.0,2023-02-14 00:10:07.146705,this must pizza frankfurt
3,6,5/5 Pizza place in Frankfurt,5.0,2023-02-14 00:10:07.146707,pizza place frankfurt
4,7,Leckere Pizzen und coole Atmosphäre. 👍🏼 (,5.0,2023-02-13 00:10:07.146710,leckere pizzen coole atmosphäre


In [440]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [441]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Add the labels to the text classifier
textcat_multilabel.add_label('1')
textcat_multilabel.add_label('2')
textcat_multilabel.add_label('3')
textcat_multilabel.add_label('4')
textcat_multilabel.add_label('5')

1

In [442]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption_clean'][:split]
    y_train = df['rating'][:split]
    X_test = df['caption_clean'][split:]
    y_test = df['rating'][split:]

    return (X_train, y_train), (X_test, y_test)

In [443]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_train]))
test_data = list(zip(X_test, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_test]))

In [444]:
train_data[:2]

[('abstand leckerste pizza deutschland selbst wenn halb abends mitnehmen bestellt positive emotionen einfach niederknien',
  {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}),
 ('good place dine sadly chicken pizzas available keep mind waiting time when visit',
  {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})]

### Training

In [445]:
nlp = spacy.blank("de")

In [446]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(100): # Iterations
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer, losses=losses)

TypeError: object of type 'float' has no len()

#### Genauigkeit

In [None]:
# Evaluate the model
correct = 0
total = 0

for index, row in enumerate(test_data):
    text = row[0]
    rating = row[1]
    rating = list(rating['cats'].keys())[list(rating['cats'].values()).index(True)]

    doc = nlp(text)
    scores = textcat_multilabel.predict([doc])
    if round(doc.cats[rating]):
        correct += 1
    total += 1
accuracy = correct / total
print(f"Accuracy: {round(accuracy, 2)}%")

In [None]:
textcat_multilabel.predict([nlp('')])