In [575]:
import pandas as pd
import numpy as np
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [576]:
# Start from a blank German pipeline created by spacy.blank; the packaged
# 'de_core_news_md' pipeline is not loaded in this notebook.
LANGUAGE_CODE = "de"
nlp = spacy.blank(LANGUAGE_CODE)

In [577]:
# Load the semicolon-separated review export and preview the first rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier to_csv calls --
# confirm and drop them upstream if they are not needed.
REVIEWS_CSV = 'data/reviews-clean.csv'
df = pd.read_csv(REVIEWS_CSV, sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,caption,rating,timestamp
0,0,abstand leckerst pizza halb abends mitnehmen b...,5.0,2023-02-15 22:10:07.146586
1,4,chicken pizzas,4.0,2023-02-14 00:10:07.146703
2,7,leck pizz cool atmosphäre,5.0,2023-02-13 00:10:07.146710
3,11,extrem lecker empfehlung,5.0,2023-02-10 00:11:07.737920
4,13,gut pizza authentisch,5.0,2023-02-09 00:11:07.737926


In [578]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [579]:
# Run every caption through `nlp` and attach a one-hot rating annotation
# (keys "1".."5", True only for the review's own rating) to the resulting Doc.
# NOTE(review): `doc` is rebound on every iteration and never stored, so only
# the last Doc is reachable after the loop -- confirm whether this cell is
# still needed.
for text, rating in zip(df["caption"], df["rating"]):
    doc = nlp(text)
    doc.cats = {label: rating == int(label) for label in ("1", "2", "3", "4", "5")}

In [580]:
# Append a "textcat_multilabel" component to the end of the pipeline, using
# the default multi-label model config and a 0.5 decision threshold.
# NOTE(review): the annotations built elsewhere in this notebook set exactly
# one of "1".."5" to True per review, i.e. the classes look mutually
# exclusive. spaCy's "textcat" component targets exclusive classes -- confirm
# that the multi-label variant is intentional.
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Register one label per star rating ("1" .. "5") with the classifier.
# The value displayed for this cell is the return value of the final
# add_label call.
textcat_multilabel.add_label('1')
textcat_multilabel.add_label('2')
textcat_multilabel.add_label('3')
textcat_multilabel.add_label('4')
textcat_multilabel.add_label('5')

1

In [581]:
def load_data(split=0.8, data=None):
    """Split the reviews into a leading training part and a trailing test part.

    Rows are taken in their existing order (no shuffling): the first
    ``int(len(data) * split)`` rows become the training set and the
    remaining rows the test set.

    Args:
        split: Fraction of rows assigned to the training set, in [0, 1].
        data: DataFrame providing 'caption' and 'rating' columns. When
            omitted, the notebook-level ``df`` is used, which keeps the
            original zero-argument call working.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` where the X values are the
        'caption' Series slices and the y values the 'rating' Series slices.

    Raises:
        ValueError: If ``split`` is outside the closed interval [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError(f"split must be between 0 and 1, got {split!r}")

    frame = df if data is None else data
    cut = int(len(frame) * split)

    captions = frame['caption']
    ratings = frame['rating']

    # .iloc states the positional intent explicitly, so the split does not
    # depend on how plain [] slicing interprets the frame's index labels.
    X_train, y_train = captions.iloc[:cut], ratings.iloc[:cut]
    X_test, y_test = captions.iloc[cut:], ratings.iloc[cut:]

    return (X_train, y_train), (X_test, y_test)

In [582]:
# Split the reviews, then convert each partition into (text, annotations)
# pairs for building spaCy training examples.
(X_train, y_train), (X_test, y_test) = load_data()

RATING_LABELS = ("1", "2", "3", "4", "5")


def _to_annotated_pairs(texts, ratings):
    """Pair each text with a one-hot ``{'cats': {...}}`` dict for its rating.

    Every label in RATING_LABELS maps to whether the rating equals that
    label's integer value.
    """
    return [
        (text, {'cats': {label: rating == int(label) for label in RATING_LABELS}})
        for text, rating in zip(texts, ratings)
    ]


train_data = _to_annotated_pairs(X_train, y_train)
test_data = _to_annotated_pairs(X_test, y_test)

In [583]:
# Show the first two (text, annotations) training pairs as a sanity check.
train_data[:2]

[('abstand leckerst pizza halb abends mitnehmen bestellen positiv emotion einfach niederknien',
  {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}),
 ('chicken pizzas',
  {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})]

### Training

In [584]:
# Train the text classifier.
N_EPOCHS = 100    # full passes over the training data
BATCH_SIZE = 8    # examples per optimizer step
DROPOUT = 0.3     # dropout rate applied during updates

# Every component except the classifier is switched off while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

# select_pipes is the spaCy v3 replacement for the deprecated disable_pipes.
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for epoch in range(N_EPOCHS):
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=BATCH_SIZE):
            # Build all Examples of the batch first and update once per
            # batch; updating example-by-example (as before) made the
            # minibatching a no-op and cost one optimizer step per review.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=DROPOUT, sgd=optimizer, losses=losses)

#### Genauigkeit

In [585]:
# Evaluate the model: top-1 accuracy on the held-out reviews.
# A review counts as correct only when the highest-scoring label equals its
# gold rating. The previous check (`round(doc.cats[rating])`) merely tested
# whether the gold label scored above 0.5 and ignored all other labels, so a
# model that scores every label high would have reached 100%.
correct = 0
total = 0

for text, annotations in test_data:
    gold_cats = annotations['cats']
    # The gold label is the one category flagged True; rows where no category
    # is flagged (rating matched none of "1".."5") are skipped, not crashed on.
    rating = next((label for label, is_gold in gold_cats.items() if is_gold), None)
    if rating is None:
        continue

    # nlp(text) already runs the classifier and fills doc.cats, so no
    # separate predict() call is needed.
    doc = nlp(text)
    print(doc.cats)
    predicted = max(doc.cats, key=doc.cats.get)
    if predicted == rating:
        correct += 1
    total += 1

# Guard against an empty (or fully skipped) test set.
accuracy = correct / total if total else 0.0
print(f"Accuracy: {round(accuracy * 100, 2)}%")

{'1': 1.4676971659355331e-05, '2': 7.917142283986323e-06, '3': 0.07266242057085037, '4': 0.0008614027756266296, '5': 0.8925357460975647}
{'1': 0.010160152800381184, '2': 0.002986459992825985, '3': 5.266025982564315e-05, '4': 0.017463507130742073, '5': 0.886347770690918}
{'1': 0.9863508343696594, '2': 6.900283187860623e-05, '3': 0.0001220577978529036, '4': 0.014559361152350903, '5': 0.0006258675130084157}
{'1': 3.1796530493011232e-06, '2': 5.58325627935119e-05, '3': 1.8033915694104508e-06, '4': 7.767032911942806e-06, '5': 0.9999977350234985}
{'1': 2.0004181351396255e-06, '2': 2.7380788196751382e-06, '3': 3.1477514994548983e-07, '4': 3.6054323572898284e-05, '5': 0.9999998807907104}
{'1': 1.645936026761774e-05, '2': 1.7894115444505587e-05, '3': 3.347459596625413e-06, '4': 0.0005127297481521964, '5': 0.999276340007782}
{'1': 0.00010307469347026199, '2': 1.3763400374955381e-06, '3': 1.7968999600270763e-05, '4': 0.9985470175743103, '5': 1.58782668790991e-07}
{'1': 1.744701307870855e-06, '2':

In [586]:
# Sanity check on an empty string: process '' with the pipeline and ask the
# classifier for its raw scores. The recorded output is a single row of five
# zeros, one per label.
textcat_multilabel.predict([nlp('')])

array([[0., 0., 0., 0., 0.]])