In [587]:
import pandas as pd
import numpy as np
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [588]:
#nlp = spacy.load('de_core_news_md')
nlp = spacy.blank("de")

In [589]:
df = pd.read_csv('data/reviews-clean.csv', sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,caption,rating,timestamp
0,0,abstand leckerst pizza halb abends mitnehmen b...,5.0,2023-02-15 22:10:07.146586
1,4,chicken pizzas,4.0,2023-02-14 00:10:07.146703
2,7,leck pizz cool atmosphäre,5.0,2023-02-13 00:10:07.146710
3,11,extrem lecker empfehlung,5.0,2023-02-10 00:11:07.737920
4,13,gut pizza authentisch,5.0,2023-02-09 00:11:07.737926


In [590]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [591]:
for _, row in df.iterrows():
    text = row["caption"]
    rating = row["rating"]
    doc = nlp(text)
    #doc.cats["caption"] = rating
    doc.cats = {"1":  rating==1, "2":  rating==2, "3":  rating==3, "4":  rating==4, "5":  rating==5}

In [592]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Add the labels to the text classifier
textcat_multilabel.add_label('1')
textcat_multilabel.add_label('2')
textcat_multilabel.add_label('3')
textcat_multilabel.add_label('4')
textcat_multilabel.add_label('5')

1

In [593]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption'][:split]
    y_train = df['rating'][:split]
    X_test = df['caption'][split:]
    y_test = df['rating'][split:]

    return (X_train, y_train), (X_test, y_test)

In [594]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_train]))
test_data = list(zip(X_test, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_test]))

In [595]:
train_data[:2]

[('abstand leckerst pizza halb abends mitnehmen bestellen positiv emotion einfach niederknien',
  {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}),
 ('chicken pizzas',
  {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})]

### Training

In [596]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(100): # Iterations
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer, losses=losses)

#### Genauigkeit

In [597]:
# Evaluate the model
correct = 0
total = 0

for index, row in enumerate(test_data):
    text = row[0]
    rating = row[1]
    rating = list(rating['cats'].keys())[list(rating['cats'].values()).index(True)]

    doc = nlp(text)
    #print(doc.cats)
    scores = textcat_multilabel.predict([doc])
    if round(doc.cats[rating]):
        correct += 1
    total += 1
accuracy = correct / total
print(f"Accuracy: {round(accuracy * 100, 2)}%")

Accuracy: 64.58%


In [598]:
textcat_multilabel.predict([nlp('')])

array([[0., 0., 0., 0., 0.]])