In [17]:
import pandas as pd
import numpy as np
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [18]:
nlp = spacy.blank("de")

In [19]:
REVIEWS_CLEANED_FILE_PATH = '../data/reviews_merged_cleaned.csv'

df = pd.read_csv(REVIEWS_CLEANED_FILE_PATH, sep=';')
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,caption,rating,food_positive,food_negative,service_positive,service_negative,ambient_positive,ambient_negative,price_positive,price_negative,waiting_positive,waiting_negative
0,0,0,mega cooles ambiente konzept bürger geschmackl...,4.0,0,0,0,0,1,0,0,0,0,0
1,1,1,super sympathische mitarbeiter freundliche gem...,5.0,1,0,0,0,1,0,0,0,0,0
2,2,2,nenne burger schicki-micki gerne burgerladen d...,5.0,0,0,0,0,0,0,0,0,0,0
3,3,3,gestern dritt selben burger bestellt gestern a...,1.0,0,0,0,0,0,0,0,0,0,0
4,4,4,fanden burger restaurant klasse individuelle m...,4.0,0,0,0,0,0,0,0,0,0,0


In [20]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [21]:
for _, row in df.iterrows():
    text = row["caption"]
    rating = row["rating"]

    target_columns = ['food_positive', 'food_negative',
                          'service_positive', 'service_negative',
                          'ambient_positive', 'ambient_negative',
                          'price_positive', 'price_negative',
                          'waiting_positive', 'waiting_negative']

    doc = nlp(text)

    for column in target_columns:
        doc.cats[column] = row[column]

In [22]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

for column in target_columns:
    # Add the labels to the text classifier
    textcat_multilabel.add_label(column)

In [23]:
df[target_columns].head()

Unnamed: 0,food_positive,food_negative,service_positive,service_negative,ambient_positive,ambient_negative,price_positive,price_negative,waiting_positive,waiting_negative
0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


In [24]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption'][:split]
    y_train = df[target_columns][:split]

    X_test = df['caption'][split:]
    y_test = df[target_columns][split:]

    return (X_train, y_train), (X_test, y_test)

In [25]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {
                                            column: cats[column] for column in target_columns
                                          }
                                } for index, cats in y_train.iterrows()]))
#
test_data = list(zip(X_test, [{'cats': {
                                            column: int(cats[column]) for column in target_columns
                                       }
                              } for index, cats in y_test.iterrows()]))

In [26]:
train_data[:2]

[('mega cooles ambiente konzept bürger geschmacklich lecker bürger individuelle bedürfnisse anpassen allergien vegetarisch',
  {'cats': {'food_positive': 0,
    'food_negative': 0,
    'service_positive': 0,
    'service_negative': 0,
    'ambient_positive': 1,
    'ambient_negative': 0,
    'price_positive': 0,
    'price_negative': 0,
    'waiting_positive': 0,
    'waiting_negative': 0}}),
 ('super sympathische mitarbeiter freundliche gemütliche atmosphäre fantastisches essen sicher öfter',
  {'cats': {'food_positive': 1,
    'food_negative': 0,
    'service_positive': 0,
    'service_negative': 0,
    'ambient_positive': 1,
    'ambient_negative': 0,
    'price_positive': 0,
    'price_negative': 0,
    'waiting_positive': 0,
    'waiting_negative': 0}})]

### Training

In [27]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(100): # Iterations
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer, losses=losses)

#### Genauigkeit

In [28]:
# Evaluate the model
correct = 0
total = 0
accuracies = []

for index, row in enumerate(test_data):
    text = row[0]
    annotations = row[1]

    doc = nlp(text)
    scores = textcat_multilabel.predict([doc])

    item_accuracies = []
    for i, key in enumerate(target_columns):
        isSame = annotations['cats'][key] == round(scores[0][i])
        item_accuracies.append(1 if isSame else 0)

    score_item_accuracy = np.array(item_accuracies).sum() / len(target_columns)
    accuracies.append(score_item_accuracy)

accuracy = np.array(accuracies).sum() / len(accuracies)
print(f"Accuracy: {round(accuracy * 100, 2)}%")

Accuracy: 98.66%


In [29]:
textcat_multilabel.predict([nlp('')])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])