In [7]:
import pandas as pd
import numpy as np
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [8]:
#nlp = spacy.load('de_core_news_md')
nlp = spacy.blank("de")

In [9]:
REVIEWS_CLEANED_FILE_PATH = 'data/labeled_data_cleaned.csv'

df = pd.read_csv(REVIEWS_CLEANED_FILE_PATH, sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating,caption
0,0,1,1,0,0,0,5.0,abstand leckerste pizza deutschland halb abend...
1,1,1,0,0,0,0,4.0,good place dine sadly chicken pizzas available...
2,2,1,0,0,0,0,5.0,this must pizza frankfurt
3,3,1,0,0,0,0,5.0,pizza place frankfurt
4,4,1,0,1,0,0,5.0,leckere pizzen coole atmosphäre


In [10]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [11]:
for _, row in df.iterrows():
    text = row["caption"]
    rating = row["rating"]

    categories_columns = ['food_positive', 'food_negative',
                          'service_positive', 'service_negative',
                          'ambient_positive', 'ambient_negative',
                          'price_positive', 'price_negative',
                          'waiting_positive', 'waiting_negative']

    doc = nlp(text)

    print(row)

    for column in categories_columns:
        doc.cats[column] = row[column]

    print(doc.cats)



Unnamed: 0                                                          0
food_positive                                                       1
service_positive                                                    1
ambient_positive                                                    0
price_positive                                                      0
waiting_positive                                                    0
rating                                                            5.0
caption             abstand leckerste pizza deutschland halb abend...
Name: 0, dtype: object


KeyError: 'food_negative'

In [None]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Add the labels to the text classifier
textcat_multilabel.add_label('food_positive')
textcat_multilabel.add_label('service_positive')
textcat_multilabel.add_label('ambient_positive')
textcat_multilabel.add_label('price_positive')
textcat_multilabel.add_label('waiting_positive')

In [None]:
target_columns = ['food_positive', 'service_positive', 'ambient_positive', 'price_positive', 'waiting_positive']
df[target_columns].head()

In [None]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption'][:split]
    y_train = df[target_columns][:split]

    X_test = df['caption'][split:]
    y_test = df[target_columns][split:]

    return (X_train, y_train), (X_test, y_test)

In [None]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {  'food_positive': cats['food_positive'],
                                            'service_positive': cats['service_positive'],
                                            'ambient_positive': cats['ambient_positive'],
                                            'price_positive': cats['price_positive'],
                                            'waiting_positive': cats['waiting_positive']
                                            }
                                 } for index, cats in y_train.iterrows()]))

test_data = list(zip(X_test, [{'cats': {  'food_positive': int(cats['food_positive']), 'service_positive': int(cats['service_positive']),
                                          'ambient_positive': int(cats['ambient_positive']), 'price_positive': int(cats['price_positive']),
                                          'waiting_positive': int(cats['waiting_positive']) } } for index, cats in y_test.iterrows()]))


In [None]:
train_data[:2]

### Training

In [None]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(100): # Iterations
        losses = {}
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer, losses=losses)

#### Genauigkeit

In [None]:
# Evaluate the model
correct = 0
total = 0
accuracies = []

for index, row in enumerate(test_data):
    text = row[0]
    annotations = row[1]

    doc = nlp(text)
    scores = textcat_multilabel.predict([doc])

    item_accuracies = []
    for i, key in enumerate(target_columns):
        isSame = annotations['cats'][key] == round(scores[0][i])
        item_accuracies.append(1 if isSame else 0)

    score_item_accuracy = np.array(item_accuracies).sum() / len(target_columns)
    accuracies.append(score_item_accuracy)

accuracy = np.array(accuracies).sum() / len(accuracies)
print(f"Accuracy: {round(accuracy * 100, 2)}%")

In [None]:
textcat_multilabel.predict([nlp('')])