In [274]:
import pandas as pd
import numpy as np
import re
import string
# SpaCy
import spacy
from spacy.util import minibatch
from spacy.training import Example
from spacy.lang.pt.stop_words import STOP_WORDS

In [275]:
#nlp = spacy.load('de_core_news_md')
nlp = spacy.blank("en")

In [276]:
df = pd.read_csv('data/reviews-clean.csv', sep=';')

In [277]:
REGX_USERNAME = r"@[A-Za-z0-9$-_@.&+]+"
REGX_URL = r"https?://[A-Za-z0-9./]+"
def preprocessing(text):
    text = text.lower()

    text = re.sub(REGX_USERNAME, ' ', text)
    text = re.sub(REGX_URL, ' ', text)

    emojis = {
        ':)': 'emocaopositiva',
        ':(': 'emocaonegativa'
    }

    for e in emojis:
        text = text.replace(e, emojis[e])

    tokens = [token.text for token in nlp(text)]

    tokens = [t for t in tokens if
              t not in STOP_WORDS and
              t not in string.punctuation and
              len(t) > 3]

    tokens = [t for t in tokens if not t.isdigit()]

    return " ".join(tokens)
df["caption_clean"] = df["caption"].apply(preprocessing)
df.head()

Unnamed: 0.1,Unnamed: 0,caption,rating,timestamp,caption_clean
0,0,Die mit Abstand leckerste Pizza in Deutschland...,5.0,2023-02-15 22:10:07.146586,abstand leckerste pizza deutschland selbst wen...
1,4,Good place to dine in! Sadly no chicken pizzas...,4.0,2023-02-14 00:10:07.146703,good place dine sadly chicken pizzas available...
2,5,This is the must try pizza in Frankfurt.,5.0,2023-02-14 00:10:07.146705,this must pizza frankfurt
3,6,5/5 Pizza place in Frankfurt,5.0,2023-02-14 00:10:07.146707,pizza place frankfurt
4,7,Leckere Pizzen und coole Atmosphäre. 👍🏼 (,5.0,2023-02-13 00:10:07.146710,leckere pizzen coole atmosphäre


In [278]:
from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL

In [279]:
textcat_multilabel = nlp.add_pipe("textcat_multilabel", config={
    "threshold": 0.5,
    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
}, last=True)

# Add the labels to the text classifier
textcat_multilabel.add_label('1')
textcat_multilabel.add_label('2')
textcat_multilabel.add_label('3')
textcat_multilabel.add_label('4')
textcat_multilabel.add_label('5')

1

In [280]:
def load_data(split=0.8):
    split = int(len(df) * split)

    X_train = df['caption_clean'][:split]
    y_train = df['rating'][:split]
    X_test = df['caption_clean'][split:]
    y_test = df['rating'][split:]

    return (X_train, y_train), (X_test, y_test)

In [281]:
# load the dataset
(X_train, y_train), (X_test, y_test) = load_data()

train_data = list(zip(X_train, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_train]))
test_data = list(zip(X_test, [{'cats': {"1":  cats==1, "2":  cats==2, "3":  cats==3, "4":  cats==4, "5":  cats==5}} for cats in y_test]))

In [282]:
train_data[:2]

[('abstand leckerste pizza deutschland selbst wenn halb abends mitnehmen bestellt emocaopositiva einfach niederknien',
  {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}),
 ('good place dine sadly chicken pizzas available keep mind waiting time when visit',
  {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})]

In [283]:
# Train the text classifier
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat_multilabel']

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.initialize()
    for i in range(10): # Iterations
        np.random.shuffle(train_data)
        for batch in minibatch(train_data, size=8):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], drop=0.3, sgd=optimizer)

In [285]:
# Evaluate the model
correct = 0
total = 0

for index, row in enumerate(test_data):
    text = row[0]
    rating = row[1]
    #print(text, rating)

    doc = nlp(text)
    scores = textcat_multilabel.predict([doc])
    print(scores, rating)
    #predicted_rating = doc.cats["1"]
    #print(doc.cats, rating)
    #if int(doc.cats[rating]) == True:
    #    correct += 1
    #total += 1
#accuracy = correct / total
#print(f"Accuracy: {accuracy}")

[[1.5094861e-03 8.4847243e-06 3.5990693e-04 7.6045457e-05 9.9990261e-01]] {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}
[[2.6325665e-03 2.9654262e-04 9.8680239e-03 5.2447308e-02 4.5179033e-01]] {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}
[[2.4671812e-04 1.6436334e-05 2.1713453e-04 6.5434133e-06 9.9999619e-01]] {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}
[[3.2200979e-04 4.3943774e-06 8.4009342e-05 5.7980913e-05 9.9945515e-01]] {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}
[[2.4147630e-03 4.4320823e-05 2.1615645e-02 3.5127247e-05 9.8597461e-01]] {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}}
[[4.90932260e-04 7.86400778e-06 2.64884089e-04 1.08504886e-04
  9.99298930e-01]] {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}}
[[4.8860777e-03 1.2889511e-05 2.7216837e-04 4.9307309e-06 9.9889261e-01]] {'cats': {'1': False, '2': False, '3': False, '4': Fals