In [212]:
import pandas as pd
import re
import string
from spacy.lang.de.stop_words import STOP_WORDS
# Spacy
import spacy

In [213]:
# Never truncate long cell contents when a DataFrame is displayed.
pd.options.display.max_colwidth = None
# German spaCy pipeline; the cleaning step reads token text, POS tags and lemmas from it.
nlp = spacy.load("de_core_news_md")

In [214]:
# Source file with the labelled reviews and destination for the cleaned copy.
REVIEWS_FILE_PATH = "data/labeled_data.csv"
REVIEWS_CLEANED_FILE_PATH = "data/labeled_data_cleaned.csv"

# The file is semicolon-separated.
df = pd.read_csv(filepath_or_buffer=REVIEWS_FILE_PATH, sep=";")

In [215]:
# Positional rename of the 13 columns: an id, the review text, one
# positive/negative column pair per aspect, and the rating.
df.columns = [
    'Index',
    'caption',
    'food_positive',
    'food_negative',
    'service_positive',
    'service_negative',
    'ambient_positive',
    'ambient_negative',
    'price_positive',
    'price_negative',
    'waiting_positive',
    'waiting_negative',
    'rating',
]

In [216]:
# Preview the first three rows after renaming the columns.
df.head(n=3)

Unnamed: 0,Index,caption,food_positive,food_negative,service_positive,service_negative,ambient_positive,ambient_negative,price_positive,price_negative,waiting_positive,waiting_negative,rating
0,0,"Die mit Abstand leckerste Pizza in Deutschland, und das selbst wenn man sie um halb 10 abends zum mitnehmen bestellt :) Einfach zum niederknien! …",1,0,1,0,0,0,0,0,0,0,5.0
1,4,Good place to dine in! Sadly no chicken pizzas available. Do keep in mind the waiting time when you visit✌🏻 …,1,0,0,1,0,0,0,0,0,1,4.0
2,5,This is the must try pizza in Frankfurt.,1,0,0,0,0,0,0,0,0,0,5.0


In [217]:
# Remove rows without a caption and keep only the columns used downstream.
USEFUL_COLUMNS = ['caption',
                  'food_positive', 'service_positive', 'ambient_positive', 'price_positive', 'waiting_positive', 'rating']
# reset_index returns a new frame, so its result has to be assigned; calling it
# as a bare statement leaves the gaps from the dropped rows in the index.
df = df.loc[df['caption'].notna(), USEFUL_COLUMNS].reset_index(drop=True)
df.head(3)

Unnamed: 0,caption,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating
0,"Die mit Abstand leckerste Pizza in Deutschland, und das selbst wenn man sie um halb 10 abends zum mitnehmen bestellt :) Einfach zum niederknien! …",1,1,0,0,0,5.0
1,Good place to dine in! Sadly no chicken pizzas available. Do keep in mind the waiting time when you visit✌🏻 …,1,0,0,0,0,4.0
2,This is the must try pizza in Frankfurt.,1,0,0,0,0,5.0


In [218]:
# Remove the section translated by Google if present: keep only the text
# before the "(Translated by Google)" marker.
# Series.str.split interprets a multi-character pattern as a regular
# expression by default. Unescaped, the parentheses form a capture group, the
# split happens on "Translated by Google" and a stray "(" is left at the end
# of the kept text. Escaping makes the whole marker match literally.
df['caption'] = df['caption'].str.split(re.escape('(Translated by Google)')).str[0]
df.head(3)

Unnamed: 0,caption,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating
0,"Die mit Abstand leckerste Pizza in Deutschland, und das selbst wenn man sie um halb 10 abends zum mitnehmen bestellt :) Einfach zum niederknien! …",1,1,0,0,0,5.0
1,Good place to dine in! Sadly no chicken pizzas available. Do keep in mind the waiting time when you visit✌🏻 …,1,0,0,0,0,4.0
2,This is the must try pizza in Frankfurt.,1,0,0,0,0,5.0


In [219]:
# Show the German stop-word set that the cleaning step filters against.
print(STOP_WORDS)

{'ohne', 'eigener', 'ab', 'darüber', 'ihrer', 'erstes', 'solang', 'en', 'seitdem', 'uhr', 'zehntes', 'siebte', 'grosser', 'siebentes', 'drin', 'während', 'dank', 'natürlich', 'ebenso', 'geschweige', 'achter', 'welcher', 'anderem', 'seiner', 'darfst', 'dürfen', 'große', 'sechsten', 'weniges', 'wirklich', 'irgend', 'daran', 'vor', 'daraus', 'daß', 'viele', 'solches', 'diejenigen', 'sehr', 'kannst', 'neun', 'gehabt', 'denen', 'durch', 'beim', 'derjenigen', 'außer', 'dritten', 'dieselben', 'besten', 'bis', 'gesagt', 'weitere', 'welchem', 'ja', 'einige', 'hätte', 'wegen', 'jedem', 'oben', 'sechster', 'einmal', 'also', 'gewesen', 'hinter', 'das', 'fünftes', 'weniger', 'denn', 'vom', 'zwanzig', 'dann', 'geht', 'mochten', 'kann', 'gern', 'jedoch', 'mögt', 'sagt', 'mich', 'vier', 'mit', 'darf', 'acht', 'wie', 'dabei', 'müssen', 'satt', 'eines', 'na', 'kam', 'zwar', 'solcher', 'neue', 'dermassen', 'hat', 'im', 'dasselbe', 'dahin', 'gedurft', 'übrigens', 'weil', 'ach', 'du', 'dadurch', 'erste', '

In [220]:
# "@handle" mentions. The hyphen is placed last in the character class so it
# is a literal "-". Written as "$-_" it is a range from "$" to "_" that also
# matches characters such as ",", "(", ")", ":" and "?", so a mention would
# swallow the punctuation (and any text glued to it) that follows.
REGX_USERNAME = r"@[A-Za-z0-9$_@.&+-]+"
# http(s) links up to the next whitespace. A letters/digits/"."/"/"-only class
# stops at "-", "_", "?", "=" etc. and leaves the rest of the URL in the text.
REGX_URL = r"https?://\S+"
def preprocessing(text):
    """Normalise one review caption into a space-separated string of lemmas.

    Steps: lower-case the text, blank out @mentions and URLs, replace the
    text smileys ':)' and ':(' with German sentiment phrases, drop stop words,
    punctuation, short tokens and digit-only tokens, then keep the lemmas of
    nouns, adjectives, verbs and adverbs.

    Args:
        text: Raw caption. A value that is not a ``str`` (for example NaN or
            None for a missing caption) is treated as having no content.

    Returns:
        The lower-cased lemmas joined by single spaces, or ``""`` when
        ``text`` is not a string or nothing survives the filters.
    """
    # Missing captions show up as NaN/None; .lower() would raise on them.
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Remove mentions and links
    text = re.sub(REGX_USERNAME, ' ', text)
    text = re.sub(REGX_URL, ' ', text)

    # Replace text smileys with words so their sentiment survives tokenisation
    emojis = {
        ':)': 'positive emotionen',
        ':(': 'negative emotionen'
    }

    for e in emojis:
        text = text.replace(e, emojis[e])

    tokens = [token.text for token in nlp(text)]

    # Remove stop words, punctuation, tokens of 3 characters or fewer, and
    # digit-only tokens.
    # NOTE(review): the previous comment said "less than 3 characters" while
    # the code has always required len > 3 — confirm the intended threshold.
    tokens = [t for t in tokens if
              t not in STOP_WORDS and
              t not in string.punctuation and
              len(t) > 3 and
              not t.isdigit()]

    text = " ".join(tokens)

    # Lemmatization: re-parse the filtered text and keep content words only
    allowed_postags = ["NOUN", "ADJ", "VERB", "ADV"]
    doc = nlp(text)
    new_text = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    lemmatized = " ".join(new_text).lower()

    return lemmatized

# Clean every caption element-wise and store the result in a new column.
df["caption_clean"] = df["caption"].map(preprocessing)

In [221]:
# Compare raw and cleaned captions for the first three rows.
df.head(n=3)

Unnamed: 0,caption,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating,caption_clean
0,"Die mit Abstand leckerste Pizza in Deutschland, und das selbst wenn man sie um halb 10 abends zum mitnehmen bestellt :) Einfach zum niederknien! …",1,1,0,0,0,5.0,abstand leckerst pizza halb abends mitnehmen bestellen positiv emotion einfach niederknien
1,Good place to dine in! Sadly no chicken pizzas available. Do keep in mind the waiting time when you visit✌🏻 …,1,0,0,0,0,4.0,chicken pizzas
2,This is the must try pizza in Frankfurt.,1,0,0,0,0,5.0,


In [222]:
# Keep the label/rating columns plus the cleaned caption, and expose the
# cleaned text under the plain name 'caption'.
USEFUL_COLUMNS_AND_CLEAN = [col for col in USEFUL_COLUMNS if col != 'caption']

# Select with the cleaned column last, then rename it.
df = df[USEFUL_COLUMNS_AND_CLEAN + ['caption_clean']].rename(columns={"caption_clean": "caption"})

# The list now mirrors the frame's final column names.
USEFUL_COLUMNS_AND_CLEAN.append('caption')

# Display only: the renumbered frame is shown but not assigned back to df.
df.reset_index(drop=True)

Unnamed: 0,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating,caption
0,1,1,0,0,0,5.0,abstand leckerst pizza halb abends mitnehmen bestellen positiv emotion einfach niederknien
1,1,0,0,0,0,4.0,chicken pizzas
2,1,0,0,0,0,5.0,
3,1,0,0,0,0,5.0,
4,1,0,1,0,0,5.0,leck pizz cool atmosphäre
...,...,...,...,...,...,...,...
89,1,0,0,0,0,5.0,hervorragend neapolitanisch pizza
90,1,1,1,0,1,5.0,super fruendlich atmosphäre chillig schön auswahl pizzas lecker schnell fertig wartezeit qalität leiden
91,1,0,0,0,0,5.0,leck pizza hochpreisig
92,1,0,0,0,0,5.0,


In [223]:
# Display the final column list (label columns, rating, then the cleaned caption).
USEFUL_COLUMNS_AND_CLEAN

['food_positive',
 'service_positive',
 'ambient_positive',
 'price_positive',
 'waiting_positive',
 'rating',
 'caption']

In [224]:
# Drop rows whose cleaned caption (now stored in 'caption') is null ...
df = df.loc[df['caption'].notna(), USEFUL_COLUMNS_AND_CLEAN]
# ... or is empty / whitespace-only after cleaning.
df = df.loc[df['caption'].str.strip().astype(bool)]

In [225]:
# Preview the rows that remain after dropping empty captions.
df.head(n=3)

Unnamed: 0,food_positive,service_positive,ambient_positive,price_positive,waiting_positive,rating,caption
0,1,1,0,0,0,5.0,abstand leckerst pizza halb abends mitnehmen bestellen positiv emotion einfach niederknien
1,1,0,0,0,0,4.0,chicken pizzas
4,1,0,1,0,0,5.0,leck pizz cool atmosphäre


In [226]:
# Persist the cleaned reviews as a semicolon-separated CSV (index included).
df.to_csv(path_or_buf=REVIEWS_CLEANED_FILE_PATH, sep=";")