In [21]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
from sklearn.model_selection import train_test_split

# NLP

In [22]:
# Load data into pandas DataFrame
df = pd.read_csv('HORNBACH_Bornheim.csv')
df.head()

Unnamed: 0,date,relative_dates,year_month,stars,review
0,4 Stunden,2023-01-31 13:07:29.691694,01/2023,4,
1,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Man bekommt alles zu guten Preisen. Personal z...
2,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Sehr freundliches und sehr kompetentes Persona...
3,3 Tagen,2023-01-28 17:07:29.691694,01/2023,5,Riesen großer Baumarkt gut sortiert mit Tankst...
4,3 Tagen,2023-01-28 17:07:29.691694,01/2023,4,wir sind seit Jahren Kunden mit Profikarte und...


In [23]:
df['review'].isna().sum()

1168

In [24]:
# Removing all reviews without text and storing in separate dataframe

df_text = df.dropna(subset=['review'])
df_text['review'].isna().sum()

0

In [25]:
# Load german NLP model
nlp = spacy.load('de_core_news_sm')

In [28]:
import string
import re

def clean_text(text):
    
    # Remove emojis
    emoji_pattern = re.compile("["
        u"\U0001f600-\U0001f64f"  # emoticons
        u"\U0001f300-\U0001f5ff"  # symbols & pictographs
        u"\U0001f680-\U0001f6ff"  # transport & map symbols
        u"\U0001f1e0-\U0001f1ff"  # flags (iOS)
                            "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Remove auto-translated parts
    parts = text.split("(Übersetzt von Google) ")
    if len(parts) >= 2:
        text = parts[1]
    else:
        text = parts[0]
    parts = text.split("(Original)")
    if len(parts) >= 2:
        text = parts[0]
        
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)
    
#     # This was in an earlier version of the code but the aspect term model already creates tokens
#     # Process the text using the Spacy NLP model
#     doc = nlp(text)
    
#     # Remove stop words
#     tokens = [token for token in doc if not token.is_stop]

#     # Lemmatize only verbs, nouns, and adjectives
#     lemmatized_tokens = [token.lemma_.lower() if token.pos_ in ['VERB', 'NOUN', 'ADJ'] else token.text.lower() for token in tokens]

#     # Join the lemmatized tokens back together into a single string

    return text

df_text['review_cleaned'] = df_text['review'].apply(clean_text)
df_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['review_cleaned'] = df_text['review'].apply(clean_text)


Unnamed: 0,date,relative_dates,year_month,stars,review,review_cleaned
1,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Man bekommt alles zu guten Preisen. Personal z...,Man bekommt alles zu guten Preisen Personal zu...
2,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Sehr freundliches und sehr kompetentes Persona...,Sehr freundliches und sehr kompetentes Persona...
3,3 Tagen,2023-01-28 17:07:29.691694,01/2023,5,Riesen großer Baumarkt gut sortiert mit Tankst...,Riesen großer Baumarkt gut sortiert mit Tankst...
4,3 Tagen,2023-01-28 17:07:29.691694,01/2023,4,wir sind seit Jahren Kunden mit Profikarte und...,wir sind seit Jahren Kunden mit Profikarte und...
11,1 Woche,2023-01-24 17:07:29.691694,01/2023,5,Hab alles gefunden auch ohne Hilfe,Hab alles gefunden auch ohne Hilfe
...,...,...,...,...,...,...
2188,3 Jahren,2020-02-01 17:07:29.709706,02/2020,4,Gut,Gut
2191,3 Jahren,2020-02-01 17:07:29.709706,02/2020,5,Sehr gut,Sehr gut
2195,3 Jahren,2020-02-01 17:07:29.709706,02/2020,4,"immer wieder nützlich, leider wird das Angebot...",immer wieder nützlich leider wird das Angebot ...
2196,3 Jahren,2020-02-01 17:07:29.709706,02/2020,1,Deutschlands unfreundlichster Baumarkt,Deutschlands unfreundlichster Baumarkt


# Aspect based sentiment analysis

## Extract aspect terms and classify sentiments

In [1]:
# from pyabsa import AspectTermExtraction as ATEPC, available_checkpoints

# # you can view all available checkpoints by calling available_checkpoints()
# checkpoint_map = available_checkpoints()

# aspect_extractor = ATEPC.AspectExtractor('multilingual',
#                                          auto_device=False,  # False means load model on CPU
#                                          cal_perplexity=True,
#                                          )

# inference_source = list(df_text['review_cleaned'])
# atepc_result = aspect_extractor.batch_predict(target_file=inference_source,  #
#                                               save_result=True,
#                                               print_result=True,  # print the result
#                                               pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
#                                               )

# print(atepc_result)

In [2]:
# display(atepc_result)

In [70]:
# Dataframe from the list of dictionaries
atepc = pd.DataFrame(atepc_result)

# Extracting aspects and sentiments into a new dataframe
result = pd.DataFrame({'aspect': atepc['aspect'].sum(), 'sentiment': atepc['sentiment'].sum()})

In [72]:
# result

In [73]:
# Count of the number of times each aspect has been named
counts = result['aspect'].value_counts()

# New dataframe with the aspect, count_positive, and count_negative columns (left at 0 to be filled later)
result_counts = pd.DataFrame({'aspect': counts.index, 'count_positive': 0, 'count_negative': 0})

# Filling the count_positive and count_negative columns
for i, row in result_counts.iterrows():
    aspect = row['aspect']
    positive = (result['sentiment'] == 'Positive') & (result['aspect'] == aspect)
    negative = (result['sentiment'] == 'Negative') & (result['aspect'] == aspect)
    result_counts.at[i, 'count_positive'] = positive.sum()
    result_counts.at[i, 'count_negative'] = negative.sum()

In [74]:
result_counts.head()

Unnamed: 0,aspect,count_positive,count_negative
0,Personal,68,13
1,Baumarkt,64,4
2,Beratung,53,8
3,Mitarbeiter,52,8
4,Auswahl,48,2


In [75]:
# result_counts.to_csv("result_countsBORNHEIM.csv", index=False)

### Some aspect seem to describe the same thing but are just written slightly different. Need to combine those to make a more accurate count:

In [76]:
# First replacing everything that can be broadly counted as "Customer Service"

# Dictionary that maps the aspects associated as "Customer Service"
mapping = {'Personal': 'Customer Service',
          'Beratung': 'Customer Service',
          'Mitarbeiter': 'Customer Service',
          'Service': 'Customer Service',
          'Mitarbeitern': 'Customer Service',
          'Kundenservice': 'Customer Service',
          'Mitarbeiterinnen': 'Customer Service',
          'Mitarbeiterin': 'Customer Service',
          'Berater': 'Customer Service',
          'beraten': 'Customer Service',
          'Verkäuferin': 'Customer Service',
          'Angestellte': 'Customer Service',
          'bedient': 'Customer Service',
          'Team': 'Customer Service',
          'Verkäufer': 'Customer Service',
          'Wartezeit': 'Customer Service',
          'personal': 'Customer Service',
          'warten': 'Customer Service',
          'Verpackungsservice': 'Customer Service',
          'Verkäufern': 'Customer Service',
          'Wartezeiten': 'Customer Service'}

# Replacing those aspects with "Customer Service"
result_counts['aspect'] = result_counts['aspect'].replace(mapping)

# Grouping by aspect and sum the count_positive and count_negative columns
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [77]:
# Repeating for everything that can be broadly counted as "Price"
mapping = {'Preise': 'Price',
          'Preis': 'Price',
          'Preisen': 'Price',
          'preislich': 'Price'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [78]:
# Repeating for everything that can be broadly counted as "Market"
mapping = {'Baumarkt': 'Market',
          'Hornbach': 'Market',
          'Laden': 'Market',
          'Markt': 'Market',
          'Platz': 'Market',
          'Gänge': 'Market',
          'baumarkt': 'Market',
          'Geschäft': 'Market',
          'Atmosphäre': 'Market'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [79]:
# Repeating for everything that can be broadly counted as "Product Line"
mapping = {'Auswahl': 'Product Line',
          'Sortiment': 'Product Line',
          'Angebot': 'Product Line',
          'Angebote': 'Product Line',
          'Qualität': 'Product Line',
          'Produkte': 'Product Line',
          'Waren': 'Product Line',
          'Produkten': 'Product Line',
          'AuswahlInsgesamt': 'Product Line',
           'Werkzeug': 'Product Line'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [80]:
# Repeating for everything that can be broadly counted as "Parking"
mapping = {'Parkplätze': 'Parking',
          'Parkplatz': 'Parking',
          'Parkmöglichkeiten': 'Parking'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [81]:
# Repeating for everything that can be broadly counted as "Gas station"
mapping = {'Tankstelle': 'Gas station',
          'Sprit': 'Gas station',
          'Tanken': 'Gas station',
          'tanken': 'Gas station',
          'Kraftstoff': 'Gas station',
          'Spritpreise': 'Gas station',
          'Tanke': 'Gas station'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [82]:
result_grouped = result_grouped.sort_values(by='count_positive', ascending=False)

In [3]:
# pd.set_option('display.max_rows', None)
# result_grouped

In [84]:
# result_grouped.to_csv('result_groupedBORNHEIM.csv', index=False)