In [21]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
from sklearn.model_selection import train_test_split

# NLP

In [22]:
# Load data into pandas DataFrame
df = pd.read_csv('HORNBACH_Bornheim.csv')
df.head()

Unnamed: 0,date,relative_dates,year_month,stars,review
0,4 Stunden,2023-01-31 13:07:29.691694,01/2023,4,
1,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Man bekommt alles zu guten Preisen. Personal z...
2,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Sehr freundliches und sehr kompetentes Persona...
3,3 Tagen,2023-01-28 17:07:29.691694,01/2023,5,Riesen großer Baumarkt gut sortiert mit Tankst...
4,3 Tagen,2023-01-28 17:07:29.691694,01/2023,4,wir sind seit Jahren Kunden mit Profikarte und...


In [23]:
df['review'].isna().sum()

1168

In [24]:
# Removing all reviews without text and storing in separate dataframe

df_text = df.dropna(subset=['review'])
df_text['review'].isna().sum()

0

In [25]:
# Load german NLP model
nlp = spacy.load('de_core_news_sm')

In [28]:
import string
import re

def clean_text(text):
    
    # Remove emojis
    emoji_pattern = re.compile("["
        u"\U0001f600-\U0001f64f"  # emoticons
        u"\U0001f300-\U0001f5ff"  # symbols & pictographs
        u"\U0001f680-\U0001f6ff"  # transport & map symbols
        u"\U0001f1e0-\U0001f1ff"  # flags (iOS)
                            "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Remove auto-translated parts
    parts = text.split("(Übersetzt von Google) ")
    if len(parts) >= 2:
        text = parts[1]
    else:
        text = parts[0]
    parts = text.split("(Original)")
    if len(parts) >= 2:
        text = parts[0]
        
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)
    
#     # This was in an earlier version of the code but the aspect term model already creates tokens
#     # Process the text using the Spacy NLP model
#     doc = nlp(text)
    
#     # Remove stop words
#     tokens = [token for token in doc if not token.is_stop]

#     # Lemmatize only verbs, nouns, and adjectives
#     lemmatized_tokens = [token.lemma_.lower() if token.pos_ in ['VERB', 'NOUN', 'ADJ'] else token.text.lower() for token in tokens]

#     # Join the lemmatized tokens back together into a single string

    return text

df_text['review_cleaned'] = df_text['review'].apply(clean_text)
df_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['review_cleaned'] = df_text['review'].apply(clean_text)


Unnamed: 0,date,relative_dates,year_month,stars,review,review_cleaned
1,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Man bekommt alles zu guten Preisen. Personal z...,Man bekommt alles zu guten Preisen Personal zu...
2,1 Tag,2023-01-30 17:07:29.691694,01/2023,5,Sehr freundliches und sehr kompetentes Persona...,Sehr freundliches und sehr kompetentes Persona...
3,3 Tagen,2023-01-28 17:07:29.691694,01/2023,5,Riesen großer Baumarkt gut sortiert mit Tankst...,Riesen großer Baumarkt gut sortiert mit Tankst...
4,3 Tagen,2023-01-28 17:07:29.691694,01/2023,4,wir sind seit Jahren Kunden mit Profikarte und...,wir sind seit Jahren Kunden mit Profikarte und...
11,1 Woche,2023-01-24 17:07:29.691694,01/2023,5,Hab alles gefunden auch ohne Hilfe,Hab alles gefunden auch ohne Hilfe
...,...,...,...,...,...,...
2188,3 Jahren,2020-02-01 17:07:29.709706,02/2020,4,Gut,Gut
2191,3 Jahren,2020-02-01 17:07:29.709706,02/2020,5,Sehr gut,Sehr gut
2195,3 Jahren,2020-02-01 17:07:29.709706,02/2020,4,"immer wieder nützlich, leider wird das Angebot...",immer wieder nützlich leider wird das Angebot ...
2196,3 Jahren,2020-02-01 17:07:29.709706,02/2020,1,Deutschlands unfreundlichster Baumarkt,Deutschlands unfreundlichster Baumarkt


In [20]:
# # Dropping NaN's of tokenized column
# df_text = df_text.dropna(subset=['review_tokenized'])
# print(df_text['review_tokenized'].isna().sum())
# print(df_text.shape)

# Aspect based sentiment analysis

## Extract aspect terms and classify sentiments

In [29]:
from pyabsa import AspectTermExtraction as ATEPC, available_checkpoints

# you can view all available checkpoints by calling available_checkpoints()
checkpoint_map = available_checkpoints()

aspect_extractor = ATEPC.AspectExtractor('multilingual',
                                         auto_device=False,  # False means load model on CPU
                                         cal_perplexity=True,
                                         )

inference_source = list(df_text['review_cleaned'])
atepc_result = aspect_extractor.batch_predict(target_file=inference_source,  #
                                              save_result=True,
                                              print_result=True,  # print the result
                                              pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
                                              )

print(atepc_result)

[2023-02-02 12:08:17] (2.0.27) Please specify the task code, e.g. from pyabsa import TaskCodeOption
[2023-02-02 12:08:18] (2.0.27) Load aspect extractor from checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT
[2023-02-02 12:08:18] (2.0.27) config: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.config
[2023-02-02 12:08:18] (2.0.27) state_dict: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.state_dict
[2023-02-02 12:08:18] (2.0.27) model: None
[2023-02-02 12:08:18] (2.0.27) tokenizer: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.tokenizer
[2023-02-02 12:08:18] (2.0.27) Set Model Device: cpu
[2023-02-02 12:08:18] (2.0.27) Device Name: Unknown


Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

[2023-02-02 12:12:58] (2.0.27) The results of aspect term extraction have been saved in C:\Users\a\Google Drive\IRONHACK Data Analysis Bootcamp\Final Project full\Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2023-02-02 12:12:58] (2.0.27) Example 0: Man bekommt alles zu guten <Preisen Personal:Positive Confidence:0.996209979057312> zuvorkommend <Parkplätze:Positive Confidence:0.9825524091720581> auch genügend Neuerdings eine Currybude auf dem Parkplatz Mit die beste Currywurst in Karlsruhe
[2023-02-02 12:12:58] (2.0.27) Example 1: Sehr freundliches und sehr kompetentes <Personal:Positive Confidence:0.9974580407142639> Wir informierten uns über Wandfliesen deren Möglichkeiten zur Verlegung und danach ließen wir uns über den Einbau einer Dusche beraten und wurden an beiden Stellen perfekt und fachkundig beraten
[2023-02-02 12:12:58] (2.0.27) Example 2: Riesen großer <Baumarkt:Positive Confidence:0.9972350001335144> gut sortiert mit Tankstelle
[2023-02-02 




In [30]:
# display(atepc_result)

[{'sentence': 'Man bekommt alles zu guten Preisen Personal zuvorkommend Parkplätze auch genügend Neuerdings eine Currybude auf dem Parkplatz Mit die beste Currywurst in Karlsruhe',
  'IOB': ['O',
   'O',
   'O',
   'O',
   'O',
   'B-ASP',
   'I-ASP',
   'O',
   'B-ASP',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  'tokens': ['Man',
   'bekommt',
   'alles',
   'zu',
   'guten',
   'Preisen',
   'Personal',
   'zuvorkommend',
   'Parkplätze',
   'auch',
   'genügend',
   'Neuerdings',
   'eine',
   'Currybude',
   'auf',
   'dem',
   'Parkplatz',
   'Mit',
   'die',
   'beste',
   'Currywurst',
   'in',
   'Karlsruhe'],
  'aspect': ['Preisen Personal', 'Parkplätze'],
  'position': [[6, 7], [9]],
  'sentiment': ['Positive', 'Positive'],
  'probs': [[0.0004890267737209797, 0.0033010474871844053, 0.996209979057312],
   [0.0009945188648998737, 0.01645307056605816, 0.9825524091720581]],
  'confidence': [0.996209979057312,

In [70]:
# Dataframe from the list of dictionaries
atepc = pd.DataFrame(atepc_result)

# Extracting aspects and sentiments into a new dataframe
result = pd.DataFrame({'aspect': atepc['aspect'].sum(), 'sentiment': atepc['sentiment'].sum()})

In [72]:
# result

In [73]:
# Count of the number of times each aspect has been named
counts = result['aspect'].value_counts()

# New dataframe with the aspect, count_positive, and count_negative columns (left at 0 to be filled later)
result_counts = pd.DataFrame({'aspect': counts.index, 'count_positive': 0, 'count_negative': 0})

# Filling the count_positive and count_negative columns
for i, row in result_counts.iterrows():
    aspect = row['aspect']
    positive = (result['sentiment'] == 'Positive') & (result['aspect'] == aspect)
    negative = (result['sentiment'] == 'Negative') & (result['aspect'] == aspect)
    result_counts.at[i, 'count_positive'] = positive.sum()
    result_counts.at[i, 'count_negative'] = negative.sum()

In [74]:
result_counts.head()

Unnamed: 0,aspect,count_positive,count_negative
0,Personal,68,13
1,Baumarkt,64,4
2,Beratung,53,8
3,Mitarbeiter,52,8
4,Auswahl,48,2


In [75]:
# result_counts.to_csv("result_countsBORNHEIM.csv", index=False)

### Some aspect seem to describe the same thing but are just written slightly different. Need to combine those to make a more accurate count:

In [76]:
# First replacing everything that can be broadly counted as "Customer Service"

# Dictionary that maps the aspects associated as "Customer Service"
mapping = {'Personal': 'Customer Service',
          'Beratung': 'Customer Service',
          'Mitarbeiter': 'Customer Service',
          'Service': 'Customer Service',
          'Mitarbeitern': 'Customer Service',
          'Kundenservice': 'Customer Service',
          'Mitarbeiterinnen': 'Customer Service',
          'Mitarbeiterin': 'Customer Service',
          'Berater': 'Customer Service',
          'beraten': 'Customer Service',
          'Verkäuferin': 'Customer Service',
          'Angestellte': 'Customer Service',
          'bedient': 'Customer Service',
          'Team': 'Customer Service',
          'Verkäufer': 'Customer Service',
          'Wartezeit': 'Customer Service',
          'personal': 'Customer Service',
          'warten': 'Customer Service',
          'Verpackungsservice': 'Customer Service',
          'Verkäufern': 'Customer Service',
          'Wartezeiten': 'Customer Service'}

# Replacing those aspects with "Customer Service"
result_counts['aspect'] = result_counts['aspect'].replace(mapping)

# Grouping by aspect and sum the count_positive and count_negative columns
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [77]:
# Repeating for everything that can be broadly counted as "Price"
mapping = {'Preise': 'Price',
          'Preis': 'Price',
          'Preisen': 'Price',
          'preislich': 'Price'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [78]:
# Repeating for everything that can be broadly counted as "Market"
mapping = {'Baumarkt': 'Market',
          'Hornbach': 'Market',
          'Laden': 'Market',
          'Markt': 'Market',
          'Platz': 'Market',
          'Gänge': 'Market',
          'baumarkt': 'Market',
          'Geschäft': 'Market',
          'Atmosphäre': 'Market'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [79]:
# Repeating for everything that can be broadly counted as "Product Line"
mapping = {'Auswahl': 'Product Line',
          'Sortiment': 'Product Line',
          'Angebot': 'Product Line',
          'Angebote': 'Product Line',
          'Qualität': 'Product Line',
          'Produkte': 'Product Line',
          'Waren': 'Product Line',
          'Produkten': 'Product Line',
          'AuswahlInsgesamt': 'Product Line',
           'Werkzeug': 'Product Line'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [80]:
# Repeating for everything that can be broadly counted as "Parking"
mapping = {'Parkplätze': 'Parking',
          'Parkplatz': 'Parking',
          'Parkmöglichkeiten': 'Parking'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [81]:
# Repeating for everything that can be broadly counted as "Gas station"
mapping = {'Tankstelle': 'Gas station',
          'Sprit': 'Gas station',
          'Tanken': 'Gas station',
          'tanken': 'Gas station',
          'Kraftstoff': 'Gas station',
          'Spritpreise': 'Gas station',
          'Tanke': 'Gas station'}
result_counts['aspect'] = result_counts['aspect'].replace(mapping)
result_grouped = result_counts.groupby('aspect').sum().reset_index()

In [82]:
result_grouped = result_grouped.sort_values(by='count_positive', ascending=False)

In [83]:
pd.set_option('display.max_rows', None)
result_grouped

Unnamed: 0,aspect,count_positive,count_negative
53,Customer Service,235,53
130,Market,114,11
175,Product Line,94,5
174,Price,54,2
90,Gas station,26,0
153,Parking,15,2
98,Handwerker,6,1
126,Leute,5,0
213,Ware,4,0
1,Abholung,4,1


In [84]:
# result_grouped.to_csv('result_groupedBORNHEIM.csv', index=False)