In [474]:
import pandas as pd
import os
import ast
import string
import re
from deep_translator import GoogleTranslator
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from openai import OpenAI
from langchain.prompts import PromptTemplate
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from gensim import models, corpora
import pyLDAvis.gensim
import unidecode

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

### Labeling

In [333]:
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY')

In [13]:
prompt_template = PromptTemplate.from_template(
    '''
    Your task is to extract keywords from {sentence}.
    You are an assistant for labeling negative annotaded keywords in user reviews.
    Return the words that are assumed to be negative words in form of a list.
    Note: If now negative words detected reutrn empty list.
    Note: Words have to stand alone as negative to be included. No fill words.
    Note: Only words that are explicit in the sentence should be included.
    Provide output without further text information. Use the following schema ['keyword 1', 'keyword 2', ...]
    '''
)


In [327]:
prompt_template_2 = PromptTemplate.from_template(
    '''
    Your task is to extract keywords from {sentence}.
    You are an assistant for labeling negative annotaded keywords in user reviews.
    Return the sentences with those words that are assumed to be negative words in form of a list.
    Provide output without further text information. Use the following schema ['sentence 1', 'sentence 2', ...]
    '''
)

In [334]:
client = OpenAI()

In [328]:
def extract_keywords(sentence: str) -> str:
    try:
        # Format the prompt dynamically with the input sentence
        prompt = prompt_template_2.format(sentence=sentence)
        
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error: {e}"

#### prepare keyword strings

In [454]:
def remove_stopwords(text: str, sw = stopwords.words('german')) -> str:
    '''
    This function will remove stopwords from the text
    
    Args:
        text: String of data you want to remove stopwords from
        sw: List of strings indicating the list of stopwords
        
    Returns:
        The input string with the stopwords removed.
    '''
    additional_sw = [
    'we', 'are', 'the', 'a', 'and', 'is', 'of', 'you', 'it', 'there', 'but',
    'at', 'for', 'be', 'as', 'with', 'have', 'on', 'only', 'in', 'this', 'that',
    'to', 'an', 'or', 'so', 'if', 'can', 'from', 'about', 'by', 'has', 'was',
    'not', 'they', 'their', 'them', 'your', 'which', 'do', 'did', 'does', 'how',
    'what', 'when', 'where', 'who', 'whom', 'why', 'all', 'any', 'our', 'us',
    'also', 'his', 'her', 'she', 'he', 'its', 'my', 'mine', 'yours', 'then',
    'more', 'less', 'very', 'most', 'some', 'few', 'being', 'into', 'over', 'under',
    'while', 'during', 'before', 'after', 'no', 'yes', 'each', 'other', 'out',
    'up', 'down', 'off', 'new', 'same', 'again', 'these', 'those', 'such',
    'just', 'now', 'like', 'here', 'where', 'therefore', 'hence', 'thus', 'yet'
]

    sw = sw + additional_sw
    
    text_list = text.split()
    text_list = [word for word in text_list if word.lower() not in sw]
    return ' '.join(text_list)

In [455]:
def remove_punctuation(text: str, punct: str = string.punctuation) -> str:
    '''
    This function will remove punctuations from the text.
    
    Args:
        text: String of data you want to remove punctuations from
        punct: String of punctuations
    
    Returns:
        The input string with the punctuations removed.
    '''
    cleaned_text = ''.join([char for char in text if char not in punct])
    return cleaned_text

In [456]:
def unicode(text: str) -> str:
    '''
    This function will make all the data unicoded. Meaning Â -> A
    
    Args:
        text: String of data you want to unicode
    
    Returns:
        The input string unicoded.
    '''
    return unidecode.unidecode(text)

In [457]:
def clean(text: str) -> str:
    '''
    This method will clean the input text through unidecoding and stopword and punctuation 
    removal.
    
    Args:
        text: String indicating the body of text you want to clean
    
    Returns:
        A string corresponding to the cleaned version of the input string.
    '''
    text = unicode(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    return text.lower()

In [393]:
def translate_by_division(row, source_lang='de', target_lang='en', num_parts=10):

    if not isinstance(row, str):
        return row
    num_parts = max(num_parts, 1)
    num_parts = min(num_parts, len(row)) if len(row) > 0 else 1
    part_length = max(len(row) // num_parts, 1)

    chunks = [row[i:i+part_length] for i in range(0, len(row), part_length)]
    
    translated_chunks = []
    for chunk in chunks:
        try:
            translated_chunk = GoogleTranslator(source=source_lang, target=target_lang).translate(chunk)
            if translated_chunk is not None:  #
                translated_chunks.append(translated_chunk)
        except Exception as e:
            continue  
    
    return ' '.join(translated_chunks)


In [373]:
df.drop('Unnamed: 0', inplace=True, axis=1)

In [375]:
df['labeled_sentences'] = df.review_text.apply(lambda x: extract_keywords(x))

In [387]:
df['labeled_sentences_clean'] = df.labeled_sentences.apply(lambda x: clean(x))

In [388]:
df['labeled_sentences_clean'] 

0      just expecting a garden but my opinion it is m...
1      gruselig laut mal verkehrs chaos seit uber 2 j...
2      wish people would use graffiti but they do its...
3      json obwohl vorderen bereich einfahrt unangene...
4      json very clean but ok 4 benches and sometimes...
                             ...                        
278                                                     
279    geliebte mutti nachdem haus birkholz berlin 3 ...
280    my only critique that the service particularly...
281    this next issue that had with this specific gu...
282    the amount of food comparison to price could b...
Name: labeled_sentences_clean, Length: 283, dtype: object

In [389]:
df['labeled_sentences_clean_en'] = df['labeled_sentences_clean'].apply(
    lambda row: translate_by_division(row, source_lang='de', target_lang='en')
)

#### topic clustering of single keywords

In [2]:
df = pd.read_csv('../data/processed/review_keywords.csv')

In [3]:
df.drop(columns='Unnamed: 0', axis=1, inplace=True)

In [4]:
def convert(value):

    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
    return []

In [5]:
df['keywords'] = df['keywords'].apply(convert)
df.head(2)

Unnamed: 0,namenr,place_ids,lat,lon,geometry,review_text,cleaned_text,review_text_english,keywords
0,Schlosspark%Charlottenburg,ChIJLQhWKi5RqEcRuLWq4wG9HDE,52.523866,13.292494,POINT (13.2924939 52.523866399999996),"['One of Berlin’s treasures, what i like about...",one of berlins treasures what like about this ...,one of berlin's treasures what like about this...,"[disappointed, expensive, nothing, dull, tired..."
1,Sophie-Charlotten-Platz%-%GA,ChIJsXdQFdhQqEcRnq4gT95JHW0,52.51117,13.29679,POINT (13.29679 52.51117),"[""Sophie-Charlotte-Platz metro station in Berl...",sophiecharlotteplatz metro station berlin is i...,sophiecharlotteplatz metro station berlin is i...,"[loud, chaos, construction, impossible, failur..."


In [297]:
all_keywords = []

for keywords in df['keywords']:
    if isinstance(keywords, list):  # Check if the value is a list
        all_keywords.extend(keywords)

In [298]:
remove = ['expensive', 'nothing', 'unfortunately', 'closed', 'bad', 'strange', 'strange', 'none', 'missing', 'weeds', 'sad', 'broken','caution', 'annoying', 'unpleasant', 'hate', 'unfriendly', 'boring', 'missleading', 'problem', 'reckless', 'awfull', 'burning', 'off', 'dog', 'awful', 'misleading', 'missleading', 'misleading', 'dying', 'wrong','incompetent', 'overpriced', 'unusable', 'littered', 'disappointing', 'neglected', 'cyclists']

In [299]:
all_keywords = [item for item in all_keywords if item not in remove]

In [300]:
all_keywords = ['overcrowded' if word == 'crowded' else word for word in all_keywords]

##### LDA approach

In [263]:
tokenized_keywords = [nltk.word_tokenize(keyword) for keyword in all_keywords if isinstance(keyword, str)]
tokenized_keywords = [[token for token in sublist] for sublist in tokenized_keywords]
dictionary = corpora.Dictionary(tokenized_keywords)
corpus = [dictionary.doc2bow(keywords) for keywords in tokenized_keywords]

In [266]:
lda = models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=5)

In [267]:
lda_vis = pyLDAvis.gensim.prepare(
    lda,
    corpus, 
    dictionary=lda.id2word,
    mds='mmds',
    sort_topics=False
)

pyLDAvis.display(lda_vis)

In [253]:
lda.save('../models/lda_model_2.model')


#### clustering

In [469]:
keywords = all_keywords

In [486]:
keywords_df = pd.DataFrame(keywords)

In [488]:
keywords_df.to_csv('../data/processed/keyword_list.csv')

In [475]:
# Step 1: Generate embeddings using Sentence Transformers
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(keywords)

"""# Step 2: Perform clustering (e.g., K-Means)
kmeans = KMeans(n_clusters=4, random_state=0)
labels = kmeans.fit_predict(embeddings)"""

"""# Visualize clusters using t-SNE
# Set perplexity to less than the number of samples (e.g., len(keywords) - 1)
tsne = TSNE(n_components=2, perplexity=min(len(keywords) - 1, 30), random_state=0)
reduced_embeddings = tsne.fit_transform(embeddings)"""


'# Visualize clusters using t-SNE\n# Set perplexity to less than the number of samples (e.g., len(keywords) - 1)\ntsne = TSNE(n_components=2, perplexity=min(len(keywords) - 1, 30), random_state=0)\nreduced_embeddings = tsne.fit_transform(embeddings)'

In [480]:
kmeans.cluster_centers_

array([[-0.01978112,  0.03203402, -0.00376411, ..., -0.00611421,
         0.00899651,  0.0069945 ],
       [-0.00693397,  0.0245054 , -0.01230479, ...,  0.01641135,
         0.05145364,  0.01716905],
       [-0.06215139, -0.00294598,  0.06928255, ...,  0.02259812,
         0.01158603, -0.02618628],
       [ 0.00372812,  0.02690808, -0.01048445, ..., -0.02737948,
        -0.00242622,  0.0368369 ]], dtype=float32)

In [476]:
centroids = kmeans.cluster_centers_

top_keywords_per_cluster = {}
for cluster_id in range(kmeans.n_clusters):
    cluster_indices = [i for i, label in enumerate(labels) if label == cluster_id]
    distances = [np.linalg.norm(embeddings[i] - centroids[cluster_id]) for i in cluster_indices]
    sorted_indices = np.argsort(distances)
    top_keywords = [keywords[cluster_indices[i]] for i in sorted_indices[:1000]]  # Top 10 keywords
    top_keywords_per_cluster[cluster_id] = top_keywords

Top keywords for Cluster 0:
horror, horror, angry, angry, angry, destroyed, destroyed, destroyed, destroyed, destroyed, destroyed, disaster, disaster, disaster, corpse, trouble, dead, dead, dead, lonely, lonely, confused, confused, confused, pain, pain, pain, burnt, dark, dark, dark, dark, dark, dark, dark, dark, dark, dark, dark, wrecked, difficult, difficult, difficult, difficult, difficult, difficult, difficult, difficult, difficult, difficult, difficult, difficult, intense, failure, failure, failure, failure, failure, struggle, struggle, burned, burned, wasted, stole, stole, fatal, stressful, abused, abused, abused, hostile, hostile, confusing, confusing, confusing, confusing, confusing, tired, confusion, attack, stale, depressed, torn, painful, slow, slow, devastation, messy, messy, messy, messy, messy, messy, unsuitable, discarded, unnecessary, plain, damaged, damaged, damaged, damaged, damaged, damaged, damaged, damaged, miserable, miserable, exposed, damage, rotten, self, overg

In [477]:
def remove_duplicates_from_clusters(cluster_keywords):
    """
    Removes duplicate keywords from each cluster while maintaining the structure.
    
    :param cluster_keywords: Dictionary where keys are cluster IDs and values are lists of keywords.
    :return: Cleaned dictionary with unique keywords per cluster.
    """
    cleaned_clusters = {cluster_id: list(set(keywords)) for cluster_id, keywords in cluster_keywords.items()}
    return cleaned_clusters

In [482]:
clusters = remove_duplicates_from_clusters(top_keywords_per_cluster)

In [484]:
clusters[1]

['faeces',
 'bottle caps',
 'punishment',
 'stench',
 'bikes',
 'traffic noise',
 'screaming',
 'dust',
 'crowd',
 'unsafe',
 'danger',
 'fraud',
 'crimes',
 'illegal',
 'fungus',
 'cancer',
 'addicted',
 'junkes',
 'deterrent',
 'fire',
 'syringes',
 'beer bottles',
 'needle',
 'alcoholic',
 'poop',
 'drug dealers',
 'rat',
 'interference',
 'virus',
 'security',
 'toilet',
 'litter',
 'cigarette',
 'smoked',
 'protest',
 'disturbing',
 'rats',
 'murder',
 'cleaner',
 'water',
 'thieves',
 'weed',
 'poison',
 'claustrophobia',
 'nuisance',
 'anxiety',
 'junkies',
 'crack',
 'intimidating',
 'stoners',
 'opponents',
 'dealers',
 'insomnia',
 'distracts',
 'vandalism',
 'aggressive',
 'smoking',
 'smell',
 'smells',
 'police',
 'avoid',
 'safety',
 'toxic',
 'intrusion',
 'stress',
 'drunkards',
 'diarrhea',
 'polluted',
 'war',
 'thwart',
 'sugar',
 'drunks',
 'anger',
 'shouting',
 'insects',
 'drug',
 'punks',
 'prison',
 'thefts',
 'pollution',
 'smelling',
 'asthma',
 'alarming',
 