In [72]:
import pandas as pd
from bertopic.representation import KeyBERTInspired
from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

In [124]:
df = pd.read_csv('../data/reviews_berlin.csv')

In [34]:
df = df[df.review_en.apply(lambda x: isinstance(x, str))]

In [35]:
# split sentences at "\" as reviews seem to organized this way

all_sentences_with_index = []

for idx, row in df.review_en.items():
    # replace \
    cleaned_row = row.replace("[", "").replace("]", "").replace("\\n", "").strip()
    # Split sentences
    sentences = cleaned_row.split(", '")
    # Append each sentence with its original index
    for sentence in sentences:
        all_sentences_with_index.append((idx, sentence))


In [36]:
len(all_sentences_with_index)

13805

In [37]:
# create look up for remapping review to parkname
doc_look_up_df = pd.DataFrame(all_sentences_with_index)
cols=['orig_index', 'sentence']
doc_look_up_df.columns = cols

In [38]:
docs = list(doc_look_up_df.sentence)

#### topic modelling with BERTopic

In [85]:
#defines how a document will be split into token (ngram_range) and removes stop words
vectorizer_model = CountVectorizer(stop_words='english') #ngram_range=(1, 3)

In [86]:
topic_model = BERTopic(vectorizer_model=vectorizer_model, calculate_probabilities=True, nr_topics=50).fit(docs)

In [42]:
# cleaning  outliers using probabilities
#new_topics = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
#new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)

In [87]:
topic_model.visualize_barchart(top_n_topics = 50, n_words = 20)

In [141]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4499,-1_park_place_nice_berlin,"[park, place, nice, berlin, great, good, peopl...","[very nice place to relax.', ""A quite open pla..."
1,0,3423,0_park_nice_place_beautiful,"[park, nice, place, beautiful, playground, gre...","[Nice park', Nice park', Nice park']"
2,1,1135,1_berlin_park_visit_german,"[berlin, park, visit, german, place, city, cat...",[If you're looking for a peaceful escape in th...
3,2,568,2_church_building_architecture_view,"[church, building, architecture, view, beautif...","[Building , so beautiful', Beautiful building...."
4,3,547,3_ok_good_nice_cool,"[ok, good, nice, cool, beautiful, okay, gut, g...","[Everything OK', Ok', Ok ']"
5,4,417,4_market_flea_saturday_clothes,"[market, flea, saturday, clothes, sunday, stor...",[We visited the Boxhagener Platz for the Sunda...
6,5,390,5_memorial_war_cemetery_soviet,"[memorial, war, cemetery, soviet, monument, hi...",[Just a stone throw distance away from the Bra...
7,6,363,6_station_subway_train_u3,"[station, subway, train, u3, sbahn, u1, trains...",[Nice subway station ............................
8,7,280,7_service_whiskey_food_staff,"[service, whiskey, food, staff, bar, cocktails...",[beautiful bar with absolutely delicious cockt...
9,8,203,8_garden_rose_tiergarten_roses,"[garden, rose, tiergarten, roses, flowers, bea...",[The Rose Garden in Berlin's Tiergarten - An o...


In [142]:
topic_df = topic_model.get_document_info(docs)

In [143]:
topic_df = topic_df[topic_df.Topic == 10]
topic_df

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
116,Good and clean',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,1.000000,False
353,Unfortunately not very clean Platz!',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.370921,False
832,Unfortunately it is very dirty in the mornings...,10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.207275,False
852,A bit of a lot of weed but nice.',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,1.000000,False
912,Unfortunately not very clean Platz!',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.481770,False
...,...,...,...,...,...,...,...,...
12853,Unfortunately often dirty. But good for a quic...,10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.252377,False
13032,Unfortunately not very clean Platz!',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.505667,False
13062,Clean',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.310505,False
13460,Sick spot 👌',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.235130,False


### Multiple Representations 

In [73]:
main_representation = KeyBERTInspired()

aspect_model1 = PartOfSpeech("en_core_web_sm")
aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]

representation_model = {
   "Main": main_representation,
   "Aspect1":  aspect_model1,
   "Aspect2":  aspect_model2 
}

In [77]:
topic_model = BERTopic(representation_model=representation_model, calculate_probabilities=True, nr_topics=50)

In [78]:
topic_model = topic_model.fit(docs)

In [79]:
topic_model.visualize_barchart(top_n_topics = 50, n_words = 20)

In [147]:
topic_df = topic_model.get_topic_info()

In [174]:
docs_df = topic_model.get_document_info(docs)

In [177]:
dirty_df =  docs_df[docs_df.Topic == 25]

In [178]:
dirty_df

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
264,"Nice little shop, I like to buy my flowers her...",25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,0.066114,False
1019,I took my mother and her little dog to take a ...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False
1074,"Actually beautiful. However, many visitors mis...",25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,0.053231,False
1158,I love this park in Berlin. It is huge and ver...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,0.134576,False
3023,Watched a local men's soccer game! Looks a lit...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False
3026,This is my neighborhood park. I\'ve never once...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False
3027,I spent 10 days living right in front of the p...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False
3031,I’m a solo traveler who was exploring Berlin. ...,25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,0.898339,False
3038,"Undoubtedly a beautiful park in the daytime, I...",25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False
3050,"Full of dealers, very insecure. Can’t walk 2 m...",25,25_drugs_drug_dealers_aggressive,"[drugs, drug, dealers, aggressive, offered, da...",[Görlitzer park is really cool and amazing. My...,drugs - drug - dealers - aggressive - offered ...,1.0,False


In [84]:
#topic_model.get_topic_info()

### Cleansing

In [153]:
dirty_keywords = ["urine", "toilet", "rubbish", "dirty", "unfortunately", "smell", "not clean", "not very clean", "not very clean"]
clean_keywords = ["clean", "clean.", "tidy", "good", "pleasant", "fresh"]

In [154]:
def classify_document(row):
    dirty_count = sum(kw in row["Document"] for kw in dirty_keywords)
    clean_count = sum(kw in row["Document"] for kw in clean_keywords)

    if dirty_count > 0:
        return "Dirty"
    elif clean_count > 0:
        return "Clean"
    else:
        return "Dirty"

In [156]:
dirty_df["Classification"] = dirty_df.apply(classify_document, axis=1)

In [157]:
dirty_df = dirty_df[dirty_df["Classification"] == 'Dirty']

In [158]:
len(dirty_df)

101

### mapping to geom

In [161]:
doc_look_up_df['look_up'] = doc_look_up_df.index
dirty_df['look_up'] = dirty_df.index

In [None]:
merged = pd.merge(left=dirty_df, right=doc_look_up_df, on='look_up', how='inner')
merged.head(4)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,Classification,look_up,orig_index,sentence
0,Unfortunately not very clean Platz!',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.370921,False,Dirty,353,4,Unfortunately not very clean Platz!'
1,Unfortunately it is very dirty in the mornings...,10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.207275,False,Dirty,832,10,Unfortunately it is very dirty in the mornings...
2,A bit of a lot of weed but nice.',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,1.0,False,Dirty,852,10,A bit of a lot of weed but nice.'
3,Unfortunately not very clean Platz!',10,10_dirty_rubbish_clean_unfortunately,"[dirty, rubbish, clean, unfortunately, littere...","[Unfortunately quite dirty .', Unfortunately o...",dirty - rubbish - clean - unfortunately - litt...,0.48177,False,Dirty,912,12,Unfortunately not very clean Platz!'


In [131]:
df['orig_index'] = df.index

In [164]:
finaldirty_df = pd.merge(left=df, right=merged, on='orig_index', how='inner')

In [171]:
finaldirty_df.namenr.value_counts().head(15)

namenr
Platz%der%Republik                                         4
Karl-Marx-Allee%70C-F                                      4
Nordhafenpark%Ost                                          4
Annemirl-Bauer-Platz                                       4
Ottopark                                                   4
Spreeuferpromenade%zw.%Jannowitzbrücke%u.%Michaelbrücke    3
Köllnischer%Park                                           3
Traveplatz                                                 3
Hansaplatz%Süd                                             3
Fischerinsel%11%an%der%Schwimmhalle                        3
Sellerpark                                                 2
Hochmeisterplatz                                           2
Spreeuferpromenade%Holsteiner%Ufer                         2
Böcklerpark                                                2
Volkspark%am%Weinbergsweg                                  2
Name: count, dtype: int64