In [7]:
import requests
import urllib.parse
import json
import pandas as pd
from requests_html import HTMLSession
import random



def _get_source(url: str, timeout: float = 10.0):
    """Fetch ``url`` using a randomly chosen browser User-Agent header.

    Args:
        url: URL to request with HTTP GET.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The response returned by ``session.get``, or ``None`` if the request
        raised ``requests.exceptions.RequestException`` (the error is printed).
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        'Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 14_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
    ]

    try:
        # One session per call was previously left open; the context manager
        # closes it once the (fully downloaded) response has been obtained.
        with HTMLSession() as session:
            session.headers.update({'User-Agent': random.choice(user_agents)})
            return session.get(url, timeout=timeout)

    except requests.exceptions.RequestException as e:
        print(e)
        return None


def _get_results(query: str):
    """Request Google autocomplete suggestions for ``query`` and decode them.

    The query is URL-encoded and sent to the ``suggestqueries.google.com``
    endpoint (``output=chrome``, ``hl=de``).

    Args:
        query: Search term to get suggestions for.

    Returns:
        The decoded JSON payload, or ``None`` when no response was obtained
        or the response body is not valid JSON.
    """
    encoded_query = urllib.parse.quote_plus(query)
    response = _get_source("https://suggestqueries.google.com/complete/search?output=chrome&hl=de&q=" + encoded_query)

    # _get_source yields None after a failed request; reading `.text` from it
    # would raise AttributeError.
    if response is None:
        return None

    try:
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        # The body was not JSON (e.g. an error page instead of suggestions).
        print(e)
        return None


def _format_results(results: dict):

    if results:
        suggestions = []
        for index, value in enumerate(results[1]):
            suggestion = {'term': value, 'relevance': results[4]['google:suggestrelevance'][index]}
            suggestions.append(suggestion)
        return suggestions


def _get_suggestions(query: str):
    """Return the autocomplete suggestions for ``query``, most relevant first.

    Args:
        query: Search term to get suggestions for.

    Returns:
        A list of suggestion dicts sorted by their ``'relevance'`` value in
        descending order; empty when nothing could be formatted.
    """
    # Guard against a None result from formatting, which sorted() cannot take.
    suggestions = _format_results(_get_results(query)) or []
    return sorted(suggestions, key=lambda k: k['relevance'], reverse=True)


def _get_expanded_term_suffixes():

    expanded_term_suffixes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
                              'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

    return expanded_term_suffixes


def _get_expanded_term_prefixes():

    expanded_term_prefixes = ['wer ist *', 'was ist *', 'wo ist *', 'wann kann *', 'warum ist *',
                            'wie man *', 'beste', 'günstig', 'schlechteste', 'ist', 'was', 'wann',
                            'warum', 'wie', 'wer']

    
    return expanded_term_prefixes


def _get_expanded_terms(query: str):
    """Build every query variant derived from ``query``.

    Args:
        query: Base search term.

    Returns:
        A list that starts with ``query`` itself, followed by one
        ``"<prefix> <query>"`` entry per prefix and then one
        ``"<query> <suffix>"`` entry per suffix.
    """
    with_prefix = [prefix + ' ' + query for prefix in _get_expanded_term_prefixes()]
    with_suffix = [query + ' ' + suffix for suffix in _get_expanded_term_suffixes()]

    return [query, *with_prefix, *with_suffix]


def _get_expanded_suggestions(query: str):
    """Collect suggestions for ``query`` and all of its expanded variants.

    Args:
        query: Base search term.

    Returns:
        A single list with the suggestion dicts of every expanded term,
        sorted by ``'relevance'`` in descending order.
    """
    all_results = []

    for term in _get_expanded_terms(query):
        # A term without usable results must not abort the whole expansion:
        # concatenating None to the list would raise TypeError.
        all_results.extend(_format_results(_get_results(term)) or [])

    # Sort once after collecting. sorted() is stable, so the final order is
    # the same as re-sorting the growing list on every iteration.
    return sorted(all_results, key=lambda k: k['relevance'], reverse=True)


def google_autocomplete(query: str, include_expanded=True):
    """Return autocomplete suggestions for ``query`` as a DataFrame.

    Args:
        query: Search term to get suggestions for.
        include_expanded: When truthy, suggestions for the expanded
            prefix/suffix variants of ``query`` are gathered; otherwise only
            ``query`` itself is looked up.

    Returns:
        A ``pandas.DataFrame`` built from the suggestion records.
    """
    fetch = _get_expanded_suggestions if include_expanded else _get_suggestions
    return pd.DataFrame.from_records(fetch(query))


In [None]:
# Seed keywords; each one is passed to google_autocomplete in the cell below.
# Leftover note from an earlier run ("data 1"):
#   hdbscan_model = HDBSCAN(min_cluster_size=20, min_samples=20, prediction_data=True, gen_min_span_tree=True)
# NOTE(review): "Ooptics" looks like a typo for "Optics" — confirm before re-running.
# NOTE(review): the saved output of the next cell lists different keywords, so
# this list appears to have been edited after the last run.
related_keywords = [
   'brille',
   "brillen",
   "optiker",
   "Aigner United Ooptics",
   "United Ooptics",
   
] 

In [9]:
# One DataFrame of autocomplete suggestions per keyword that could be fetched.
all_suggestions = []

for keyword in related_keywords:
  print(keyword)
  try:
    # google_autocomplete already returns a DataFrame; no re-wrapping needed.
    suggestions = google_autocomplete(keyword, include_expanded=True)
  except Exception as exc:
    # Best effort: a failing keyword is skipped, but the reason is shown.
    # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
    print(f"  skipped {keyword!r}: {exc!r}")
    continue
  all_suggestions.append(suggestions)




brillenmarke
sehhilfen
brillenvergleich
brillenmode
brillenangebot
brillen
brillenglas
fahrbrille
brilleninnovation
brillenstil
glasreparatur
modebrillen
brilleonline
brillenanpassung
sehstärke
sichtgläser
kontaktlinsenpflege
brillenfürsportler
sonnenbrille
sehhilfenfürkinder
brillenservice
optiker
brillenblog
kontaktlinsen
brillenhersteller
brillenetui
brillenauswahl
gläser
fassungen
lesebrille
optiken
lesebrillen
augenoptik
brillenverleih
sportbrillen
schutzglas
sicherheitsbrille
sonnenbrillen
brilleninspiration
brillenreview
brillenfassung
brillenkollektionen
brillenaccessoires
brillenkollektion
augenarzt
brillencommunity
schwimmbrille
optische gläser
brillengestell
brillenreiniger
gleitsichtbrille
skibrille
augenpflege
korrigierende brille
brillenzubehör
sicherheitsbrillen
sehhilfe
brillendesign
schutzbrille
brillenfarbe
kinderbrille
designerbrille
brillenrabatt
brillenputztuch
arbeitsbrille
dioptrien
brillenversicherung
brillenpromotion
lupe
linsen
brillenband
computerbrille
optik

In [34]:
# Stack the per-keyword suggestion frames into one frame with a fresh index.
big_dataframe = pd.concat(objs=all_suggestions, ignore_index=True)


In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
import nltk
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


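# Good results were obtained with the following settings
# (parameter dump pasted from a fitted model; both lines are truncated):
#   'umap_model': UMAP(min_dist=0.03, n_components=3, n_neighbors=3, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} ...
#   'hdbscan_model': HDBSCAN(gen_min_span_tree=True, min_cluster_size=10, min_samples=5, ...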


In [None]:
# Dimensionality reduction and clustering models handed to BERTopic.
# UMAP is stochastic: a fixed random_state makes repeated runs produce the
# same reduced embedding and therefore the same topics.
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.03, random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=5, prediction_data=True, gen_min_span_tree=True)  # tunable

In [None]:
# Fetch the NLTK 'stopwords' corpus; the German list is read from it further down.
nltk.download('stopwords')

In [None]:
# Keep the word list under its own name: rebinding `stopwords` would shadow
# the nltk.corpus reader imported above and make this cell fail when re-run.
german_stopwords = list(stopwords.words("german"))

# NOTE(review): the terms are German; a multilingual sentence-transformers
# model may embed them better than this one — confirm before changing.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Topic representations are built from unigrams and bigrams without stop words.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=german_stopwords)

# NOTE(review): presumably `language` has no effect once an explicit
# embedding_model is supplied — confirm against the BERTopic documentation.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    language="german",
    calculate_probabilities=True,
    verbose=True
    )

# fit_transform is documented to take a list of strings, so the column is
# converted explicitly instead of passing the pandas Series.
topics, probs = model.fit_transform(big_dataframe["term"].tolist())

https://github.com/MaartenGr/BERTopic

TF-IDF: https://www.youtube.com/watch?v=vZAXpvHhQow&t=423s&ab_channel=DataScienceGarage

Code (BERTopic): https://www.youtube.com/watch?v=fb7LENb9eag&ab_channel=JamesBriggs
Code (API): https://github.com/practical-data-science/ecommercetools#7-get-keyword-suggestions-from-google-autocomplete
