In [1]:
import geopandas as gpd
import pandas as pd
from datetime import date
from bertopic import BERTopic
from umap import UMAP
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer

# Prototype of the day-by-day topic clustering: fit one BERTopic model per day,
# merge it into a running "global" model, and carry that day's outlier posts
# over to the next day so they get another chance to be clustered.
gdf = gpd.read_file("F:/Coding/sloyka/all_parsed_11_04_2024_parser_1.1_fixed.geojson")

gdf.text = gdf.text.astype(str)
# Keep plain `date` objects so rows can be compared against `single_date.date()` below.
gdf.date = gdf.date.dt.date

embedding_model = pipeline("feature-extraction", model="cointegrated/rubert-tiny2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
# NOTE(review): `topic_model` is never used below; every day builds its own model.
topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model)

# Define the start and end date
start_date = pd.to_datetime('2024-03-01')
end_date = pd.to_datetime('2024-03-05')
# Only referenced by the commented-out inactivity draft at the end of the notebook.
INACTIVITY_PERIOD = 7
# Generate a date range
date_range = pd.date_range(start_date, end_date, freq='D')

docs = []
# `global_docs` and `previous_topics` are only referenced by the commented-out
# inactivity draft; `outlier_list` is not used anywhere in this cell.
global_docs = []
df_list = []
outlier_list = []
df_outliers = None
global_model = None
previous_topics = None
# Loop through each day in the date range
for single_date in date_range:
    print(single_date)
    # Filter DataFrame for the current date
    current_df = gdf[gdf['date'] == single_date.date()]
    if df_outliers is not None:
        # Give the previous day's unclustered posts another chance
        current_df = pd.concat([current_df, df_outliers])
    docs = current_df.text.to_list()
    if not docs:
        # Nothing to cluster today; fitting a model on an empty list would fail
        continue

    # Create clusters for the current date
    current_topic_model = BERTopic(embedding_model=embedding_model, umap_model=umap_model).fit(docs)
    if global_model is None:
        # The first day that actually has documents seeds the global model
        global_model = current_topic_model
    else:
        # Global model is merged from previous and current models
        global_model = BERTopic.merge_models([global_model, current_topic_model], min_similarity=0.9)

    # Create df with clusters for current model
    current_df = current_topic_model.get_document_info(docs, current_df)
    # Filter outlier texts into separate df, stripped of the model columns so the
    # rows can be re-clustered together with the next day's posts
    df_outliers = current_df[current_df['Topic'] == -1].drop(columns=[
        'Document', 'Topic', 'Name', 'Representation', 'Representative_Docs',
        'Top_n_words', 'Probability', 'Representative_document'])
    current_df = current_df[current_df["Topic"] != -1]
    # Add current df without outliers to the list
    df_list.append(current_df)

if global_model is None:
    raise ValueError("No documents found between start_date and end_date")

# Create final df with outliers left
df_list.append(df_outliers)
df_clusters = pd.concat(df_list)
# Add final topics: replace the per-day topic ids with the ids of the merged model,
# matched by topic name.
# NOTE(review): assumes topic names in the merged model equal the per-day names - confirm.
final_topics = global_model.get_topic_info()
final_topics.index = final_topics.Name
df_clusters = df_clusters.drop(columns=['Topic']).join(final_topics['Topic'], on='Name')
# Rows without a matching name (including the remaining outliers) become topic -1
df_clusters['Topic'] = df_clusters['Topic'].fillna(-1).astype(int)
df_clusters = df_clusters.drop(columns=['Document', 'Top_n_words'])

In [2]:
import geopandas as gpd
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from transformers.pipelines import pipeline

class TopicModeler:
    """Cluster posts day by day with BERTopic and merge the daily models.

    For every day in the configured range a fresh BERTopic model is fitted on
    that day's texts plus the texts the previous day left as outliers. The daily
    models are merged into ``self.global_model`` with ``BERTopic.merge_models``.
    """

    def __init__(self, gdf, start_date, end_date, embedding_model_name="cointegrated/rubert-tiny2"):
        """Prepare the data and the shared model components.

        Args:
            gdf: Frame with at least a ``text`` and a ``date`` column. It is
                copied, so the caller's frame is left untouched and can be
                passed to another ``TopicModeler``.
            start_date: First day to process (anything ``pd.date_range`` accepts).
            end_date: Last day to process (inclusive).
            embedding_model_name: Model name handed to the ``feature-extraction``
                pipeline.
        """
        self.gdf = self._prepare_frame(gdf)

        self.embedding_model = pipeline("feature-extraction", model=embedding_model_name)
        # The same UMAP instance is handed to every daily BERTopic model.
        self.umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
        self.date_range = pd.date_range(start_date, end_date, freq='D')
        self.global_model = None
        self.df_outliers = None

    @staticmethod
    def _prepare_frame(gdf):
        """Return a copy of ``gdf`` with ``text`` as str and ``date`` as ``datetime.date``."""
        frame = gdf.copy()
        frame["text"] = frame["text"].astype(str)
        # Going through to_datetime keeps this working when the column already
        # holds plain date objects (e.g. the frame was prepared once before).
        frame["date"] = pd.to_datetime(frame["date"]).dt.date
        return frame

    def process_topics(self):
        """Run the daily clustering over the whole date range.

        Returns:
            DataFrame with the rows produced by ``get_document_info`` for every
            clustered post, followed by the posts that were still outliers after
            the last day. Model columns are renamed to ``cluster_name``,
            ``cluster_id`` and ``cluster_probability``; rows whose name has no
            match in the merged model get ``cluster_id`` -1.

        Raises:
            ValueError: If no day in the range has any document.
        """
        # Start from a clean state so a second call does not inherit the
        # outliers or the model of a previous run.
        self.global_model = None
        self.df_outliers = None
        df_list = []

        # Loop through each day in the date range
        for single_date in self.date_range:
            print(single_date)
            # Filter DataFrame for the current date
            current_df = self.filter_data_by_date(single_date.date())
            if self.df_outliers is not None:
                # Give the previous day's unclustered posts another chance
                current_df = pd.concat([current_df, self.df_outliers])
            docs = current_df.text.to_list()
            if not docs:
                # Nothing to cluster today; fitting on an empty list would fail
                continue
            # Create clusters for the current date
            current_topic_model = self.create_topic_model(docs)

            if self.global_model is None:
                # The first day that actually has documents seeds the global model
                self.global_model = current_topic_model
            else:
                self.global_model = BERTopic.merge_models(
                    [self.global_model, current_topic_model], min_similarity=0.9)

            current_df, self.df_outliers = self.handle_clusters(current_df, current_topic_model, docs)
            df_list.append(current_df)

        if self.global_model is None:
            raise ValueError("No documents found in the configured date range")

        # Create final df with outliers left
        df_list.append(self.df_outliers)
        # Add final topics: swap the per-day topic ids for the ids of the merged
        # model, matched by topic name.
        # NOTE(review): assumes names in the merged model equal the per-day names - confirm.
        topic_ids = self.global_model.get_topic_info().set_index("Name")["Topic"]
        df_clusters = pd.concat(df_list).drop(columns=['Topic']).join(topic_ids, on='Name')
        # Rows without a matching name (including the remaining outliers) become -1
        df_clusters['Topic'] = df_clusters['Topic'].fillna(-1).astype(int)
        df_clusters = df_clusters.drop(columns=[
            'Document', 'Representation', 'Representative_Docs',
            'Top_n_words', 'Representative_document'])
        return df_clusters.rename(columns={
            'Name': 'cluster_name', 'Topic': 'cluster_id', 'Probability': 'cluster_probability'})

    def filter_data_by_date(self, single_date):
        """Return the rows of ``self.gdf`` whose ``date`` equals ``single_date``."""
        return self.gdf[self.gdf['date'] == single_date]

    def create_topic_model(self, docs):
        """Fit and return a new BERTopic model on ``docs`` using the shared embedding and UMAP models."""
        return BERTopic(embedding_model=self.embedding_model, umap_model=self.umap_model).fit(docs)

    def handle_clusters(self, df, topic_model, docs):
        """Split one day's documents into clustered rows and outlier rows.

        Args:
            df: Frame the documents came from, passed to ``get_document_info``.
            topic_model: Model fitted on ``docs``.
            docs: The texts the model was fitted on.

        Returns:
            Tuple ``(clustered, outliers)``: ``clustered`` keeps the model columns,
            ``outliers`` (topic -1) has them dropped so the rows can be fed into
            the next day's model.
        """
        # Create df with clusters for current model
        df = topic_model.get_document_info(docs, df)
        # Filter outlier texts into separate df
        df_outliers = df[df['Topic'] == -1].drop(columns=[
            'Document', 'Topic', 'Name', 'Representation', 'Representative_Docs',
            'Top_n_words', 'Probability', 'Representative_document'])
        df = df[df["Topic"] != -1]
        return df, df_outliers

# Example run: cluster the posts published between the two dates (inclusive)
start_date, end_date = '2024-03-01', '2024-03-05'
gdf = gpd.read_file("F:/Coding/all_parsed_11_04_2024_parser_1.1_fixed.geojson")
topic_modeler = TopicModeler(gdf=gdf, start_date=start_date, end_date=end_date)
df_clusters = topic_modeler.process_topics()


2024-03-01 00:00:00
2024-03-02 00:00:00
2024-03-03 00:00:00
2024-03-04 00:00:00
2024-03-05 00:00:00


In [9]:
# Display the per-post cluster assignments returned by TopicModeler.process_topics()
df_clusters

Unnamed: 0,level_0,index,date,id,text,views.count,likes.count,reposts.count,type,link,...,full_street_name,location_options,addr_to_geocode,only_full_street_name,only_full_street_name_numbers,Location,geometry,cluster_name,cluster_probability,cluster_id
0,5996,47,2024-03-01,27739,"УСТРАНЕНИЕ ПРОДУВАНИЙ, ПРОТЕЧЕК ОКОН, ЛОДЖИЙ. ...",,0.0,0.0,post,https://vk.com/teploeosteklenie812[Ссылка]Замена,...,,[],,,,,,0_на_не_по_за,1.0,0
1,7100,14,2024-03-01,50912,ВНИМАНИЕ! ПОМОГИТЕ НАЙТИ ЧЕЛОВЕКА!Максимальный...,5142.0,50.0,20.0,post,,...,,[],,,,,,0_на_не_по_за,1.0,0
2,12865,50,2024-03-01,331147,"Мелочь, а приятно! , недавно обнаруженную нами...",2294.0,52.0,3.0,post,https://vk.com/wall-129354225_329416|Кроличью,...,"улица Восстания 30 Санкт-Петербург Россия,площ...","[улица Восстания 30 Санкт-Петербург Россия, пл...",улица Восстания 30 Санкт-Петербург Россия,улица Восстания,улица Восстания 30,"33, 30, улица Восстания, Литейный округ, Санкт...",POINT (30.36028 59.94012),0_на_не_по_за,1.0,0
3,12865,50,2024-03-01,331147,"Мелочь, а приятно! , недавно обнаруженную нами...",2294.0,52.0,3.0,post,https://vk.com/wall-129354225_329416|Кроличью,...,"улица Восстания 30 Санкт-Петербург Россия,площ...","[улица Восстания 30 Санкт-Петербург Россия, пл...",улица Восстания 30 Санкт-Петербург Россия,площадь Восстания,площадь Восстания 30,"33, 30, улица Восстания, Литейный округ, Санкт...",POINT (30.36028 59.94012),0_на_не_по_за,1.0,0
4,12865,50,2024-03-01,331147,"Мелочь, а приятно! , недавно обнаруженную нами...",2294.0,52.0,3.0,post,https://vk.com/wall-129354225_329416|Кроличью,...,"улица Восстания 30 Санкт-Петербург Россия,площ...","[улица Восстания 30 Санкт-Петербург Россия, пл...",площадь Восстания 30 Санкт-Петербург Россия,улица Восстания,улица Восстания 30,"площадь Восстания, округ Лиговка-Ямская, Санкт...",POINT (30.36191 59.93077),0_на_не_по_за,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,239168,90,2024-03-05,220705,ИНТЕРЕСНЫЕ МЕСТА РОССИИ. Форт-гигант Тотлебен ...,,2.0,1.0,post,https://pohod-vosemvrat.livejournal.com/328135...,...,"Первомайский переулок Санкт-Петербург Россия,...",[Первомайский переулок Санкт-Петербург Россия...,Первомайский переулок Санкт-Петербург Россия,Первомайский переулок,Первомайский переулок,"Первомайский переулок, Александровская, Санкт-...",POINT (30.33281 59.72935),,,-1
435,239168,90,2024-03-05,220705,ИНТЕРЕСНЫЕ МЕСТА РОССИИ. Форт-гигант Тотлебен ...,,2.0,1.0,post,https://pohod-vosemvrat.livejournal.com/328135...,...,"Первомайский переулок Санкт-Петербург Россия,...",[Первомайский переулок Санкт-Петербург Россия...,Первомайский переулок Санкт-Петербург Россия,Первомайский проспект,Первомайский проспект,"Первомайский переулок, Александровская, Санкт-...",POINT (30.33281 59.72935),,,-1
436,239168,90,2024-03-05,220705,ИНТЕРЕСНЫЕ МЕСТА РОССИИ. Форт-гигант Тотлебен ...,,2.0,1.0,post,https://pohod-vosemvrat.livejournal.com/328135...,...,"Первомайский переулок Санкт-Петербург Россия,...",[Первомайский переулок Санкт-Петербург Россия...,Первомайский проспект Санкт-Петербург Россия,Первомайский переулок,Первомайский переулок,"Первомайский проспект, Коломяги, округ Коломяг...",POINT (30.30398 60.03010),,,-1
437,239168,90,2024-03-05,220705,ИНТЕРЕСНЫЕ МЕСТА РОССИИ. Форт-гигант Тотлебен ...,,2.0,1.0,post,https://pohod-vosemvrat.livejournal.com/328135...,...,"Первомайский переулок Санкт-Петербург Россия,...",[Первомайский переулок Санкт-Петербург Россия...,Первомайский проспект Санкт-Петербург Россия,Первомайский проспект,Первомайский проспект,"Первомайский проспект, Коломяги, округ Коломяг...",POINT (30.30398 60.03010),,,-1


In [None]:
#  # Check clusters for updates
#     current_topics = global_model.get_topic_info()[['Name', 'Count']]
#     if previous_topics is None:
#         # Counter of inactivity is at 0 at the first iteration
#         current_topics['inactivity_count'] = 0
#         previous_topics = current_topics.copy()
#     else:
#         # Check for updates
#         current_topics = current_topics.merge(previous_topics[['Count', 'Name']], 
#             how='inner', on='Name', suffixes=(None, '_prev'))
#         current_topics['count_difference'] = current_topics['Count'] - current_topics['Count_prev']
#         current_topics.loc[current_topics['count_difference'] == 0, 'inactivity_count'] += 1
#         # Drop clusters with no activity for the inactivity period
#         current_topics = current_topics[current_topics['inactivity_count'] < INACTIVITY_PERIOD]
#         # Remove these clusters from the texts' attributes
#         df_release = pd.concat(df_list)
#         df_release.loc[df_release['Name'].isin(current_topics['Name'].to_list()), 'Topic'] = None
#     global_docs = global_docs + docs