In [35]:
import pandas as pd
from sklearn.cluster import DBSCAN
from shapely.geometry import Point, MultiPoint
import geopandas as gpd
from transformers import pipeline


In [36]:
filepath = "https://docs.google.com/spreadsheets/d/14ztIrZ3BsqPgJlEbw84kYlZykzfluZFynpK9ENTJ5UA/export?format=csv&gid=0"

# https://docs.google.com/spreadsheets/d/14ztIrZ3BsqPgJlEbw84kYlZykzfluZFynpK9ENTJ5UA/edit?gid=0#gid=0
# https://docs.google.com/spreadsheets/d/14ztIrZ3BsqPgJlEbw84kYlZykzfluZFynpK9ENTJ5UA/export?format=csv&gid=0


df = pd.read_csv(filepath)

df.head()

Unnamed: 0,Timestamp,Latitude,Longitude,Message,Opinion
0,2025-02-14 10.54.56,13842585,100571712,Risky intersection with fast moped traffic,Negative
1,2025-02-14 10.55.00,13842632,100571648,Intersection unsafe due to reckless driving,Negative
2,2025-02-14 10.55.14,13842752,100571884,High-speed cars make this intersection hazardous,Negative
3,2025-02-14 10.55.32,138427,100571696,"Vehicles racing through the intersection, used...",Negative
4,2025-02-14 10.56.33,1384258,100571664,Intersection known for accidents and speeding,Negative


In [37]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, MultiPoint
import hdbscan
from transformers import pipeline

##############################################################################
# 1) Load / Prepare Your DataFrame (df) with columns:
#    ['Timestamp', 'Latitude', 'Longitude', 'Message', 'Opinion']
##############################################################################

# Example: If you've already got df, skip these lines:
# df = pd.read_csv("your_data.csv")

# Map Opinion to numerical Value
mapping = {'Negative': -1, 'Neutral': 0, 'Positive': 1}
df['Value'] = df['Opinion'].map(mapping)

# Convert coordinates from string with commas to float
def convert_coord(coord):
    if isinstance(coord, str):
        return float(coord.replace(',', '.'))
    return float(coord)

df['Latitude'] = df['Latitude'].apply(convert_coord)
df['Longitude'] = df['Longitude'].apply(convert_coord)

##############################################################################
# 2) Summarization Pipeline: T5 with chunk-based approach
##############################################################################

# Initialize summarizer with a T5 model
summarizer = pipeline(
    "summarization",
    model="google/flan-t5-large",  # Try other T5 or Pegasus models if you prefer
    do_sample=False,               # More deterministic output
    truncation=True                # Truncate if input is too long
)

def chunk_summarize(messages, 
                    chunk_size=5, 
                    max_length=40, 
                    min_length=5):
    """
    1. Split 'messages' into smaller chunks.
    2. Summarize each chunk individually.
    3. Summarize the concatenation of those chunk summaries.
    """
    # Summaries of each chunk
    chunk_summaries = []
    
    for i in range(0, len(messages), chunk_size):
        chunk = messages[i:i+chunk_size]
        text = " ".join(chunk)
        
        # Summarize the chunk with stricter parameters
        partial_summary = summarizer(
            text, 
            max_length=max_length, 
            min_length=min_length,
            no_repeat_ngram_size=3,   # reduce repeated phrases
            early_stopping=True
        )
        chunk_summaries.append(partial_summary[0]['summary_text'])
    
    # Now summarize the concatenated chunk_summaries
    combined_text = " ".join(chunk_summaries)
    final_summary = summarizer(
        combined_text,
        max_length=max_length,
        min_length=min_length,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
    
    return final_summary[0]['summary_text']

##############################################################################
# 3) Group by Opinion, then cluster each group spatially with HDBSCAN
##############################################################################

clusters_list = []

for opinion in df['Opinion'].unique():
    sub_df = df[df['Opinion'] == opinion].copy()
    coords = sub_df[['Longitude', 'Latitude']].values
    
    # Skip if too few points
    if len(coords) < 3:
        continue
    
    # HDBSCAN clustering
    clusterer = hdbscan.HDBSCAN(min_cluster_size=3)
    sub_df['cluster'] = clusterer.fit_predict(coords)
    
    for cl in sorted(sub_df['cluster'].unique()):
        if cl == -1:  # ignore noise
            continue
        cluster_data = sub_df[sub_df['cluster'] == cl]
        
        # Build a convex hull from the points
        points = [
            Point(lon, lat) 
            for lon, lat in zip(cluster_data['Longitude'], cluster_data['Latitude'])
        ]
        if not points:
            continue
        
        hull = MultiPoint(points).convex_hull
        agg_value = cluster_data['Value'].sum()
        
        # Summarize all messages in this cluster (chunk-based)
        messages = cluster_data['Message'].tolist()
        summary_text = chunk_summarize(messages, chunk_size=5, max_length=40, min_length=5)
        
        clusters_list.append({
            'Opinion': opinion,
            'cluster': f"{opinion}_{cl}",
            'AggregateValue': agg_value,
            'Summary': summary_text,
            'geometry': hull
        })

# Convert to GeoDataFrame
clusters_gdf_opinion = gpd.GeoDataFrame(clusters_list, crs="EPSG:4326")

##############################################################################
# 4) Save the Result to GeoJSON for QGIS
##############################################################################



clusters_gdf_opinion.to_file("clusters_by_opinion_improved2.geojson", driver="GeoJSON")

# Print a preview
print("Clusters with Summaries:")
print(clusters_gdf_opinion[['Opinion','cluster','AggregateValue','Summary']])
print("\nSaved to 'clusters_by_opinion_improved.geojson'")


Your max_length is set to 40, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 40, but your input_length is only 6. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 40, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)
Your max_length is set to 40, but your input_length is only 30. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_len

PermissionError: [WinError 32] Det går inte att komma åt filen eftersom den
används av en annan process: 'clusters_by_opinion_improved.geojson'