In [3]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import MultiPoint
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer

ImportError: Unable to import required dependencies:
numpy: No module named 'numpy._utils'

In [2]:
!pip install numpy

Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/b9/c6/cd4298729826af9979c5f9ab02fcaa344b82621e7c49322cd2d210483d3f/numpy-2.2.3-cp311-cp311-win_amd64.whl.metadata
  Using cached numpy-2.2.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Using cached numpy-2.2.3-cp311-cp311-win_amd64.whl (12.9 MB)
Installing collected packages: numpy


ERROR: Could not install packages due to an OSError: [WinError 5] Åtkomst nekad: 'D:\\Anaconda3\\Lib\\site-packages\\numpy\\linalg\\_umath_linalg.cp311-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [None]:
# Source data: a Google Sheet read through its CSV export endpoint.
# Browser (edit) view of the same sheet:
# https://docs.google.com/spreadsheets/d/14ztIrZ3BsqPgJlEbw84kYlZykzfluZFynpK9ENTJ5UA/edit?gid=0#gid=0
filepath = (
    "https://docs.google.com/spreadsheets/d/"
    "14ztIrZ3BsqPgJlEbw84kYlZykzfluZFynpK9ENTJ5UA"
    "/export?format=csv&gid=0"
)

# Download and parse the sheet, then preview the first rows.
df = pd.read_csv(filepath)
df.head()

In [None]:

# -------------------------------
# 2. Convert Coordinates to Floats
# -------------------------------
# Coordinates may be written with a decimal comma; swap it for a period and
# cast to float. regex=False makes the replacement a plain literal substitution.
df['Latitude'] = df['Latitude'].astype(str).str.replace(',', '.', regex=False).astype(float)
df['Longitude'] = df['Longitude'].astype(str).str.replace(',', '.', regex=False).astype(float)

# -------------------------------
# 3. Create a GeoDataFrame & Project
# -------------------------------
# Build point geometries (x=Longitude, y=Latitude) and declare the CRS
# (WGS 84 lon/lat) at construction time instead of assigning it afterwards.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
    crs="EPSG:4326",
)

# Reproject to a metric CRS so that distances (e.g., 100 meters) are meaningful.
# The UTM zone is estimated from the extent of the data rather than hard-coded
# (previously EPSG:32647, UTM 47N), so the distances stay valid wherever the
# points are located.
gdf = gdf.to_crs(gdf.estimate_utm_crs())

# -------------------------------
# 4. Spatial Clustering (DBSCAN)
# -------------------------------
# Stack the projected x / y coordinates into an (n_points, 2) array.
coords = np.column_stack((gdf.geometry.x, gdf.geometry.y))
# eps is expressed in the units of gdf's CRS (meters after the reprojection);
# at least two neighbouring points are required to form a cluster.
spatial_db = DBSCAN(eps=100, min_samples=2)
spatial_db.fit(coords)
gdf['spatial_cluster'] = spatial_db.labels_

print("Spatial clustering results:")
print(gdf.loc[:, ['Timestamp', 'spatial_cluster']])

# -------------------------------
# 5. Semantic Text Clustering Using SentenceTransformer
# -------------------------------
# Pre-trained sentence-embedding model used by the text clustering step.
model = SentenceTransformer('all-MiniLM-L6-v2')

def text_clustering_embeddings(texts, eps=0.4, min_samples=1):
    """
    Cluster text strings using SentenceTransformer embeddings and DBSCAN.

    Relies on the module-level `model` (a SentenceTransformer) for embeddings.

    texts: Sized collection of messages. Missing values (None / NaN) are
        embedded as empty strings; other non-string values go through str().
    eps: Maximum cosine distance between samples to be considered in the same cluster.
    min_samples: Minimum number of samples in a cluster.

    Returns a 1-D NumPy integer array holding one cluster label per input text.
    Empty input yields an empty array, so the return type is always the same.
    """
    if len(texts) == 0:
        # Same type as DBSCAN's labels, so callers never have to special-case it.
        return np.empty(0, dtype=np.intp)
    # The encoder expects strings; a NaN coming from an empty cell would break it.
    cleaned = ["" if pd.isna(text) else str(text) for text in texts]
    embeddings = model.encode(cleaned)
    # Cluster using DBSCAN with cosine metric (distance = 1 - cosine similarity)
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    return clustering.fit_predict(embeddings)

# Combine spatial clusters, Opinion grouping, and semantic text clusters.
# final_cluster_ids maps each row index of gdf to its final cluster ID string.
final_cluster_ids = {}

for spatial_label, sub_gdf in gdf.groupby('spatial_cluster'):
    if spatial_label == -1:
        # Spatial noise points are not merged with anything: one unique ID each.
        for idx in sub_gdf.index:
            final_cluster_ids[idx] = f"noise_{idx}"
        continue

    # Further group by Opinion so that only points with the same opinion are
    # clustered together. dropna=False keeps rows whose Opinion is missing as
    # their own group; the default would drop them and leave them unassigned.
    for opinion, group in sub_gdf.groupby('Opinion', dropna=False):
        # Missing messages become empty strings so the encoder only sees text.
        texts = ["" if pd.isna(msg) else str(msg) for msg in group['Message']]
        # Use semantic clustering on the messages.
        text_labels = text_clustering_embeddings(texts, eps=0.4, min_samples=1)
        for idx, t_label in zip(group.index, text_labels):
            final_cluster_ids[idx] = f"sp{spatial_label}_{opinion}_t{t_label}"

# Map final cluster IDs back to the GeoDataFrame.
gdf['final_cluster'] = gdf.index.map(final_cluster_ids)

print("\nFinal clusters (spatial + opinion + semantic):")
print(gdf[['Timestamp', 'final_cluster', 'Message']])

# -------------------------------
# 6. Generate Cluster Polygons & Summaries
# -------------------------------
polygons = {}
summaries = {}

# One pass over the clusters instead of re-filtering the whole frame per ID.
# sort=False keeps clusters in order of first appearance; rows without a
# final_cluster value are left out by groupby.
for cluster_id, cluster_points in gdf.groupby('final_cluster', sort=False):
    # Convex hull of the member points. With one or two members the hull is a
    # Point or LineString rather than a Polygon.
    polygons[cluster_id] = MultiPoint(list(cluster_points.geometry)).convex_hull
    # Simple summary: join all messages in the cluster (customize as needed).
    # Missing messages are skipped and the rest cast to str, because str.join
    # raises TypeError on NaN or other non-string values.
    summaries[cluster_id] = " | ".join(cluster_points['Message'].dropna().astype(str))

clusters_gdf = gpd.GeoDataFrame({
    'final_cluster': list(polygons.keys()),
    'summary': [summaries[cid] for cid in polygons.keys()],
    'geometry': list(polygons.values())
}, crs=gdf.crs)

print("\nCluster polygons and summaries:")
print(clusters_gdf)

# -------------------------------
# 7. Plot the Results
# -------------------------------
fig, ax = plt.subplots(figsize=(10, 8))

# Cluster hulls, coloured per final cluster ID, with the raw points on top.
clusters_gdf.plot(column='final_cluster', legend=True, alpha=0.5, edgecolor='k', ax=ax)
gdf.plot(color='red', markersize=5, ax=ax)

# Label through the Axes object rather than the pyplot state machine.
ax.set_title("Final Clusters with Polygons")
ax.set_xlabel("Easting (m)")
ax.set_ylabel("Northing (m)")
plt.show()