In [1]:
# activate autoreload
%load_ext autoreload
%autoreload 2

# check if session is in Google Colab
try:
    import google.colab
    IN_COLAB = True
    print('Google Colab session!')
except:
    IN_COLAB = False
    print('Not a Google Colab session.')

# add src path to the notebook
import os
import sys
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT: str = '/content/drive/MyDrive/papers/2025b_relevance_2.0'
else:
    PROJECT_ROOT: str = os.path.dirname(os.path.abspath(os.path.dirname("__file__")))
if PROJECT_ROOT not in sys.path:
    sys.path.append(os.path.join(PROJECT_ROOT))
print(PROJECT_ROOT)

Not a Google Colab session.
/mnt/c/Users/DavidHanny/OneDrive - IT U interdisciplinary transformation university austria/Documents/projects/papers/2025_GSAI_RES_LLM_Contextual_Predictions


In [5]:
!nvidia-smi

Fri Apr 25 06:40:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.03             Driver Version: 572.60         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A500 Laptop GPU     On  |   00000000:01:00.0 Off |                  N/A |
| N/A   54C    P0              9W /   20W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# download required spacy models
!python -m spacy download en_core_web_trf
!python -m spacy download en_core_web_lg
!python -m spacy download xx_ent_wiki_sm

In [None]:
# download gazetteer
!python -m src.irchel_geoparser download geonames

# **Bluesky Data Collection**
In this notebook, we collect social media data from Bluesky regarding two events:
- the January 2025 Southern California (SoCal) wildfires
- the September 2024 Western Europe floods

In [None]:
import gc
import json
import torch
import time
import spacy
import numpy as np
import pandas as pd
import geopandas as gpd
import src.bsky_search as bsky
from typing import List, Dict
from datetime import datetime, timedelta
from pathlib import Path
from tqdm.auto import tqdm
from shapely.geometry import MultiPoint
from src.irchel_geoparser.geoparser import Geoparser
from src.location_extraction.spacy_ner import extract_locations_spacy
from src.location_extraction.roberta_ner import extract_named_entities_roberta
tqdm.pandas()

# print available GPUs
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"[{i}] {torch.cuda.get_device_name(i)}")
else:
    print("No CUDA GPUs detected.")

DATA_PATH: str = os.path.join(PROJECT_ROOT, 'data')
print(DATA_PATH)

## **1. SoCal wildfires**
We start off with the Southern California wildfires. Since Bluesky does not allow for effective search based on location, we need to use a combination of temporal and keyword filters.

$\rightarrow$ This was moved to `/scripts/crawl_bsky_socal_wildfires.py`.

## **2. Western Europe floods**
Next, we can do the equivalent for the 2024 floods in Western Europe.

$\rightarrow$ This was moved to `/scripts/crawl_bsky_europe_floods.py`.

## **3. Parsing**
I fear I have messed up the saving of the parsed results to CSV. But I do have the raw JSON results. So let's perhaps parse that again manually.

In [8]:
# INPUT_PATH: str = os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'bsky_central_europe_raw.v2.json')
# OUTPUT_PATH: str = os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'bsky_central_europe_df.parquet')

INPUT_PATH: str = os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_raw.v2.json')
OUTPUT_PATH: str = os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df.parquet')

In [9]:
with open(INPUT_PATH, 'r') as file:
	raw_posts = json.load(file)
print(type(raw_posts))
print(len(raw_posts))
print(raw_posts[:10])

<class 'list'>
600206
[{'uri': 'at://did:plc:jodd33f7ckgxejmuvn2jx2mw/app.bsky.feed.post/3ldz6vd6o6k2u', 'cid': 'bafyreiaplar6sf427q46nwwc6fuvipcdtre7xieg4rbn5qarzrgyidchfy', 'author': {'did': 'did:plc:jodd33f7ckgxejmuvn2jx2mw', 'handle': 'rsutibu.bsky.social', 'displayName': 'rSutibu (Closed Comms/ Slots:0/8)', 'avatar': 'https://cdn.bsky.app/img/avatar/plain/did:plc:jodd33f7ckgxejmuvn2jx2mw/bafkreicq5hn7ginarnfoprtq4hq3f6qkvpvwpmnwu5r7i637cufzsh4hoy@jpeg', 'associated': {'chat': {'allowIncoming': 'all'}}, 'labels': [], 'createdAt': '2023-12-31T20:09:05.421Z'}, 'record': {'$type': 'app.bsky.feed.post', 'createdAt': '2024-12-24T00:29:59.418Z', 'langs': ['en'], 'reply': {'parent': {'cid': 'bafyreiaxto7bnoivob7f5frvtqld7ae2cio6noepknwetzkfmh7daval6i', 'uri': 'at://did:plc:7mlaytba5ctn2sbd4ulvdu2u/app.bsky.feed.post/3ldz4u4wiyc2m'}, 'root': {'cid': 'bafyreifymdgcqnqfakmjfpgig2lo3zu7q6qmxht4r76z5ngclyhycl43su', 'uri': 'at://did:plc:jodd33f7ckgxejmuvn2jx2mw/app.bsky.feed.post/3ldyonjgjdk2h'

In [10]:
# parse tha raw posts
parsed_posts: List[Dict] = [bsky.parse_post(post) for post in raw_posts]
parsed_posts_df: pd.DataFrame = pd.DataFrame.from_dict(parsed_posts)

# drop duplicate entries
parsed_posts_df = parsed_posts_df.drop_duplicates('cid')

# remove empty posts
parsed_posts_df = parsed_posts_df[parsed_posts_df['text'].astype(str).str.len() > 0]

# and store the results´
print(parsed_posts_df.shape)
parsed_posts_df.to_parquet(OUTPUT_PATH, index=False)
parsed_posts_df

(570570, 17)


Unnamed: 0,cid,uri,author_displayName,author_handle,author_did,createdAt,langs,text,replyCount,repostCount,likeCount,quoteCount,reply_parent_cid,reply_root_cid,image_thumbnails,image_fullsizes,urls
0,bafyreiaplar6sf427q46nwwc6fuvipcdtre7xieg4rbn5...,at://did:plc:jodd33f7ckgxejmuvn2jx2mw/app.bsky...,rSutibu (Closed Comms/ Slots:0/8),rsutibu.bsky.social,did:plc:jodd33f7ckgxejmuvn2jx2mw,2024-12-24T00:29:59.418Z,[en],warming... fire!?,1,0,1,0,bafyreiaxto7bnoivob7f5frvtqld7ae2cio6noepknwet...,bafyreifymdgcqnqfakmjfpgig2lo3zu7q6qmxht4r76z5...,[],[],[]
1,bafyreibaovld72qis6wd5xmu2owmw7o2eawpcd4k7e63o...,at://did:plc:3rxcbck5ni5ldif3pvj4ricw/app.bsky...,Nellie Pennington,whoa-nellie.bsky.social,did:plc:3rxcbck5ni5ldif3pvj4ricw,2024-12-24T00:29:47.972Z,[en],That time I had hot chocolate with the snowman...,0,0,3,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreib2zuex4hywb5v3jntsl5b2vdyiambaib5nxswq...
2,bafyreihrauazmbzzuwxgiam5kvm3iygepclphl57kexrt...,at://did:plc:jedlel7nmsgraeenznupxrf3/app.bsky...,Saint D'Khari,saintdkhari.bsky.social,did:plc:jedlel7nmsgraeenznupxrf3,2024-12-24T00:29:41.279Z,[en],Oh yeah that sounds fire,0,0,1,0,bafyreihqshfmmile24ungwpbnxs2z2erhvrwrclk27uei...,bafyreih3hppo5jliwmrpzf7kzvc3ob62bzno5nq5noyj4...,[],[],[]
3,bafyreif2n4ryjuxtawo52giqu4rbxmxkohnq76y5wnhkm...,at://did:plc:skgjhoxiu2upm4e57l2ld3li/app.bsky...,Tilly,hortense1177bce.bsky.social,did:plc:skgjhoxiu2upm4e57l2ld3li,2024-12-24T00:29:27.405Z,[en],I prefer being here to Twitter but I also feel...,2,0,2,0,,,[],[],[]
4,bafyreieqwf6wnjkf3vy2aoqyqroa5vqcdnjc3cec6nahc...,at://did:plc:cl37gznnibucnz6dnsqv2vpr/app.bsky...,elliott junkyard,transarchivist.bsky.social,did:plc:cl37gznnibucnz6dnsqv2vpr,2024-12-24T00:29:19.501Z,[en],this whole thread is fire ❤️‍🔥,0,0,2,0,,,[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600201,bafyreiblckmpnru7ibeurj5dlwzzudutomwljwkc7artw...,at://did:plc:kuhx6uckdmd465x4bumqxkdk/app.bsky...,Facundo Dell Aqua,facundodellaqua.bsky.social,did:plc:kuhx6uckdmd465x4bumqxkdk,2025-02-14T23:43:52.610Z,[es],"Jajajajajaja, el final es una fatality",0,0,2,0,bafyreih2rory3qauljgas5ojkjk6b3h6o5gpbz4rfdz6u...,bafyreifemihlizfbq55lkj7fne6xlvpi7pynyfatnty3v...,[],[],[]
600202,bafyreifwy5xwesv5kczu4mgclv3ptyuzu6vjzxoxac5mh...,at://did:plc:o5bzpjeblsmh45q2hpwswh56/app.bsky...,,devrawiz.bsky.social,did:plc:o5bzpjeblsmh45q2hpwswh56,2025-02-14T23:41:00.396Z,[en],It looks like mobile homes and bulldozers are ...,0,1,2,0,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,[],[],[]
600203,bafyreicckfaghaqchbmtre4w3vknpd6qac2xwmffprzvv...,at://did:plc:hslirr7h6d7livppaxty435b/app.bsky...,Thiard News@F4F,newsen.bsky.social,did:plc:hslirr7h6d7livppaxty435b,2025-02-14T23:58:45Z,,Forecast for Indoor Air Quality Solution Marke...,0,0,0,0,,,[],[],[https://third-news.com/article/a020f3bc-eb2f-...
600204,bafyreigqs4y4we4rhzh6modc5tjmm2ek677rfrjihofpb...,at://did:plc:h6vtwq2qkazjor6uykvdcurp/app.bsky...,"PA Environment Digest, By David E. Hess, Forme...",paenvironmentnews.bsky.social,did:plc:h6vtwq2qkazjor6uykvdcurp,2025-02-14T23:54:11.044Z,[en],DEP Sets March 19 Hearing On RACT 3 Air Qualit...,0,0,0,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreicq6mdtr7fykcvfja2rdzi5mringpiebp5reldw...


## **4. Location extraction**
Okay nice! We made it. Now let's look at the data in more detail and try to extract locations from the text.

In [5]:
europe_floods_df: pd.DataFrame = pd.read_parquet(
    os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'bsky_central_europe_df.parquet')
)
socal_wildfires_df: pd.DataFrame = pd.read_parquet(
    os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df.parquet')
)
print(europe_floods_df.shape)
print(socal_wildfires_df.shape)

# fix the language column
for df in [europe_floods_df, socal_wildfires_df]:
    df['language'] = df['langs'].apply(lambda x: x[0] if isinstance(x, np.ndarray) and len(x) > 0 else np.nan)
    df['language'] = df['language'].apply(lambda x: x[:2] if isinstance(x, str) and len(x) > 2 else x)

Initialising custom Geoparser
(105767, 17)
(570570, 17)


In [6]:
for name, df in [('2024 Central Europe floods', europe_floods_df), ('2025 SoCal wildfires', socal_wildfires_df)]:
    language_stats: pd.DataFrame = pd.DataFrame(df['language'].value_counts())
    language_stats = language_stats[language_stats['count'] > 100]
    print(f"{len(language_stats)} languages with more than 100 posts in {name}:")
    print(language_stats.index.tolist())
    print(language_stats)

11 languages with more than 100 posts in 2024 Central Europe floods:
['en', 'es', 'pt', 'de', 'nl', 'fr', 'ja', 'sv', 'ca', 'cs', 'pl']
          count
language       
en        57075
es        17159
pt        11157
de        10830
nl          754
fr          541
ja          438
sv          214
ca          195
cs          156
pl          137
20 languages with more than 100 posts in 2025 SoCal wildfires:
['en', 'pt', 'es', 'fr', 'de', 'ja', 'it', 'nl', 'nb', 'id', 'da', 'sv', 'pl', 'ca', 'tr', 'fi', 'ar', 'uk', 'zh', 'ru']
           count
language        
en        421379
pt         22687
es          4580
fr          3345
de          2817
ja          1410
it           902
nl           782
nb           600
id           419
da           339
sv           295
pl           273
ca           261
tr           187
fi           152
ar           122
uk           119
zh           118
ru           118


### **4.1. Irchel Geoparser**
Diego Gomes from the University of Zurich has developed a quite nice library for Geoparsing. Let's see if it works for our purposes. I needed to adapt the source code a bit to mitigate some errors and introduce garbage collection. However, it did not fix the problem entirely. For the SoCal wildfire data, I still needed to split up the dataframe.

In [3]:
socal_wildfires_df: pd.DataFrame = pd.read_parquet(
    os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df.parquet')
)
n: int = len(socal_wildfires_df)
half = n // 2

socal_wildfires_df.iloc[:half].to_parquet(
    os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df_part1.parquet'),
    index=False
)
socal_wildfires_df.iloc[half:].to_parquet(
    os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df_part2.parquet'),
    index=False
)

Subsequently, I was able to run geoparsing with the script in `scripts/geoparsing/geoparser.py` on our HPC cluster. Its usage is explained in the readme.

Let's see how it did.

In [21]:
europe_floods_df: pd.DataFrame = pd.read_parquet(
    os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'bsky_central_europe_df_geocoded.parquet')
)
socal_wildfires_df: pd.DataFrame = pd.concat([
    pd.read_parquet(os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df_part1_geocoded.parquet')),
    pd.read_parquet(os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_df_part2_geocoded.parquet'))
])
print(europe_floods_df.shape)
print(socal_wildfires_df.shape)


# fix the language column
for df in [europe_floods_df, socal_wildfires_df]:
    df['language'] = df['langs'].apply(lambda x: x[0] if isinstance(x, np.ndarray) and len(x) > 0 else np.nan)
    df['language'] = df['language'].apply(lambda x: x[:2] if isinstance(x, str) and len(x) > 2 else x)
socal_wildfires_df

(105767, 18)
(570570, 18)


Unnamed: 0,cid,uri,author_displayName,author_handle,author_did,createdAt,langs,text,replyCount,repostCount,likeCount,quoteCount,reply_parent_cid,reply_root_cid,image_thumbnails,image_fullsizes,urls,geocoded_dict,language
0,bafyreiaplar6sf427q46nwwc6fuvipcdtre7xieg4rbn5...,at://did:plc:jodd33f7ckgxejmuvn2jx2mw/app.bsky...,rSutibu (Closed Comms/ Slots:0/8),rsutibu.bsky.social,did:plc:jodd33f7ckgxejmuvn2jx2mw,2024-12-24T00:29:59.418Z,[en],warming... fire!?,1,0,1,0,bafyreiaxto7bnoivob7f5frvtqld7ae2cio6noepknwet...,bafyreifymdgcqnqfakmjfpgig2lo3zu7q6qmxht4r76z5...,[],[],[],"{'text': 'warming... fire!?', 'toponyms': []}",en
1,bafyreibaovld72qis6wd5xmu2owmw7o2eawpcd4k7e63o...,at://did:plc:3rxcbck5ni5ldif3pvj4ricw/app.bsky...,Nellie Pennington,whoa-nellie.bsky.social,did:plc:3rxcbck5ni5ldif3pvj4ricw,2024-12-24T00:29:47.972Z,[en],That time I had hot chocolate with the snowman...,0,0,3,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreib2zuex4hywb5v3jntsl5b2vdyiambaib5nxswq...,{'text': 'That time I had hot chocolate with t...,en
2,bafyreihrauazmbzzuwxgiam5kvm3iygepclphl57kexrt...,at://did:plc:jedlel7nmsgraeenznupxrf3/app.bsky...,Saint D'Khari,saintdkhari.bsky.social,did:plc:jedlel7nmsgraeenznupxrf3,2024-12-24T00:29:41.279Z,[en],Oh yeah that sounds fire,0,0,1,0,bafyreihqshfmmile24ungwpbnxs2z2erhvrwrclk27uei...,bafyreih3hppo5jliwmrpzf7kzvc3ob62bzno5nq5noyj4...,[],[],[],"{'text': 'Oh yeah that sounds fire', 'toponyms...",en
3,bafyreif2n4ryjuxtawo52giqu4rbxmxkohnq76y5wnhkm...,at://did:plc:skgjhoxiu2upm4e57l2ld3li/app.bsky...,Tilly,hortense1177bce.bsky.social,did:plc:skgjhoxiu2upm4e57l2ld3li,2024-12-24T00:29:27.405Z,[en],I prefer being here to Twitter but I also feel...,2,0,2,0,,,[],[],[],{'text': 'I prefer being here to Twitter but I...,en
4,bafyreieqwf6wnjkf3vy2aoqyqroa5vqcdnjc3cec6nahc...,at://did:plc:cl37gznnibucnz6dnsqv2vpr/app.bsky...,elliott junkyard,transarchivist.bsky.social,did:plc:cl37gznnibucnz6dnsqv2vpr,2024-12-24T00:29:19.501Z,[en],this whole thread is fire ❤️‍🔥,0,0,2,0,,,[],[],[],"{'text': 'this whole thread is fire ❤️‍🔥', 'to...",en
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285280,bafyreiblckmpnru7ibeurj5dlwzzudutomwljwkc7artw...,at://did:plc:kuhx6uckdmd465x4bumqxkdk/app.bsky...,Facundo Dell Aqua,facundodellaqua.bsky.social,did:plc:kuhx6uckdmd465x4bumqxkdk,2025-02-14T23:43:52.610Z,[es],"Jajajajajaja, el final es una fatality",0,0,2,0,bafyreih2rory3qauljgas5ojkjk6b3h6o5gpbz4rfdz6u...,bafyreifemihlizfbq55lkj7fne6xlvpi7pynyfatnty3v...,[],[],[],"{'text': 'Jajajajajaja, el final es una fatali...",es
285281,bafyreifwy5xwesv5kczu4mgclv3ptyuzu6vjzxoxac5mh...,at://did:plc:o5bzpjeblsmh45q2hpwswh56/app.bsky...,,devrawiz.bsky.social,did:plc:o5bzpjeblsmh45q2hpwswh56,2025-02-14T23:41:00.396Z,[en],It looks like mobile homes and bulldozers are ...,0,1,2,0,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,[],[],[],{'text': 'It looks like mobile homes and bulld...,en
285282,bafyreicckfaghaqchbmtre4w3vknpd6qac2xwmffprzvv...,at://did:plc:hslirr7h6d7livppaxty435b/app.bsky...,Thiard News@F4F,newsen.bsky.social,did:plc:hslirr7h6d7livppaxty435b,2025-02-14T23:58:45Z,,Forecast for Indoor Air Quality Solution Marke...,0,0,0,0,,,[],[],[https://third-news.com/article/a020f3bc-eb2f-...,{'text': 'Forecast for Indoor Air Quality Solu...,
285283,bafyreigqs4y4we4rhzh6modc5tjmm2ek677rfrjihofpb...,at://did:plc:h6vtwq2qkazjor6uykvdcurp/app.bsky...,"PA Environment Digest, By David E. Hess, Forme...",paenvironmentnews.bsky.social,did:plc:h6vtwq2qkazjor6uykvdcurp,2025-02-14T23:54:11.044Z,[en],DEP Sets March 19 Hearing On RACT 3 Air Qualit...,0,0,0,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreicq6mdtr7fykcvfja2rdzi5mringpiebp5reldw...,{'text': 'DEP Sets March 19 Hearing On RACT 3 ...,en


For now, let's parse the geocoded locations to a multipoint geometry.

In [22]:
def toponyms_to_multipoint(toponyms):
    """
    Convert a list of toponyms with lat/lon information to a MultiPoint geometry.
    """    
    if len(toponyms) <= 0:
        return MultiPoint()
    # note: MultiPoint expects (x, y) = (lon, lat)

    coords: list = []
    for tp in toponyms:
        if tp['longitude'] is not None and tp['latitude'] is not None:
            coords.append((tp['longitude'], tp['latitude']))
    return MultiPoint(coords)

def toponyms_to_place_list(toponyms):
    """
    Convert a list of toponyms to a list of place names.
    """    
    if len(toponyms) <= 0:
        return []
    return [tp['text'] for tp in toponyms]

# extract coordinates and place names from the geocoding information
for df in [europe_floods_df, socal_wildfires_df]:
    df['place'] = df['geocoded_dict'].apply(lambda geocoding_info: toponyms_to_place_list(geocoding_info.get('toponyms', [])))
    df['geometry'] = df['geocoded_dict'].apply(lambda geocoding_info: toponyms_to_multipoint(geocoding_info.get('toponyms', [])))
socal_wildfires_df

Unnamed: 0,cid,uri,author_displayName,author_handle,author_did,createdAt,langs,text,replyCount,repostCount,...,quoteCount,reply_parent_cid,reply_root_cid,image_thumbnails,image_fullsizes,urls,geocoded_dict,language,place,geometry
0,bafyreiaplar6sf427q46nwwc6fuvipcdtre7xieg4rbn5...,at://did:plc:jodd33f7ckgxejmuvn2jx2mw/app.bsky...,rSutibu (Closed Comms/ Slots:0/8),rsutibu.bsky.social,did:plc:jodd33f7ckgxejmuvn2jx2mw,2024-12-24T00:29:59.418Z,[en],warming... fire!?,1,0,...,0,bafyreiaxto7bnoivob7f5frvtqld7ae2cio6noepknwet...,bafyreifymdgcqnqfakmjfpgig2lo3zu7q6qmxht4r76z5...,[],[],[],"{'text': 'warming... fire!?', 'toponyms': []}",en,[],MULTIPOINT EMPTY
1,bafyreibaovld72qis6wd5xmu2owmw7o2eawpcd4k7e63o...,at://did:plc:3rxcbck5ni5ldif3pvj4ricw/app.bsky...,Nellie Pennington,whoa-nellie.bsky.social,did:plc:3rxcbck5ni5ldif3pvj4ricw,2024-12-24T00:29:47.972Z,[en],That time I had hot chocolate with the snowman...,0,0,...,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreib2zuex4hywb5v3jntsl5b2vdyiambaib5nxswq...,{'text': 'That time I had hot chocolate with t...,en,[],MULTIPOINT EMPTY
2,bafyreihrauazmbzzuwxgiam5kvm3iygepclphl57kexrt...,at://did:plc:jedlel7nmsgraeenznupxrf3/app.bsky...,Saint D'Khari,saintdkhari.bsky.social,did:plc:jedlel7nmsgraeenznupxrf3,2024-12-24T00:29:41.279Z,[en],Oh yeah that sounds fire,0,0,...,0,bafyreihqshfmmile24ungwpbnxs2z2erhvrwrclk27uei...,bafyreih3hppo5jliwmrpzf7kzvc3ob62bzno5nq5noyj4...,[],[],[],"{'text': 'Oh yeah that sounds fire', 'toponyms...",en,[],MULTIPOINT EMPTY
3,bafyreif2n4ryjuxtawo52giqu4rbxmxkohnq76y5wnhkm...,at://did:plc:skgjhoxiu2upm4e57l2ld3li/app.bsky...,Tilly,hortense1177bce.bsky.social,did:plc:skgjhoxiu2upm4e57l2ld3li,2024-12-24T00:29:27.405Z,[en],I prefer being here to Twitter but I also feel...,2,0,...,0,,,[],[],[],{'text': 'I prefer being here to Twitter but I...,en,[],MULTIPOINT EMPTY
4,bafyreieqwf6wnjkf3vy2aoqyqroa5vqcdnjc3cec6nahc...,at://did:plc:cl37gznnibucnz6dnsqv2vpr/app.bsky...,elliott junkyard,transarchivist.bsky.social,did:plc:cl37gznnibucnz6dnsqv2vpr,2024-12-24T00:29:19.501Z,[en],this whole thread is fire ❤️‍🔥,0,0,...,0,,,[],[],[],"{'text': 'this whole thread is fire ❤️‍🔥', 'to...",en,[],MULTIPOINT EMPTY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285280,bafyreiblckmpnru7ibeurj5dlwzzudutomwljwkc7artw...,at://did:plc:kuhx6uckdmd465x4bumqxkdk/app.bsky...,Facundo Dell Aqua,facundodellaqua.bsky.social,did:plc:kuhx6uckdmd465x4bumqxkdk,2025-02-14T23:43:52.610Z,[es],"Jajajajajaja, el final es una fatality",0,0,...,0,bafyreih2rory3qauljgas5ojkjk6b3h6o5gpbz4rfdz6u...,bafyreifemihlizfbq55lkj7fne6xlvpi7pynyfatnty3v...,[],[],[],"{'text': 'Jajajajajaja, el final es una fatali...",es,[],MULTIPOINT EMPTY
285281,bafyreifwy5xwesv5kczu4mgclv3ptyuzu6vjzxoxac5mh...,at://did:plc:o5bzpjeblsmh45q2hpwswh56/app.bsky...,,devrawiz.bsky.social,did:plc:o5bzpjeblsmh45q2hpwswh56,2025-02-14T23:41:00.396Z,[en],It looks like mobile homes and bulldozers are ...,0,1,...,0,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,bafyreidqnlyviql3y22vyxc2e2dyv3xzbvxmzqeox46o4...,[],[],[],{'text': 'It looks like mobile homes and bulld...,en,[Rafah],MULTIPOINT ((34.24357 31.29722))
285282,bafyreicckfaghaqchbmtre4w3vknpd6qac2xwmffprzvv...,at://did:plc:hslirr7h6d7livppaxty435b/app.bsky...,Thiard News@F4F,newsen.bsky.social,did:plc:hslirr7h6d7livppaxty435b,2025-02-14T23:58:45Z,,Forecast for Indoor Air Quality Solution Marke...,0,0,...,0,,,[],[],[https://third-news.com/article/a020f3bc-eb2f-...,{'text': 'Forecast for Indoor Air Quality Solu...,,[USA],MULTIPOINT ((-98.5 39.76))
285283,bafyreigqs4y4we4rhzh6modc5tjmm2ek677rfrjihofpb...,at://did:plc:h6vtwq2qkazjor6uykvdcurp/app.bsky...,"PA Environment Digest, By David E. Hess, Forme...",paenvironmentnews.bsky.social,did:plc:h6vtwq2qkazjor6uykvdcurp,2025-02-14T23:54:11.044Z,[en],DEP Sets March 19 Hearing On RACT 3 Air Qualit...,0,0,...,0,,,[https://cdn.bsky.app/img/feed_thumbnail/plain...,[https://cdn.bsky.app/img/feed_fullsize/plain/...,[bafkreicq6mdtr7fykcvfja2rdzi5mringpiebp5reldw...,{'text': 'DEP Sets March 19 Hearing On RACT 3 ...,en,"[The Cleveland Cliffs Butler Works, Butler Cou...",MULTIPOINT ((-84.57566 39.43865))


With that ready, let's see what happens if we create a GeoDataFrame.

In [23]:
europe_floods_gdf: gpd.GeoDataFrame = gpd.GeoDataFrame(europe_floods_df, geometry='geometry', crs=4326)
europe_floods_gdf.to_parquet(os.path.join(DATA_PATH, 'raw', '2024_central_europe_floods', 'bsky_central_europe_gdf.parquet'))

socal_wildfires_gdf: gpd.GeoDataFrame = gpd.GeoDataFrame(socal_wildfires_df, geometry='geometry', crs=4326)
socal_wildfires_gdf.to_parquet(os.path.join(DATA_PATH, 'raw', '2025_socal_wildfires', 'bsky_socal_gdf.parquet'))

print(europe_floods_gdf[~europe_floods_gdf['geometry'].is_empty].shape)
print(socal_wildfires_gdf[socal_wildfires_gdf['geometry'].is_empty].shape)

(25092, 21)
(302245, 21)
