In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import osmnx as ox
import pandas as pd
import time
import json
from tqdm import tqdm
import plotly.io as pio
import numpy as np
pio.renderers.default = 'notebook'

# Seed constant for the notebook (no cell visible in this section reads it).
RANDOM_STATE = 8
# Remove pandas' row limit so displayed frames and series are never truncated.
pd.options.display.max_rows = None

# load unlabeled data

In [None]:
# Yearly unlabeled exports as (data_num, year, CSV file name). Every export lives in
# unlabeled_50k/wildfires_<year>/text&image/ next to a wildfires_<year>_images/ folder.
unlabeled_sources = [
    (1, 2018, 'bc_wildfires_2018_combined_5,629.csv'),
    (2, 2019, 'ab_wildfires_2019_combined_2,454.csv'),
    (3, 2020, 'wildfires_2020_combined_1,004.csv'),
    (4, 2021, 'wildfires_2021_combined_5,790.csv'),
    (5, 2022, 'wildfires_2022_combined_137.csv'),
    (6, 2023, 'wildfires_2023_combined_18,020.csv'),
    (7, 2024, 'wildfires_2024_combined_13,245.csv'),
]


def load_unlabeled_year(data_num, year, csv_name):
    """Load one yearly export and prepare it for concatenation.

    Parameters
    ----------
    data_num : int
        Identifier stored in the new ``data_num`` column for every row.
    year : int
        Year used to build the directory and image-folder names.
    csv_name : str
        File name of the CSV inside the year's ``text&image`` directory.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``data_num`` added and the ``image`` column
        prefixed with the path of that year's image folder.
    """
    year_dir = f'unlabeled_50k/wildfires_{year}/text&image'
    frame = pd.read_csv(f'{year_dir}/{csv_name}')
    frame['data_num'] = data_num
    frame['image'] = f'{year_dir}/wildfires_{year}_images/' + frame['image']
    return frame


frames = [load_unlabeled_year(*source) for source in unlabeled_sources]
# Keep the per-year names available for any cell that still refers to them.
df1, df2, df3, df4, df5, df6, df7 = frames

# Stack all years and order the rows chronologically by posting time.
data = pd.concat(frames, ignore_index=True)
data = data.sort_values(by="posted_at", ignore_index=True)
data.head(3)

In [None]:
# # save the combined data so it can be labeled by the classifier
# data.to_csv('../U_data.csv', index=False)

In [None]:
# load after labeling with classifier
df = pd.read_csv('../U_data_pred.csv')
# Use a stable sort so rows sharing the same posted_at keep their file order;
# the default quicksort may reorder ties, which would scramble the positional
# copy of data_num below.
# NOTE(review): presumably U_data_pred.csv holds the rows of `data` in the order
# they were exported -- confirm against the classifier script.
df = df.sort_values(by="posted_at", ignore_index=True, kind="mergesort")
# data_num is copied row-by-row via the shared 0..n-1 index, so both frames must
# have the same number of rows or the labels end up misaligned / NaN.
assert len(df) == len(data), (
    f"U_data_pred.csv has {len(df)} rows but the combined unlabeled data has {len(data)}"
)
df['data_num'] = data['data_num']
df.head(3)

# load labeled data

In [None]:
# Read the labeled dataset. This rebinds `df`, so whatever `df` held before
# this cell ran is replaced for all following cells.
df = pd.read_csv('../dataset.csv')

# extract author provinces using osmnx

In [None]:
# # run once to create loc_to_province.json (takes a while)
# unique_locs = df['author_loc'].dropna().unique()

# provinces = [
#     "Alberta", "British Columbia", "Manitoba", "New Brunswick", "Newfoundland and Labrador",
#     "Nova Scotia", "Ontario", "Prince Edward Island", "Quebec", "Saskatchewan", "Yukon", "Northwest Territories", "Nunavut"
# ]

# def find_province(input_string):
#     for province in provinces:
#         if province.lower() in input_string.lower():
#             return province
#     return None  # If no match is found

# loc_to_province = {}

# for loc in tqdm(unique_locs, desc="Processing locations"):
#     try:
#         result = ox.geocoder.geocode_to_gdf(loc, which_result=1)
#         if "Canada" in result.display_name[0]:
#             if result.addresstype[0] != "country":
#                 province = find_province(result.display_name[0])
#                 if province is not None:
#                     loc_to_province[loc] = province
#                 else:
#                     loc_to_province[loc] = 'province not found in list'
#             else:
#                 loc_to_province[loc] = 'address is canada'
#         else:
#             loc_to_province[loc] = 'address not in canada'
#     except Exception as e:
#         loc_to_province[loc] = 'address not found'
#     time.sleep(1)

# with open("loc_to_province.json", "w") as f:
#     json.dump(loc_to_province, f)

# map and display

In [None]:
# Read the cached loc_to_province.json lookup back from disk
with open("loc_to_province.json", "r") as cache_file:
    loc_to_province = json.loads(cache_file.read())

In [None]:
# Look up each row's author_loc in loc_to_province and store the result as
# `province` (locations missing from the lookup become NaN), then renumber rows.
df = (
    df
    .assign(province=df['author_loc'].map(loc_to_province))
    .reset_index(drop=True)
)

In [None]:
# Show how many rows fall under each value of `province`, most frequent first.
# value_counts() leaves out rows whose province is NaN.
print(df['province'].value_counts())

In [None]:
# fix errors as needed here

In [None]:
# plot distribution of provinces

# Provinces with this many rows or fewer are merged into one combined slice.
MIN_SLICE_COUNT = 14
# NOTE(review): this label is hard-coded; it is not derived from the provinces
# that actually fall at or below MIN_SLICE_COUNT -- confirm it matches the data.
OTHER_LABEL = "MB, NS, QC, PE"


def _fit_distances(distances, n, fill):
    """Return exactly ``n`` distances: drop extras, pad missing entries with ``fill``."""
    fitted = list(distances[:n])
    fitted.extend([fill] * (n - len(fitted)))
    return fitted


def make_autopct(distances=None):
    """Return an ``autopct`` callback that formats a slice's share as ``'12.3%'``.

    ``distances`` is ignored; the parameter is kept only so existing calls that
    pass it keep working. Label placement is done after plotting instead.
    """
    def autopct(pct):
        return f'{pct:.1f}%'
    return autopct


def _place_on_radius(labels, wedges, distances):
    """Move each text to ``distance`` from the pie centre along its wedge's mid-angle."""
    for label, wedge, dist in zip(labels, wedges, distances):
        angle = np.deg2rad((wedge.theta1 + wedge.theta2) / 2)
        label.set_position((dist * np.cos(angle), dist * np.sin(angle)))


# Count the occurrences of each province (rows with a NaN province are not counted)
province_counts = df['province'].value_counts()
# Merge the small provinces into a single combined slice
is_small = province_counts <= MIN_SLICE_COUNT
other_count = province_counts[is_small].sum()
province_counts = province_counts[~is_small]
# Skip the combined slice when nothing was merged, to avoid an empty 0.0% wedge
if other_count > 0:
    province_counts[OTHER_LABEL] = other_count

# Hand-tuned radial distances, one per slice in plotting order. They are fitted to
# the actual slice count so a different number of provinces cannot raise an
# IndexError; extra slices fall back to Matplotlib's default pctdistance (0.6)
# and labeldistance (1.1).
n_slices = len(province_counts)
percent_distances = _fit_distances([0.5, 0.5, 0.7, 0.8, 0.7, 0.6, 0.5, 0.4], n_slices, fill=0.6)
label_distances = _fit_distances([1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.0], n_slices, fill=1.1)

# Plot
fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    province_counts,
    labels=province_counts.index,
    autopct=make_autopct(percent_distances),
    startangle=140,
    colors=plt.cm.Paired.colors,
    textprops={'fontsize': 15, 'fontweight': 'bold'},
    pctdistance=1  # placeholder; the percent labels are repositioned below
)

# Manually reposition percent labels and slice labels
_place_on_radius(autotexts, wedges, percent_distances)
_place_on_radius(texts, wedges, label_distances)

ax.axis('equal')
plt.show()