In [3]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from tqdm import tqdm

In [4]:
# Input locations, given as paths relative to the working directory.
# Dissemination-area boundary shapefile (read with geopandas in a later cell).
dissemination_area_file = "data/statscan/census_2021/dissemination_areas/lda_000a21a_e/lda_000a21a_e.shp"
# Census CSV for Quebec (read with pandas in a later cell).
census_file = "data/statscan/census_2021/98-401-X2021006_Quebec_eng_CSV/98-401-X2021006_English_CSV_data_Quebec.csv"
# Text encoding passed to pd.read_csv when loading census_file.
census_encoding = "ISO-8859-1"

In [None]:
# Read the dissemination-area boundary file into a GeoDataFrame
dissemination_areas = gpd.read_file(dissemination_area_file)
# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of the plain-text dump produced by print().
dissemination_areas.head()

In [None]:
# Read census file (~6 GB)
census = pd.read_csv(census_file, encoding=census_encoding)
# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of the plain-text dump produced by print().
census.head()

In [5]:
# Pull the DGUID column out of each table; the two columns are compared
# in the following cell.
dissemination_areas_dguids = dissemination_areas.loc[:, "DGUID"]
census_dguids = census.loc[:, "DGUID"]

In [None]:
# DGUIDs that appear in both the boundary file and the census table
common_dguids = set(dissemination_areas_dguids).intersection(set(census_dguids))

# Report how many identifiers are shared, then show a handful of them
print(len(common_dguids))
print(list(common_dguids)[:5])

In [None]:
# Restrict both tables to the DGUIDs they have in common
census_qc = census[census["DGUID"].isin(common_dguids)]
dissemination_areas_qc = dissemination_areas[dissemination_areas["DGUID"].isin(common_dguids)]

# Report the share of rows kept in each table. Both ratios are scaled by 100
# so the printed number matches its "%" label (the census line previously
# printed the raw fraction).
print(f"census_qc: {len(census_qc)}/{len(census)} ({len(census_qc)/len(census)*100} %)")
print(f"dissemination_areas_qc: {len(dissemination_areas_qc)}/{len(dissemination_areas)} ({len(dissemination_areas_qc)/len(dissemination_areas)*100} %)")

# Part 1: Build a new census geospatial dataset for Quebec
Use dissemination areas as the spatial unit.

In [8]:
def combine_hierarchical_rows(df, characteristics_of_interest, indent_column='CHARACTERISTIC_NAME'):
    """Extract selected characteristics and their indented sub-rows.

    The hierarchy is encoded by leading whitespace in ``indent_column``: a row
    belongs to a characteristic when it follows it and is indented more deeply.
    Collection for one characteristic stops at the first following row whose
    indentation is less than or equal to the characteristic's own.

    Rows are walked by position, so the function does not depend on the index
    being integer-valued, sorted or unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in document order. ``indent_column`` must hold strings.
    characteristics_of_interest : iterable of str
        Names to look for, compared against the whitespace-stripped value of
        ``indent_column``. Every occurrence of each name is processed.
    indent_column : str, default 'CHARACTERISTIC_NAME'
        Column whose leading whitespace encodes the nesting depth.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` (original index labels and columns kept)
        plus two columns: ``parent`` (stripped name of the nearest ancestor,
        ``None`` for the characteristic row itself) and ``full_hierarchy``
        (ancestor names joined with " > "). When nothing matches, an empty
        frame with these same columns is returned.
    """
    names = df[indent_column]
    stripped_names = names.str.strip().tolist()
    # Nesting depth = number of leading whitespace characters.
    indent_levels = (names.str.len() - names.str.lstrip().str.len()).tolist()
    n_rows = len(stripped_names)

    # Parallel lists describing the output rows, in output order.
    positions = []
    parents = []
    hierarchies = []

    for characteristic in characteristics_of_interest:
        starts = [pos for pos, name in enumerate(stripped_names) if name == characteristic]

        for start in starts:
            # The characteristic row itself is the root of its own sub-tree.
            positions.append(start)
            parents.append(None)
            hierarchies.append(characteristic)

            base_level = indent_levels[start]
            ancestors = []  # positions of the currently open ancestors, outermost first

            for pos in range(start + 1, n_rows):
                level = indent_levels[pos]
                if level <= base_level:
                    # Back at (or above) the characteristic's depth: sub-tree is finished.
                    break

                # Close every ancestor that is not strictly shallower than this row.
                while ancestors and level <= indent_levels[ancestors[-1]]:
                    ancestors.pop()

                positions.append(pos)
                parents.append(stripped_names[ancestors[-1]] if ancestors else characteristic)
                hierarchies.append(
                    " > ".join(
                        [characteristic]
                        + [stripped_names[i] for i in ancestors]
                        + [stripped_names[pos]]
                    )
                )
                ancestors.append(pos)

    # A single positional take preserves column dtypes and index labels, and
    # yields the same column layout whether or not anything matched.
    result = df.iloc[positions].copy()
    result['parent'] = parents
    result['full_hierarchy'] = hierarchies
    return result

In [9]:
# Characteristic names to extract from the census table. Each entry is compared
# against the whitespace-stripped CHARACTERISTIC_NAME value by
# combine_hierarchical_rows, which also collects the more deeply indented rows
# that follow a match.
CHARACTERISTICS_OF_INTEREST = [
    "Total - Age groups of the population - 100% data",
    "Population, 2021",
    "Population, 2016",
    "Total private dwellings",
    "Private dwellings occupied by usual residents",
    "Land area in square kilometres",
    "Total - Household after-tax income groups in 2020 for private households - 100% data",
    "Total - Income statistics for private households - 100% data",
    "Unemployment rate",
]

In [None]:
# For every DGUID, keep only the characteristics of interest together with
# their nested sub-rows. The extra arguments are forwarded by apply() to
# combine_hierarchical_rows for each group.
census_by_dguid = census_qc.groupby('DGUID')
census_grouped = census_by_dguid.apply(
    combine_hierarchical_rows,
    CHARACTERISTICS_OF_INTEREST,
    indent_column="CHARACTERISTIC_NAME",
)

In [None]:
# Display the per-DGUID hierarchy table produced by the groupby cell for inspection.
census_grouped

In [None]:
# Build one record per DGUID: each extracted census characteristic becomes a
# column, alongside the DGUID and the dissemination-area geometry.

# Index the geometries by DGUID once, so the loop does a direct label lookup
# instead of re-scanning every dissemination area with a boolean mask on each
# iteration. drop_duplicates keeps the first row per DGUID, matching the
# previous `.iloc[0]` selection.
geometry_by_dguid = (
    dissemination_areas.drop_duplicates(subset="DGUID").set_index("DGUID")["geometry"]
)

# Create a list to store all rows
all_rows = []

# Process each DGUID
for dguid in tqdm(common_dguids):
    census_at_dguid = census_grouped.loc[dguid]

    # Map each hierarchy path to its count; if a path repeats, the last value wins.
    new_row_data = dict(
        zip(census_at_dguid["full_hierarchy"], census_at_dguid["C1_COUNT_TOTAL"])
    )
    new_row_data["DGUID"] = dguid
    new_row_data["geometry"] = geometry_by_dguid.loc[dguid]
    all_rows.append(new_row_data)

# Create the final GeoDataFrame at once, declaring the geometry column and CRS
# up front rather than assigning `.crs` afterwards.
combined_gdf = gpd.GeoDataFrame(
    all_rows,
    geometry="geometry",
    crs=dissemination_areas.crs,
)

# Save files
combined_gdf.to_parquet(
    "data/output/census_2021_qc_parsed.parquet",
    compression="snappy",
    index=False,
    engine="pyarrow",
)

In [None]:
# Display the combined census/geometry GeoDataFrame for inspection.
combined_gdf

# Part 2: Combine with school data
Goal is to associate schools to StatsCan census regions. Ultimately, we want to be able to apply census data to determine socio-economic factors that apply to schools.

School district algo idea:
- Get school lat/lon
- Read StatCan census data in the surrounding area
- Grow outwards, accumulating students until school is full

TODO:
- [x] Merge full_addresses_avec_coor.csv with merged_clustered_data.csv (join on Code) to get lat/lon with "Type" (private or public) and "Langue Enseignement"
- [x] Exclude rows with "type" == "prive" or "Langue Enseignement" == "Anglais" (i.e. keep only public, non-English schools)

In [None]:
# Combine school names, ids and coordinates into a single CSV.
# merged_clustered_data.csv and full_addresses_avec_coor.csv are matched on
# their address index so every school row carries lat/lon together with
# "Type" and "Langue Enseignement".

a = pd.read_csv("data/school/merged_clustered_data.csv", index_col="Address")
b = pd.read_csv("data/school/full_addresses_avec_coor.csv", index_col="original_address")

# Columns kept in the exported file, in output order
school_columns = [
    "School Name",
    "lat",
    "lon",
    "Full Address",
    "Code Postal",
    "Aggregate Dissemination Area Code",
    "Number of students",
    "Type",
    "Langue Enseignement",
]

# Inner join keeps only addresses present in both files; the result is then
# re-indexed by school code and trimmed to the columns above.
schools = (
    a.join(b, how="inner", lsuffix="_a", rsuffix="_b")
    .set_index("Code")
    .loc[:, school_columns]
)
schools.to_csv("data/output/schools_qc_basic.csv")

In [None]:
# Row counts of both inputs and of the joined result
for frame in (a, b, schools):
    print(len(frame))

# Distribution of school type and teaching language
for column in ("Type", "Langue Enseignement"):
    print(schools[column].value_counts())

schools

In [None]:
# Build the spatial relationship between schools and StatCan dissemination-area
# polygons: every dissemination area is tagged with the code of its nearest
# public, non-English school.

# Read files
schools = pd.read_csv("data/output/schools_qc_basic.csv")
diss_areas = gpd.read_parquet("data/output/census_2021_qc_parsed.parquet")
schools_gdf = gpd.GeoDataFrame(
    schools,
    # Vectorised point construction instead of a per-row Python loop
    geometry=gpd.points_from_xy(schools["lon"], schools["lat"]),
    crs="EPSG:4326",  # Assuming input coordinates are in WGS84
)

# Keep schools that are not english and are public
is_english = schools_gdf["Langue Enseignement"].str.contains("EN", na=False)
is_public = schools_gdf["Type"] == "Public"
schools_fr_public_gdf = schools_gdf[~is_english & is_public]
print(f"Number of schools: {len(schools_gdf)}")
print(f"Number of non-english, public schools: {len(schools_fr_public_gdf)}")

# Reproject both datasets to the same estimated UTM CRS before measuring distances
utm_crs = schools_gdf.estimate_utm_crs()
schools_fr_public_gdf = schools_fr_public_gdf.to_crs(utm_crs)
# The parquet file is read only once; `combined_gdf` keeps its previous meaning
# (the parsed census frame reprojected to UTM) and is the left side of the join.
combined_gdf = diss_areas.to_crs(utm_crs)

# Match
# NOTE(review): sjoin_nearest can return more than one row for a dissemination
# area when several schools are equidistant (e.g. identical coordinates) -
# confirm whether DGUID is expected to be unique in the outputs below.
diss_areas_gdf = gpd.sjoin_nearest(
    combined_gdf,
    schools_fr_public_gdf[["Code", "geometry"]],
).drop(columns=["index_right"])

# Save full datasets
diss_areas_gdf.to_parquet("data/output/census_2021_qc_parsed_with_schools_fr_public.parquet", compression="snappy", index=False, engine="pyarrow")
diss_areas_gdf.to_csv("data/output/census_2021_qc_parsed_with_schools_fr_public.csv")
# Save simple mapping only
diss_areas_gdf[["DGUID", "Code"]].to_csv("data/output/census_2021_qc_parsed_with_schools_fr_public_mapping.csv")

diss_areas_gdf.head()