In [3]:
import pandas as pd
import os 
import matplotlib.pyplot as plt 
import re
import numpy as np

In [4]:
# Load tree data from the municipality of Copenhagen.
cols = ['id','ny_dm_element','slaegtsnavn','planteaar','bydelsnavn','vejnavn','placering','wkb_geometry']
# read_csv ignores the order of `usecols` and returns the columns in file
# order, so re-select with `cols` to pin the column order explicitly.
df = pd.read_csv('raw_data/trae_basis.csv', sep=',', usecols=cols)[cols]

# Prepare tree data for analysis.
df = df.dropna(subset=['planteaar'])  # remove samples with unknown plant year
df = df[df['planteaar'] >= 2000]      # keep data from 2000 and after

# Rename by name instead of assigning `df.columns` positionally, so a raw
# column can never receive the wrong label if the CSV layout differs from `cols`.
df = df.rename(columns={
    'id': 'id',
    'ny_dm_element': 'type',
    'slaegtsnavn': 'species',
    'planteaar': 'year',
    'bydelsnavn': 'district',
    'vejnavn': 'street',
    'placering': 'street_location',
    'wkb_geometry': 'coordinates',
})
# Get the X and Y coordinates for geodata plot
def get_x(row):
    """Return the first number found in a coordinate string as a float.

    Args:
        row: Text containing at least one number, e.g. ``"POINT (12.5 55.7)"``.

    Returns:
        float: The first number in ``row``, including its sign.

    Raises:
        IndexError: If ``row`` contains no number.
        TypeError: If ``row`` is not a string (e.g. a missing value).
    """
    # The sign is grouped with both alternatives so that integers such as
    # "-12" keep their sign, not only decimals such as "-12.5".
    numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", row)
    return float(numbers[0])
def get_y(row):
    """Return the second number found in a coordinate string as a float.

    Args:
        row: Text containing at least two numbers, e.g. ``"POINT (12.5 55.7)"``.

    Returns:
        float: The second number in ``row``, including its sign.

    Raises:
        IndexError: If ``row`` contains fewer than two numbers.
        TypeError: If ``row`` is not a string (e.g. a missing value).
    """
    # The sign is grouped with both alternatives so that integers such as
    # "-55" keep their sign, not only decimals such as "-55.7".
    numbers = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", row)
    return float(numbers[1])


# Split the coordinate string into numeric X and Y columns; the raw string
# is dropped afterwards because it is no longer needed.
df['X'] = df['coordinates'].apply(get_x)
df['Y'] = df['coordinates'].apply(get_y)
df = df.drop(columns=["coordinates"])

# Save tree data for the exploratory analysis. Create the output folder
# first so the export does not fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
df.to_csv('data/df_tree.csv', index=False)

In [10]:
# Load income data from the municipality of Copenhagen.
df1 = pd.read_excel('raw_data/2022419113038370864435KKIND3.xlsx', header=2, usecols="C:Y")
df1 = df1.rename(columns={'Unnamed: 2': 'income_type', 'Unnamed: 3': 'district'})

# Forward-fill blank income_type cells with the last value seen above them.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
# NOTE(review): presumably the sheet labels only the first row of each
# income-type group - confirm against the raw file.
df1['income_type'] = df1['income_type'].ffill()

# Normalise district names. `regex=False` makes the literal replacement
# explicit, independent of the pandas version's default.
df1['district'] = df1['district'].str.replace("Bydel - ", "", regex=False)
df1['district'] = df1['district'].str.replace("/", "-", regex=False)
df1 = df1[df1['district'] != 'København i alt']  # drop the city-wide row

# Reshape to one row per (year, district) with one column per income type.
df_income = df1.melt(id_vars=["district", "income_type"], var_name="year", value_name="total")
df_income = pd.pivot_table(
    df_income,
    values="total",
    index=["year", "district"],
    columns=["income_type"],
    aggfunc="sum",
).reset_index()
df_income = df_income.drop(columns=['Indkomstbeløb (1.000 kr.)', 'Personer med indkomsten (antal)'])

# Save income data; create the output folder first so the export does not
# fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
df_income.to_csv('data/df_income.csv', index=False)

In [20]:
# Load the education spreadsheet (columns C:R, header on the third row).
df2 = pd.read_excel('raw_data/2022419114031370864435KKUDD2.xlsx', header=2, usecols="C:R")
df2 = df2.rename(columns={'Unnamed: 2': 'race', 'Unnamed: 3': 'education', 'Unnamed: 4': 'district'})

# Forward-fill blank group labels with the last value seen above them.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df2['race'] = df2['race'].ffill()
df2['education'] = df2['education'].ffill()

# Normalise district names. `regex=False` makes the literal replacement
# explicit, independent of the pandas version's default.
df2['district'] = df2['district'].str.replace("Bydel - ", "", regex=False)
df2['district'] = df2['district'].str.replace("/", "-", regex=False)
df2 = df2[df2['district'] != 'København i alt']  # drop the city-wide row

# Long format: one row per (district, race, education, year).
df_education = df2.melt(id_vars=["district", "race", "education"], var_name="year", value_name="total_educated")
df_education['year'] = df_education['year'].astype(int)

# Two wide views of the same totals: one column per 'race' value and one
# column per 'education' value, each indexed by (year, district).
df_race = pd.pivot_table(
    df_education,
    values="total_educated",
    index=["year", "district"],
    columns=["race"],
    aggfunc="sum",
).reset_index()
df_level = pd.pivot_table(
    df_education,
    values="total_educated",
    index=["year", "district"],
    columns=["education"],
    aggfunc="sum",
).reset_index()

# NOTE(review): merge joins on every column name the two frames share.
# That is 'year' and 'district' unless a 'race' value and an 'education'
# value happen to have the same label - confirm against the raw file.
df_merged = pd.merge(df_race, df_level)

# Save education data; create the output folder first so the export does
# not fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
df_merged.to_csv('data/df_education.csv', index=False)