In [3]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the session, which
# includes any pandas chained-assignment or deprecation warnings raised by later
# cells — consider a narrower filter.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so displayed frames render every row and column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [6]:
# Read the raw street-tree census CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../data/raw/2015-street-tree-census-tree-data.csv')

Drop the columns we definitely don't need for modelling.

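We don't need them because some of them are duplicates of information kept in other columns (for example, `spc_latin` vs. `spc_common`, or `borocode` vs. `borough`).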

In [7]:
# Columns the notebook excludes before modelling.
unused_cols = [
    "tree_id",
    "block_id",
    "created_at",
    "stump_diam",
    "spc_latin",
    "address",
    "postcode",
    "zip_city",
    "community_board",
    "borocode",
    "cncldist",
    "st_assem",
    "st_senate",
    "nta",
    "boro_ct",
    "state",
    "x_sp",
    "y_sp",
    "council district",
    "census tract",
    "bin",
    "bbl",
    "problems",
    "community board",
    "nta_name",
]

# Every column not listed above is kept, in its original order.
filtered_cols = df.columns[~df.columns.isin(unused_cols)].tolist()
filtered_df = df[filtered_cols]

Leave only trees with "Alive" status

We drop stumps and dead trees because, by the field definition, no `health` value is recorded for a tree whose status is stump or dead.

In [8]:
# Keep only the rows whose status is "Alive", then discard the status column,
# which holds a single value after the filter.
df_alive = (
    filtered_df
    .loc[filtered_df['status'] == "Alive"]
    .drop(columns=["status"])
)

Dealing with missing values

In [9]:
def check_missing_values(df):
    """Summarise the null values in each column of ``df``.

    Prints a section header, then builds a per-column report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a 'Missing Values' column (null count)
        and a 'Percentage' column (null count as a share of all rows, times
        100). Only columns with at least one null are included.
    """
    print("\n=== Missing Values Analysis ===")
    null_counts = df.isna().sum()
    report = pd.DataFrame({
        'Missing Values': null_counts,
        'Percentage': (null_counts / len(df)) * 100,
    })
    # Columns without any nulls are left out of the report.
    has_missing = report['Missing Values'] > 0
    return report.loc[has_missing]

In [10]:
# Report the columns of the alive-only frame that contain nulls.
check_missing_values(df_alive)


=== Missing Values Analysis ===


Unnamed: 0,Missing Values,Percentage
health,1,0.000153
spc_common,5,0.000767
steward,487823,74.799631
guards,572307,87.753863
sidewalk,1,0.000153


Where only a small number of values are missing, simply drop the affected rows.

In [11]:
# Discard the rows that have no health value.
df_no_health_missing = df_alive.dropna(subset=['health'])

In [12]:
# Re-run the report after removing rows without a health value.
check_missing_values(df_no_health_missing)


=== Missing Values Analysis ===


Unnamed: 0,Missing Values,Percentage
spc_common,5,0.000767
steward,487822,74.799593
guards,572306,87.753844
sidewalk,1,0.000153


In [13]:
# Keep only rows that have both a species name and a sidewalk value.
# The trailing .copy() matches the earlier filtering cells and makes the result
# an independent frame, so later column writes are not made against a slice of
# df_no_health_missing.
further_filter = df_no_health_missing[
    df_no_health_missing['spc_common'].notna()
    & df_no_health_missing['sidewalk'].notna()
].copy()

In [14]:
# Re-run the report after removing rows without spc_common or sidewalk.
check_missing_values(further_filter)


=== Missing Values Analysis ===


Unnamed: 0,Missing Values,Percentage
steward,487819,74.799821
guards,572300,87.753731


Fill the missing `steward` and `guards` values with "Unknown": such a large percentage of these values is missing that we can't sensibly apply other imputation techniques.

In [15]:
# Fill both sparse categorical columns with an explicit "Unknown" label.
# The filled frame is assigned back rather than calling fillna(inplace=True) on
# a column selection: that chained form operates on a temporary Series and is
# not guaranteed to update further_filter (it does not under pandas
# Copy-on-Write).
further_filter = further_filter.fillna({"steward": "Unknown", "guards": "Unknown"})

In [16]:
# Final report after filling steward and guards; expected to list no columns.
check_missing_values(further_filter)


=== Missing Values Analysis ===


Unnamed: 0,Missing Values,Percentage


Columns to keep for modeling

In [17]:
# Show the columns that remain after filtering.
further_filter.columns

Index(['tree_dbh', 'curb_loc', 'health', 'spc_common', 'steward', 'guards',
       'sidewalk', 'user_type', 'root_stone', 'root_grate', 'root_other',
       'trunk_wire', 'trnk_light', 'trnk_other', 'brch_light', 'brch_shoe',
       'brch_other', 'borough', 'latitude', 'longitude'],
      dtype='object')

Save the processed dataset

In [18]:
# Write the cleaned frame to the processed-data folder; index=False keeps the
# row index out of the file.
further_filter.to_csv("../data/processed/filtered_data.csv", index=False)