**Filling in the missing years of the new features by linear interpolation**

In [12]:
import pandas as pd

# Load the dataset.
# The path is assembled with pathlib instead of a hard-coded backslash so the
# cell also runs on macOS/Linux, where '\\' is not a path separator.
from pathlib import Path

file_path = Path('data') / 'panel_dataset.xlsx'
df = pd.read_excel(file_path)

# Years for which interpolated rows are requested: 2008 through 2023 inclusive
# (range() excludes its upper bound).
years_to_interpolate = list(range(2008, 2024))

# Define the interpolation function
def interpolate_values(data_2006, data_2016, neighborhood, row, years,
                       start_year=2006, end_year=2016, skip_years=(2011, 2016)):
    """Build linearly interpolated feature rows for one neighborhood.

    For every requested year a copy of ``row`` is produced in which each
    feature column holds the value on the straight line through the two
    anchor observations::

        y = y1 + (year - start_year) * (y2 - y1) / (end_year - start_year)

    Years outside ``[start_year, end_year]`` lie on the same line, i.e. they
    are linearly extrapolated.

    Parameters
    ----------
    data_2006 : pandas.DataFrame
        Anchor values at ``start_year``. Must be indexed so that ``row.name``
        selects the record; every column is treated as a feature.
    data_2016 : pandas.DataFrame
        Anchor values at ``end_year``, with the same index label and columns.
    neighborhood : object
        Value written to the 'Neighborhood' field of every generated row.
    row : pandas.Series
        Template row; its ``name`` is the index label looked up in both
        anchor frames. It is copied, never modified.
    years : iterable
        Years for which rows are requested.
    start_year, end_year : int, optional
        The years the two anchor frames correspond to (default 2006 / 2016).
    skip_years : container, optional
        Years for which no row is generated (default 2011 and 2016).

    Returns
    -------
    list of pandas.Series
        One Series per non-skipped year, in the order given by ``years``,
        containing the feature values plus 'Neighborhood' and 'Year'.
    """
    label = row.name

    # Take the feature columns from the anchor frame itself rather than from a
    # notebook-global DataFrame: the function no longer depends on hidden
    # state or on 'Neighborhood'/'Year' being the first two columns.
    # The endpoints do not depend on the year, so look them up only once.
    endpoints = [
        (column, data_2006.at[label, column], data_2016.at[label, column])
        for column in data_2006.columns
    ]
    span = end_year - start_year

    results = []
    for year in years:
        if year in skip_years:
            continue
        interpolated_row = row.copy()
        interpolated_row['Neighborhood'] = neighborhood  # Add neighborhood value
        interpolated_row['Year'] = year
        for column, y1, y2 in endpoints:
            interpolated_row[column] = y1 + (((year - start_year) * (y2 - y1)) / span)
        results.append(interpolated_row)
    return results

from pathlib import Path

# Extract unique neighborhoods
neighborhoods = df['Neighborhood'].unique()

# Initialize an empty list to collect interpolated rows
interpolated_rows = []

# Iterate through each neighborhood
for neighborhood in neighborhoods:
    neighborhood_data = df[df['Neighborhood'] == neighborhood]
    data_2006 = neighborhood_data[neighborhood_data['Year'] == 2006].set_index('Neighborhood').drop(columns=['Year'])
    data_2016 = neighborhood_data[neighborhood_data['Year'] == 2016].set_index('Neighborhood').drop(columns=['Year'])

    # Both anchor years are needed to draw the line; neighborhoods missing
    # either one get no interpolated rows.
    if data_2006.empty or data_2016.empty:
        continue

    # Only request years that have no observed row for this neighborhood.
    # Otherwise any observed year other than 2011/2016 would receive a second,
    # interpolated row, and the duplicated (Neighborhood, Year) key would
    # multiply rows in later merges.
    observed_years = set(neighborhood_data['Year'])
    missing_years = [year for year in years_to_interpolate if year not in observed_years]

    # Interpolate values and extend the list of interpolated rows
    for index, row in data_2006.iterrows():
        interpolated_rows.extend(interpolate_values(data_2006, data_2016, neighborhood, row, missing_years))

# Convert the list of interpolated rows to a DataFrame
interpolated_df = pd.DataFrame(interpolated_rows)

# Concatenate the original and interpolated data
result_df = pd.concat([df, interpolated_df], ignore_index=True)

# Sort the DataFrame by 'Neighborhood' and 'Year'
result_df = result_df.sort_values(by=['Neighborhood', 'Year'])

# Save the updated DataFrame to a new Excel file (pathlib keeps the path
# valid on non-Windows systems as well)
result_df.to_excel(Path('data') / 'panel_dataset_interpolated.xlsx', index=False)


Keep only the rows for the years 2008–2023

In [13]:
import pandas as pd

# Load the interpolated dataset.
# Paths are built with pathlib so the cell is not tied to the Windows
# backslash separator.
from pathlib import Path

data_dir = Path('data')
file_path = data_dir / 'panel_dataset_interpolated.xlsx'
df = pd.read_excel(file_path)

# Keep only rows whose Year lies in 2008-2023; between() is inclusive on both
# ends, matching the original >= 2008 and <= 2023 test.
df_filtered = df[df['Year'].between(2008, 2023)]

# Save the filtered DataFrame to a new Excel file
output_file = data_dir / 'new_features_updated.xlsx'
df_filtered.to_excel(output_file, index=False)

print(f"Filtered dataset saved to {output_file}")


Filtered dataset saved to data\new_features_updated.xlsx


Add the corresponding neighborhood_id

In [15]:
import pandas as pd

# Load the filtered features dataset; the neighborhood_id column is added below.
# Paths are built with pathlib so the cell is not tied to the Windows
# backslash separator.
from pathlib import Path

data_dir = Path('data')
file_path = data_dir / 'new_features_updated.xlsx'
df = pd.read_excel(file_path)

# Define the mapping of neighborhood names to IDs
neighborhood_mapping = {
    'West End': 14, 'Downtown': 1, 'Strathcona': 20, 'Grandview': 16, 'Hastings': 2,
    'West Point Grey': 22, 'Kitsilano': 19, 'Fairview': 11, 'Mount Pleasant': 12, 'Dunbar': 10,
    'Arbutus': 15, 'Shaughnessy': 8, 'South Cambie': 7, 'Riley Park': 6, 'Kensington': 17,
    'Renfrew': 13, 'Kerrisdale': 3, 'Oakridge': 5, 'Sunset': 21, 'Victoria': 9,
    'Killarney': 18, 'Marpole': 4
}

# Map neighborhood names to IDs and store them in the neighborhood_id column
df['neighborhood_id'] = df['Neighborhood'].map(neighborhood_mapping)

# Series.map leaves NaN for any name that is not a key of the mapping (for
# example a spelling variant). Such rows cannot be matched on neighborhood_id
# later, so report them instead of letting them pass silently.
missing_id = df['neighborhood_id'].isna()
if missing_id.any():
    unmapped = sorted(df.loc[missing_id, 'Neighborhood'].astype(str).unique())
    print(f"Warning: {int(missing_id.sum())} rows have no neighborhood_id; unmapped names: {unmapped}")

# Save the updated DataFrame with neighborhood IDs to a new Excel file
output_file = data_dir / 'new_features_updated_ids.xlsx'
df.to_excel(output_file, index=False)

print(f"Updated dataset with neighborhood IDs saved to {output_file}")


Updated dataset with neighborhood IDs saved to data\new_features_updated_ids.xlsx


Merge the new features into the merged panel dataset

In [17]:
import pandas as pd

# Load the datasets.
# Paths are built with pathlib so the cell is not tied to the Windows
# backslash separator.
from pathlib import Path

data_dir = Path('data')
file_merged = data_dir / 'merged_Final_merf.xlsx'
file_updated = data_dir / 'new_features_updated_ids.xlsx'

df_merged = pd.read_excel(file_merged)
df_updated = pd.read_excel(file_updated)

# The join key is neighborhood_id, so the textual 'Neighborhood' label is not
# wanted in the output. Remove it from both inputs BEFORE merging: if both
# frames carried it, the merge would rename the copies to 'Neighborhood_x' /
# 'Neighborhood_y' and a later drop of 'Neighborhood' would raise KeyError.
# errors='ignore' keeps this a no-op for a frame that lacks the column.
base = df_merged.drop(columns=['Neighborhood'], errors='ignore')
features = df_updated.drop(columns=['Neighborhood'], errors='ignore')

# Perform the merge based on 'Year' and 'neighborhood_id'
merged_df = pd.merge(base, features, on=['Year', 'neighborhood_id'], how='left')

# A left join changes the row count only when the right-hand table has
# several rows for the same key; surface that instead of silently writing a
# panel with repeated observations.
if len(merged_df) != len(df_merged):
    print(f"Warning: merge changed the row count from {len(df_merged)} to {len(merged_df)}; "
          "check the feature table for duplicate (Year, neighborhood_id) rows")

# Save the merged DataFrame to a new Excel file
output_file = data_dir / 'merged_final_with_updated_features_double.xlsx'
merged_df.to_excel(output_file, index=False)

print(f"Merged dataset with updated features saved to {output_file}")


Merged dataset with updated features saved to data\merged_final_with_updated_features_double.xlsx
