In [70]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

### Read the curated property listings
# Path fixed from '..data/...' to '../data/...' so it matches every other
# relative path used in this notebook.
property_file_path = '../data/curated/property/property 2.csv'
property_data = pd.read_csv(property_file_path)

# Build point geometries from the longitude/latitude columns in one vectorized
# call instead of constructing a shapely Point per row.
property_gdf = gpd.GeoDataFrame(
    property_data,
    geometry=gpd.points_from_xy(property_data['longitude'], property_data['latitude']),
    crs='EPSG:4326',
)

### SA2 layer
file_path = '../data/extracted_files/G01_VIC_GDA2020.gpkg'
gdf_sa2 = gpd.read_file(file_path, layer='G01_SA2_2021_VIC')
# Reproject so both layers share EPSG:4326 before the spatial join.
gdf_sa2 = gdf_sa2.to_crs(epsg=4326)

# Left join keeps every property row; rows whose point falls inside no SA2
# polygon get missing SA2 columns. `predicate=` replaces the deprecated `op=`.
merged_gdf = gpd.sjoin(
    property_gdf,
    gdf_sa2[['SA2_CODE_2021', 'SA2_NAME_2021', 'geometry']],
    how='left',
    predicate='within',
)

### Save the address -> SA2 lookup
output_file_path = '../data/raw/property/merged_property_sa2_data.csv'
# Keep only the first row seen for each address.
merged_gdf = merged_gdf.drop_duplicates(subset='address')
merged_gdf[['address', 'SA2_CODE_2021', 'SA2_NAME_2021']].to_csv(output_file_path, index=False)
print('saved')

saved


  if await self.run_code(code, result, async_=asy):


In [71]:
import pandas as pd

# Input files
# Path fixed from '..data/...' to '../data/...' to match the other paths.
property_file = '../data/curated/property/property 2.csv'
# NOTE(review): this "police stations" path points at a *supermarkets* file —
# confirm the intended filename.
police_stations_file = '../data/curated/external/API/2/closest_supermarkets_distance_final_2.csv'
schools_file = '../data/curated/external/API/2/closest_school_distance_final_2.csv'
stations_file = '../data/curated/external/API/2/closest_stations_distance_final_2.csv'
# NOTE(review): doubled '.csv.csv' extension — confirm it matches the file on disk.
supermarkets_file = '../data/curated/external/API/2/closest_supermarket_distance_final_2.csv.csv'
cbd_file = '../data/curated/external/API/2/closest_cbd_distance_final_2.csv'
gym_file = '../data/curated/external/API/2/closest_gym_distance_final_2.csv'
library_file = '../data/curated/external/API/2/closest_library_distance_final_2.csv'
new_merged_file = '../data/raw/property/merged_property_sa2_data.csv'

property_df = pd.read_csv(property_file)
police_df = pd.read_csv(police_stations_file)
schools_df = pd.read_csv(schools_file)
stations_df = pd.read_csv(stations_file)
supermarkets_df = pd.read_csv(supermarkets_file)
cbd_df = pd.read_csv(cbd_file)
gym_df = pd.read_csv(gym_file)
library_df = pd.read_csv(library_file)
new_merged_df = pd.read_csv(new_merged_file)

# Rename every 'rent_address' column to 'address' so all frames share one key.
dfs = [police_df, schools_df, stations_df, supermarkets_df, cbd_df, gym_df, library_df]
for df in dfs:
    df.rename(columns={'rent_address': 'address'}, inplace=True)

police_df.rename(columns={'minimum_distance_police_station': 'minimum_distance_police'}, inplace=True)

# Left-merge each distance table onto the property data in turn, keyed on
# 'address'. The order is kept fixed because it determines any '_x'/'_y'
# suffixes pandas assigns to clashing column names.
merged_df = property_df
for distance_df in dfs:
    merged_df = merged_df.merge(distance_df, on='address', how='left')

# Attach the merged features to the address -> SA2 lookup (lookup on the left).
merged_df = new_merged_df.merge(merged_df, on='address', how='left')

# Delete the 'Unnamed: 0' and 'agentName' columns
merged_df = merged_df.drop(columns=['Unnamed: 0', 'agentName'])

# Drop every column that contains the literal value 'unknown' in any row.
merged_df = merged_df.loc[:, ~(merged_df == 'unknown').any()]


In [72]:
# Load the SA2 table and expose its 'SA2 code' column under the join key name
# used by the property data.
sa2_file = '../data/curated/external/SA2/sa2final.csv'
sa2_df = pd.read_csv(sa2_file)
sa2_df.rename(columns={'SA2 code': 'SA2_CODE_2021'}, inplace=True)

# Left join: every existing row of merged_df is kept and gains the SA2 columns.
merged_df = merged_df.merge(sa2_df, how='left', on='SA2_CODE_2021')

In [None]:
# Paths of the postcode-level tables. These were previously loaded with
# pd.read_csv and the resulting DataFrames were then passed to pd.read_csv a
# second time, which fails; each file is now read exactly once.
price_post = '../data/curated/external/unique_postcode_price.csv'
uni_school = '../data/raw/external/school/postcode_school.csv'

unique_price_df = pd.read_csv(price_post)
postcode_school_df = pd.read_csv(uni_school)

# Rename the 'Address_Postcode' column to 'postcode' so it matches the merge key.
postcode_school_df = postcode_school_df.rename(columns={'Address_Postcode': 'postcode'})

# Left joins keep every row of merged_df and add the postcode-level columns.
merged_by_postcode_df = pd.merge(merged_df, unique_price_df, on='postcode', how='left')
merged_by_postcode_df = pd.merge(merged_by_postcode_df, postcode_school_df, on='postcode', how='left')

In [None]:
### Load the crime table once and align its key column with the property data
crime_df = pd.read_csv('../data/curated/crime_by_year.csv').rename(columns={'Postcode': 'postcode'})

### Add one '<year>crime' column per year via a left join on 'postcode'
# NOTE(review): if the file holds more than one row per postcode and year,
# each left join will duplicate property rows — confirm the file's granularity.
for year in (2021, 2022, 2023):
    crime_year = (
        crime_df.loc[crime_df['Year'] == year]
        .rename(columns={'Offence Count': f'{year}crime'})
        .drop(columns='Year')  # the year is already encoded in the column name
    )
    merged_by_postcode_df = merged_by_postcode_df.merge(crime_year, on='postcode', how='left')


In [74]:
# Remove helper columns that are not wanted in the final output, in one call.
columns_to_drop = ['SA2 name', 'latitude', 'longitude', 'Unnamed: 0_x', 'Unnamed: 0_y']
merged_by_postcode_df.drop(columns=columns_to_drop, inplace=True)

In [75]:
# Write the fully merged table to CSV (without the index) and report the path.
final_output_file_path = '../data/curated/external/final_data_2.csv'
merged_by_postcode_df.to_csv(path_or_buf=final_output_file_path, index=False)
print("save to ", final_output_file_path)

最终合并的文件已保存至: /Users/fenglingyi/Downloads/groupwork_final/final_data_2.csv
