In [22]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np

In [24]:
# City coordinate lookup (CITY / LONG / LAT columns are used further down).
geo = pd.read_csv(filepath_or_buffer='data/filtered_RJ_lat_long_data.csv')
# Combined dataset that the following cells enrich.
final = pd.read_csv(filepath_or_buffer='data/final_combined_dataset.csv')

# Missing-value count per column (displayed as the cell output).
final.isna().sum()

date                                    0
week                                    0
cases                                   0
population                              0
tempe_min                               0
humidity_max                            0
humidity_avg                            0
humidity_min                            0
temp_avg                                0
temp_max                                0
city                                    0
geocode                                 0
vim                                     0
vim_monthly                             0
precipitation_avg_ordinary_kriging      0
precipitation_max_ordinary_kriging      0
precipitation_avg_regression_kriging    0
precipitation_max_regression_kriging    0
dtype: int64

In [25]:
# Lower-cased city names are the join key, so case differences between the two
# sources do not prevent a match.
geo['CITY_lower'] = geo['CITY'].str.lower()

# Keep exactly one coordinate pair per city: duplicated (or missing) names in
# `geo` would otherwise multiply the rows of `final` during the merge.
city_coords = (
    geo[['CITY_lower', 'LONG', 'LAT']]
    .dropna(subset=['CITY_lower'])
    .drop_duplicates(subset='CITY_lower')
)

# The enriched frame is later written back to the CSV this notebook loads, so
# on a re-run `final` may already carry 'long'/'lat'. Drop them first to avoid
# ending up with duplicate column names after the rename below.
final = final.drop(columns=['long', 'lat'], errors='ignore')
final['city_lower'] = final['city'].str.lower()

# Merge datasets using the lowercase city columns; `validate` makes pandas
# raise if the right-hand side is ever not unique per key.
final = pd.merge(
    final,
    city_coords,
    left_on='city_lower',
    right_on='CITY_lower',
    how='left',
    validate='many_to_one',
)

# Remove the helper join keys and switch the coordinate columns to lower case.
final = final.drop(columns=['CITY_lower', 'city_lower'])
final = final.rename(columns={'LONG': 'long', 'LAT': 'lat'})

# A left join leaves NaN coordinates for cities missing from `geo`; report them
# here instead of letting them surface later in the distance computation.
unmatched_cities = final.loc[final['lat'].isna() | final['long'].isna(), 'city'].unique()
if len(unmatched_cities):
    print(f'Warning: no coordinates found for {len(unmatched_cities)} cities: {sorted(map(str, unmatched_cities))}')


In [26]:
# Incidence rate: cases divided by population, scaled to 100,000 inhabitants.
final['cases_per_100k'] = final['cases'].div(final['population']).mul(100000)

Spatial Aggregation: Create additional features that represent the cases of nearby cities.
For example:
Average cases of neighboring cities.

nearby_cases_weighted: Inverse-distance-weighted average of cases in the 3 nearest cities for the same week.

Here, cases_per_100k is used as the case measure instead of raw case counts, because we want the calculation to account for each city's population.

In [27]:
# Persist the enriched frame (coordinates and cases_per_100k added above).
# NOTE: this overwrites the same CSV that `final` was loaded from.
final.to_csv(path_or_buf='data/final_combined_dataset.csv', index=False)

In [28]:
# Reload the combined dataset from disk for the neighbour-feature computation.
data = pd.read_csv(filepath_or_buffer='data/final_combined_dataset.csv')


def calculate_weighted_cases(row, weekly_data, top_k=3):
    """Inverse-distance-weighted mean of ``cases_per_100k`` over the nearest other cities.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        The city being scored; must provide 'city', 'lat' and 'long'.
    weekly_data : pandas.DataFrame
        Candidate neighbours with columns 'city', 'lat', 'long' and
        'cases_per_100k'. Rows whose 'city' equals ``row['city']`` are ignored.
    top_k : int, default 3
        Number of nearest cities included in the average.

    Returns
    -------
    float
        ``sum(cases_i / d_i) / sum(1 / d_i)`` over the ``top_k`` nearest cities,
        with ``d_i`` the geodesic distance in kilometres.
        * ``0`` when there is no other city to compare with.
        * NaN when ``row`` itself has a missing coordinate.
        * If one or more of the selected neighbours lie at distance 0, the
          plain mean of those neighbours' cases (the limit of the weighting as
          the distance goes to 0), instead of dividing by zero.
    """
    # Without a position for the base city no distance can be measured.
    if pd.isna(row['lat']) or pd.isna(row['long']):
        return float('nan')

    base_city = row['city']
    base_coords = (row['lat'], row['long'])
    distances = []

    # Walk the columns in parallel rather than building a Series per row.
    for city, lat, lon, cases in zip(
        weekly_data['city'],
        weekly_data['lat'],
        weekly_data['long'],
        weekly_data['cases_per_100k'],
    ):
        if city == base_city:
            continue
        # Neighbours without coordinates cannot be ranked by distance; skip them.
        if pd.isna(lat) or pd.isna(lon):
            continue
        dist = geodesic(base_coords, (lat, lon)).kilometers
        distances.append((city, cases, dist))

    # Sort by distance and take the top_k nearest cities.
    nearest = sorted(distances, key=lambda item: item[2])[:top_k]
    if not nearest:
        return 0

    # Co-located neighbours would get an infinite weight; they fully determine
    # the result, so average them directly.
    coincident = [cases for _, cases, dist in nearest if dist == 0]
    if coincident:
        return sum(coincident) / len(coincident)

    weighted_sum = sum(cases / dist for _, cases, dist in nearest)
    weight_total = sum(1 / dist for _, _, dist in nearest)
    return weighted_sum / weight_total

In [29]:
# First part: weeks 201201 up to, but not including, 201701.
# range() also yields codes that are not real weeks (e.g. 201260); isin() simply
# never matches them.
first_part_weeks = list(range(201201, 201701))

# Update for the first part. Boolean filtering keeps the original row labels,
# which are used below to put every result back on the row it was computed for.
first_part_mask = data['week'].isin(first_part_weeks)
first_part_data = data[first_part_mask]
nearby_weighted_cases_first_part = []
nearby_weighted_labels_first_part = []

# groupby(sort=False) visits the weeks in order of first appearance and splits
# the frame once, instead of re-filtering the whole frame for every week.
for week, weekly_data in first_part_data.groupby('week', sort=False):
    print(f'Processing week {week}')
    for row_label, row in weekly_data.iterrows():
        weighted_cases = calculate_weighted_cases(row, weekly_data)
        nearby_weighted_cases_first_part.append(weighted_cases)
        nearby_weighted_labels_first_part.append(row_label)

# The results were produced week by week, which is not necessarily the row
# order of `data` (e.g. when the CSV is sorted by city). Re-align them on the
# row label before the positional assignment so no value lands on the wrong row.
first_part_result = pd.Series(
    nearby_weighted_cases_first_part,
    index=nearby_weighted_labels_first_part,
    dtype='float64',
).reindex(first_part_data.index)
data.loc[first_part_mask, 'nearby_cases_weighted'] = first_part_result.to_numpy()

# Save the updated data back to CSV
data.to_csv('data/final_combined_dataset.csv', index=False)


Processing week 201201
Processing week 201202
Processing week 201203
Processing week 201204
Processing week 201205
Processing week 201206
Processing week 201207
Processing week 201208
Processing week 201209
Processing week 201210
Processing week 201211
Processing week 201212
Processing week 201213
Processing week 201214
Processing week 201215
Processing week 201216
Processing week 201217
Processing week 201218
Processing week 201219
Processing week 201220
Processing week 201221
Processing week 201222
Processing week 201223
Processing week 201224
Processing week 201225
Processing week 201226
Processing week 201227
Processing week 201228
Processing week 201229
Processing week 201230
Processing week 201231
Processing week 201232
Processing week 201233
Processing week 201234
Processing week 201235
Processing week 201236
Processing week 201237
Processing week 201238
Processing week 201239
Processing week 201240
Processing week 201241
Processing week 201242
Processing week 201243
Processing 