In [16]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np

In [17]:
# Load the city coordinate table and the combined modelling dataset.
geo = pd.read_csv('data/filtered_RJ_lat_long_data.csv')
final = pd.read_csv('data/final_combined_dataset.csv')

# Per-column count of missing values (displayed as this cell's output).
final.isna().sum()

date                                    0
week                                    0
cases                                   0
population                              0
tempe_min                               0
humidity_max                            0
humidity_avg                            0
humidity_min                            0
temp_avg                                0
temp_max                                0
city                                    0
geocode                                 0
vim                                     0
vim_monthly                             0
precipitation_avg_ordinary_kriging      0
precipitation_max_ordinary_kriging      0
precipitation_avg_regression_kriging    0
precipitation_max_regression_kriging    0
dtype: int64

In [18]:
# Case-insensitive join key on the coordinate table (kept on `geo`).
geo['CITY_lower'] = geo['CITY'].str.lower()
city_coordinates = geo[['CITY_lower', 'LONG', 'LAT']]

# Attach coordinates to every row of `final` by lower-cased city name, then
# discard the temporary join keys and switch the coordinate columns to the
# lower-case names used by the rest of the notebook.
final = (
    final
    .assign(city_lower=final['city'].str.lower())
    .merge(city_coordinates, how='left', left_on='city_lower', right_on='CITY_lower')
    .drop(columns=['CITY_lower', 'city_lower'])
    .rename(columns={'LONG': 'long', 'LAT': 'lat'})
)


In [19]:
# Incidence rate: cases divided by population, scaled to 100,000 inhabitants.
final['cases_per_100k'] = final['cases'].div(final['population']).mul(100000)

Spatial Aggregation: Create additional features that represent the cases of nearby cities.
For example:
Average cases of neighboring cities.

nearby_cases_weighted: Inverse-distance-weighted average of cases in the 3 nearest cities for the same week (closer cities count more).

Here we use cases_per_100k as the measure of cases, because we want the calculation to account for each city's population.

In [None]:
# Persist the enriched frame (coordinates + cases_per_100k) without the index.
# NOTE(review): this is the same path `final` was read from earlier in the
# notebook, so the input file is overwritten; re-running the notebook from the
# top would merge coordinates into a file that already has `long`/`lat`
# columns — confirm this is intended.
final.to_csv('data/final_combined_dataset.csv', index=False)

In [13]:
# Load the dataset the spatial feature is computed on.
# NOTE(review): this reads 'data/combined_dataset.csv', while the enriched
# frame was saved to 'data/final_combined_dataset.csv' — confirm this file
# contains the `city`, `week`, `lat`, `long` and `cases_per_100k` columns
# that the code below relies on.
data = pd.read_csv('data/combined_dataset.csv')


# Geodesic distances already computed, keyed by the ordered pair of
# (lat, long) tuples. The same coordinate pairs recur for every week, so each
# pair only needs one (slow) geodesic evaluation for the whole run.
_DISTANCE_CACHE = {}


def _distance_km(origin, destination):
    """Return the geodesic distance in km between two (lat, long) points, memoised."""
    key = (origin, destination)
    if key not in _DISTANCE_CACHE:
        _DISTANCE_CACHE[key] = geodesic(origin, destination).kilometers
    return _DISTANCE_CACHE[key]


def calculate_weighted_cases(row, weekly_data, top_k=3):
    """Inverse-distance-weighted average of ``cases_per_100k`` over the nearest cities.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        The city being scored; must provide ``'city'``, ``'lat'`` and ``'long'``.
    weekly_data : pandas.DataFrame
        Rows to search for neighbours (the caller passes all rows of one
        week); must have the columns ``'city'``, ``'lat'``, ``'long'`` and
        ``'cases_per_100k'``. Rows whose ``'city'`` equals ``row['city']``
        are ignored.
    top_k : int, default 3
        Number of nearest neighbours to average over.

    Returns
    -------
    float or int
        ``sum(cases_i / d_i) / sum(1 / d_i)`` over the ``top_k`` nearest
        neighbours, where ``d_i`` is the geodesic distance in kilometres.
        If one or more of those neighbours sit at distance zero, their plain
        mean is returned instead (the limit of the inverse-distance weights).
        ``0`` is returned when there is no neighbour at all.
    """
    base_coords = (row['lat'], row['long'])

    # Walk the needed columns directly; this avoids building a Series per row.
    candidates = zip(
        weekly_data['city'],
        weekly_data['cases_per_100k'],
        weekly_data['lat'],
        weekly_data['long'],
    )
    distances = [
        (city, cases, _distance_km(base_coords, (lat, lon)))
        for city, cases, lat, lon in candidates
        if city != row['city']
    ]

    # Sort by distance and take the top_k nearest cities
    nearest = sorted(distances, key=lambda entry: entry[2])[:top_k]
    if not nearest:
        return 0

    # A co-located neighbour would have an infinite weight (and 1 / 0 raises
    # ZeroDivisionError), so it dominates the average entirely.
    colocated = [cases for _, cases, dist in nearest if dist == 0]
    if colocated:
        return sum(colocated) / len(colocated)

    # Weighted cases: sum(weighted cases) / sum(weights), weights = 1 / distance
    weight_total = sum(1 / dist for _, _, dist in nearest)
    return sum(cases / dist for _, cases, dist in nearest) / weight_total


# Calculate nearby_cases_weighted
# Results are produced week by week, which is generally NOT the row order of
# `data`; each value is therefore stored together with its row label so it can
# be aligned back to the right row (a bare list would be assigned positionally
# and end up on the wrong rows unless `data` were already contiguous by week).
# NOTE(review): rows are grouped on the 'week' value alone; if 'week' is a
# week-of-year number rather than a unique week identifier, rows from
# different years are pooled together — confirm.
nearby_weighted_cases = []
row_labels = []
for week, weekly_data in data.groupby('week', sort=False):
    print(f'Calculating weighted cases for week {week}')
    for label, row in weekly_data.iterrows():
        row_labels.append(label)
        nearby_weighted_cases.append(calculate_weighted_cases(row, weekly_data))

# Add the new column to the dataset, aligned on the row labels of `data`
# (relies on `data` having a unique index, as it does straight after read_csv).
data['nearby_cases_weighted'] = pd.Series(
    nearby_weighted_cases, index=row_labels, dtype='float64'
)

data.to_csv('data/combined_dataset.csv', index=False)

KeyboardInterrupt: 