In [71]:
import numpy as np
import pandas as pd

In [72]:
df = pd.read_csv('data/final_combined_dataset.csv')

In [73]:
# geocode_list = [3303807]
# df = df[df['geocode'].isin(geocode_list)]

In [74]:
df['week'] = pd.to_numeric(df['week'], errors='coerce')
df['date'] = pd.to_datetime(df['date'])

# Add date_ordinal
df['date_ordinal'] = df['date'].apply(lambda x: x.toordinal())

# Extract year and month
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

df['week'] = df['week'].astype(int) % 100

# sum of cases each year
cases_sum = df.groupby(['year'])['cases'].sum().reset_index()

# Add cyclic month representation
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Add cyclic week representation
df['week_sin'] = np.sin(2 * np.pi * df['week'] / 52)
df['week_cos'] = np.cos(2 * np.pi * df['week'] / 52)

In [75]:
# Step 2: Create lag features
def create_lags(dataframe, group_col, target_col, lags, inplace = False):
    if isinstance(target_col, list):  # If target_col is a list of columns
        for col in target_col:
            for lag in lags:
                if inplace:
                    dataframe[target_col] = dataframe.groupby(group_col)[col].shift(lag)
                else:
                    dataframe[f'{col}_lag{lag}'] = dataframe.groupby(group_col)[col].shift(lag)
    else:  # If target_col is a single column
        for lag in lags:
            if inplace:
                dataframe[target_col] = dataframe.groupby(group_col)[target_col].shift(lag)
            else:
                dataframe[f'{target_col}_lag{lag}'] = dataframe.groupby(group_col)[target_col].shift(lag)
    return dataframe

# Lag cases by 1 and 2 weeks
data = create_lags(df, group_col='city', target_col='cases', lags=[0, 1])

# Lag weather-related variables by 5 and 6 weeks for each city
weather_columns = ['tempe_min', 'temp_avg', 'humidity_avg', 'precipitation_avg_ordinary_kriging']
data = create_lags(data, group_col='city', target_col=weather_columns, lags=[3, 4])

# lag cases -4 weeks
data = create_lags(data, group_col='city', target_col='cases', lags=[-2], inplace = True)

data = data.dropna().reset_index(drop=True)

In [76]:
from sklearn.preprocessing import MinMaxScaler

# Step 4: Scale continuous variables, including lagged variables
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

continuous_vars = ['tempe_min', 'temp_avg', 'temp_max', 'humidity_max', 'humidity_avg', 'humidity_min', 'vim'
                   'precipitation_avg_ordinary_kriging', 'precipitation_max_ordinary_kriging', 'cases',
                   'precipitation_avg_regression_kriging', 'precipitation_max_regression_kriging',
                   'nearby_cases_weighted']
# Include lagged variables in the scaling process
lagged_vars = [col for col in data.columns if '_lag' in col]
scaler_vars = continuous_vars + lagged_vars

In [77]:
# Step 5: Split the data into training and testing sets based on time
train_data = data[(data['year'] >= 2012) & (data['year'] <= 2020)]
test_data = data[(data['year'] > 2020)]

selected_columns = [
    'month_sin', 'month_cos', 'week_sin', 'week_cos', 'city', 'week', 'year', 'population',
    'cases_lag0', 'cases_lag1',
     'temp_avg_lag3', 'humidity_avg_lag3', 'precipitation_avg_ordinary_kriging_lag3', 'temp_avg_lag4', 'humidity_avg_lag4', 'precipitation_avg_ordinary_kriging_lag4', 'vim', 'nearby_cases_weighted'
]

scaler_vars = list(set(selected_columns) & set(scaler_vars))

X_train = train_data[selected_columns]
y_train = train_data['cases']

X_test = test_data[selected_columns]
y_test = test_data['cases']

# scale
X_train[scaler_vars] = feature_scaler.fit_transform(X_train[scaler_vars])
X_test[scaler_vars] = feature_scaler.transform(X_test[scaler_vars])

# scale target
y_train = target_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = target_scaler.transform(y_test.values.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[scaler_vars] = feature_scaler.fit_transform(X_train[scaler_vars])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[scaler_vars] = feature_scaler.transform(X_test[scaler_vars])


In [78]:
# from sklearn.preprocessing import LabelEncoder
# 
# # Ensure you're working with a copy of the DataFrame if necessary
# X_train = X_train.copy()  # If it's a slice, we create a new copy
# X_test = X_test.copy()    # Same here
# 
# # Step 1: Encode the city column
# label_encoder = LabelEncoder()
# X_train['city_encoded'] = label_encoder.fit_transform(X_train['city'])
# X_test['city_encoded'] = label_encoder.transform(X_test['city'])
# 
# # Step 2: Drop the original city column
# X_train = X_train.drop(columns=['city'])
# X_test = X_test.drop(columns=['city'])

# Convert city column to category dtype
X_train['city'] = X_train['city'].astype('category')
X_test['city'] = X_test['city'].astype('category')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['city'] = X_train['city'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['city'] = X_test['city'].astype('category')


In [79]:
# Step 6: Prepare LightGBM datasets
# train_dataset = lgb.Dataset(X_train, label=y_train)
# test_dataset = lgb.Dataset(X_test, label=y_test, reference=train_dataset)

In [80]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.seasonal import STL

def detrend_series(series, period):
    stl = STL(series, seasonal=period)
    result = stl.fit()
    return result.trend, series - result.trend


period = y_train.index.freq.n
y_train_trend, y_train_residual = detrend_series(y_train, period)
y_test_trend, y_test_residual = detrend_series(y_test, period)

# Apply STL decomposition to each feature in X
X_train_residual = X_train.copy()
X_test_residual = X_test.copy()
X_train_trend = X_train.copy()
X_test_trend = X_test.copy()

for col in X_train.columns:
    X_train_trend[col], X_train_residual[col] = detrend_series(X_train[col])
    X_test_trend[col], X_test_residual[col] = detrend_series(X_test[col])

# Create LightGBM datasets with detrended data
train_dataset = lgb.Dataset(X_train_residual, label=y_train_residual)
test_dataset = lgb.Dataset(X_test_residual, label=y_test_residual)

# LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Train the model
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)]
model = lgb.train(params, train_dataset, valid_sets=[train_dataset, test_dataset], num_boost_round=1000, callbacks=callbacks)

# Predict residuals
train_pred_residual = model.predict(X_train_residual)
test_pred_residual = model.predict(X_test_residual)

# Add back the trend to get final predictions
y_train_pred = train_pred_residual + y_train_trend
y_test_pred = test_pred_residual + y_test_trend

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train RMSE: {train_rmse:.4f}, Train R²: {train_r2:.4f}')
print(f'Test RMSE: {test_rmse:.4f}, Test R²: {test_r2:.4f}')

AttributeError: 'numpy.ndarray' object has no attribute 'index'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create a DataFrame for feature importance
importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importance()
})

# Sort by importance
importance = importance.sort_values(by='Importance', ascending=False)

# Print top features in a neat table
print("Top 10 Most Important Features:")
print(importance.head(10))

# Visualize feature importance
plt.figure(figsize=(10, 14))
plt.barh(importance['Feature'], importance['Importance'], color='skyblue')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # Invert y-axis to show the highest importance at the top
plt.tight_layout()
plt.show()


In [None]:
rmse = np.sqrt(mean_squared_error(y_test_actual, y_test_pred))
print(f'RMSE: {rmse}')

# Step 2: Plot the graph
plt.figure(figsize=(16, 8))
plt.plot(np.arange(len(y_test_actual)), y_test_actual, label='Actual Values', color='blue', alpha=0.7, linewidth=2)
plt.plot(np.arange(len(y_test_pred)), y_test_pred, label='Predicted Values', color='yellow', alpha=0.7, linewidth=2)
plt.xlabel('Index')
plt.ylabel('Dengue Cases')
plt.title('Actual vs Predicted Dengue Cases')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# Assuming 'year' and 'week' are present in X_test
cities = X_test['city'].unique()

for city in cities:
    # Filter data for the city
    city_indices = X_test['city'] == city
    y_actual_city = y_test_actual[city_indices]
    y_pred_city = y_test_pred[city_indices]

    # Combine year and week into 'YYYY-WW' format
    weeks_city = X_test.loc[city_indices, 'year'].astype(str) + '-' + X_test.loc[city_indices, 'week'].astype(str)

    # Skip the city if the maximum actual dengue cases are less than 10
    if np.max(y_actual_city) < 10:
        continue

    # Compute RMSE for the city
    rmse_city = np.sqrt(mean_squared_error(y_actual_city, y_pred_city))
    print(f'City: {city}, RMSE: {rmse_city}')

    # Plot for the city with weeks on the x-axis
    plt.figure(figsize=(16, 8))
    plt.plot(weeks_city, y_actual_city, label='Actual Values', color='blue', alpha=0.7, linewidth=2)
    plt.plot(weeks_city, y_pred_city, label='Predicted Values', color='orange', alpha=0.7, linewidth=2)
    plt.xlabel('Week')
    plt.ylabel('Dengue Cases')
    plt.title(f'Actual vs Predicted Dengue Cases for {city}')
    plt.legend()
    plt.grid(True)

    # Select every 4th week for x-axis labels
    ticks = weeks_city.iloc[::4]  # Select every 4th week
    plt.xticks(ticks, rotation=45)  # Rotate x-axis labels for readability
    plt.show()
