In [1]:
import sys
import os

# Get current working directory (e.g., notebook location)
notebook_dir = os.getcwd()

# Construct the path to the correct directory
parent_dir = os.path.abspath(os.path.join(notebook_dir, '../src'))

# Add it to sys.path
sys.path.insert(0, parent_dir)

# Now you can import the module
from functions.loading import load_csv
from functions.merging import merge_dataframes
from functions.imputation import impute_with_mean
from functions.remerge import remerge
from functions.train import train_model
from functions.test import test_model, get_min_from_csv
from functions.submission import create_submission
from functions.encoding import removal_nonnumeric_columns
from functions.split import split_data
from functions.predict import make_prediction
import pandas as pd
import numpy as np

In [2]:
# Read data 
train_features = load_csv("../data/01_raw/dengue_features_train.csv")
train_labels = load_csv("../data/01_raw/dengue_labels_train.csv")
test_features = load_csv("../data/01_raw/dengue_features_test.csv")

df = merge_dataframes(train_features, train_labels, test_features, on=["city", "weekofyear", "year"])

df.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5.0
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4.0
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6.0


In [3]:
df.describe()

Unnamed: 0,year,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
count,1872.0,1872.0,1635.0,1809.0,1849.0,1849.0,1857.0,1860.0,1860.0,1860.0,...,1860.0,1857.0,1860.0,1860.0,1817.0,1817.0,1849.0,1849.0,1845.0,1456.0
mean,2003.194979,26.489316,0.138588,0.129713,0.204663,0.202174,44.109278,298.72777,299.253955,295.284823,...,82.237158,44.109278,16.786638,4.952903,27.226651,8.004112,32.470795,22.16079,38.201897,24.675137
std,6.292268,15.006319,0.146413,0.125082,0.075064,0.085768,42.065868,1.38741,1.27251,1.528046,...,7.203803,42.065868,1.547336,3.545887,1.281254,2.205734,1.950483,1.613325,44.96136,43.596
min,1990.0,1.0,-0.4634,-0.4561,-0.015533,-0.063457,0.0,294.554286,294.892857,289.642857,...,57.787143,0.0,11.715714,1.357143,21.4,4.042857,26.7,14.2,0.0,0.0
25%,1999.0,13.75,0.0353,0.041225,0.153457,0.141014,9.43,297.678929,298.276786,294.164643,...,77.214643,9.43,15.590714,2.357143,26.328571,6.357143,31.1,21.1,8.8,5.0
50%,2004.0,26.0,0.127317,0.1152,0.197029,0.188229,36.64,298.617143,299.3,295.667143,...,80.307143,36.64,17.137143,2.871429,27.433333,7.2,32.8,22.2,23.8,12.0
75%,2008.0,39.0,0.252225,0.22336,0.250857,0.248086,67.5,299.891429,300.271429,296.502857,...,86.799286,67.5,18.028929,7.757143,28.2,9.625,33.9,23.3,51.5,28.0
max,2013.0,53.0,0.508357,0.649,0.538314,0.546017,390.6,302.2,303.328571,298.45,...,98.61,390.6,20.461429,16.028571,30.8,15.8,42.2,26.7,543.3,461.0


# Cyclical Encoding

In [4]:
# Ensure date is in datetime format
df['week_start_date'] = pd.to_datetime(df['week_start_date'])

# Extract date components
df['month'] = df['week_start_date'].dt.month
df['weekofyear'] = df['week_start_date'].dt.isocalendar().week
df['dayofweek'] = df['week_start_date'].dt.dayofweek

# Cyclical encoding
def add_cyclical_features(df, col, period):
    df[f'{col}_sin'] = np.sin(2 * np.pi * df[col] / period)
    df[f'{col}_cos'] = np.cos(2 * np.pi * df[col] / period)

add_cyclical_features(df, 'month', 12)
add_cyclical_features(df, 'weekofyear', 52)
add_cyclical_features(df, 'dayofweek', 7)

# Lag Features

In [None]:
# Columns to create lag/rolling features for
lag_features = [
    'station_max_temp_c', 'station_min_temp_c', 'station_avg_temp_c', 'station_precip_mm',
    'station_diur_temp_rng_c', 'precipitation_amt_mm', 'reanalysis_sat_precip_amt_mm',
    'reanalysis_dew_point_temp_k', 'reanalysis_air_temp_k', 'reanalysis_relative_humidity_percent',
    'reanalysis_specific_humidity_g_per_kg', 'reanalysis_precip_amt_kg_per_m2',
    'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k', 'reanalysis_avg_temp_k',
    'reanalysis_tdtr_k', 'ndvi_se', 'ndvi_sw', 'ndvi_ne', 'ndvi_nw'
]

lags = [1, 2, 3]
roll_windows = [3, 5]

# Sort by city and time
df = df.sort_values(['city', 'week_start_date'])

# Apply lags and rolling means per city
for col in lag_features:
    for lag in lags:
        df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
    for window in roll_windows:
        df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())

# Sort by time
df.sort_values(by=['city', 'week_start_date'], ascending=[False, True], inplace=True)

  df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())
  df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_lag{lag}'] = df.groupby('city')[col].shift(lag)
  df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())
  df[f'{col}_rollmean{window}'] = df.groupby('city')[col].transform(lambda x: x.rolling(window).mean())


# Train and test

In [6]:
df = impute_with_mean(df, "total_cases")

df = removal_nonnumeric_columns(df)

train_X, test_X, train_y, test_y,validation = split_data(df)

model = train_model(train_X, train_y)

current_MAE = test_model(model, test_X, test_y)
# baseline: 14.629001914370031
# with new features: 11.263847476740525

Mean absolute Error is 14.140981322443796


FileNotFoundError: [Errno 2] No such file or directory: 'logs/MAEs.csv'

In [7]:
#Submission
X = remerge(train_X, test_X)
y = remerge(train_y, test_y)

final_model = train_model(X, y)
predictions = make_prediction(validation, final_model)

create_submission(predictions, validation, "lag_roll_features_30_04_1816.csv")


Submission file created at lag_roll_features_30_04_1816.csv
