# **DengAI: Predicting Disease Spread**


In [None]:
# Import the libraries used throughout the notebook
import pandas as pd
from prophet import Prophet
import warnings
warnings.filterwarnings("ignore")  # suppress every warning raised from this point on
from prophet.plot import plot_plotly, plot_components_plotly
import numpy as np




# Reading Data


In [None]:
# Load the three raw dataset files from the working directory.
train_df, label_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "dengue_features_train.csv",
        "dengue_labels_train.csv",
        "dengue_features_test.csv",
    )
)
# The first three columns of the submission template form its row index.
submission = pd.read_csv("submission_format.csv", index_col=[0, 1, 2])

In [None]:
# Take independent copies so the frames read from disk stay unmodified.
df, df_label, df_test = (raw.copy() for raw in (train_df, label_df, test_df))

In [None]:
# Report (rows, columns) for each working copy.
print(f"train_df {df.shape}")
print(f"label_df {df_label.shape}")
print(f"test_df {df_test.shape}")

train_df (1456, 24)
label_df (1456, 4)
test_df (416, 24)


# Data Preprocessing

In [None]:
def preprocess_data(data_path, labels_path=None):
    """Load a feature CSV and split it into per-city frames.

    Parameters
    ----------
    data_path : str or path-like
        CSV of weekly features. Must contain ``city`` and
        ``week_start_date`` columns.
    labels_path : str or path-like, optional
        CSV of labels. When given, its ``total_cases`` column is attached to
        the features by row position (index), so both files must list the
        same rows in the same order.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(sj, iq)``: the rows for city ``'sj'`` (San Juan) and ``'iq'``
        (Iquitos), with the ``city`` column dropped, the original row index
        kept, and missing values filled within each city.
    """
    # load data
    df = pd.read_csv(data_path)

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path)
        df = df.merge(labels['total_cases'], left_index=True, right_index=True)

    # Parse the week start dates so downstream code gets real datetimes.
    df['week_start_date'] = pd.to_datetime(df['week_start_date'])

    def _city_frame(city):
        """Return one city's rows with gaps filled from that city only."""
        rows = df[df['city'] == city].drop('city', axis=1)
        # Fill after splitting: filling the combined frame would carry the
        # last San Juan observation into leading gaps of the Iquitos rows.
        # bfill covers gaps at the very start of a city's series, which a
        # forward fill alone cannot reach.
        return rows.ffill().bfill()

    # separate san juan and iquitos
    return _city_frame('sj'), _city_frame('iq')

## Preprocess data for train data and label data

In [None]:
# Build the per-city training frames; passing the labels file attaches total_cases.
sj_data, iq_data = preprocess_data(
    "dengue_features_train.csv",
    "dengue_labels_train.csv",
)

In [None]:
# Report (rows, columns) for each city's training frame.
print(f"sj_data {sj_data.shape}")
print(f"iq_data {iq_data.shape}")

sj_data (936, 24)
iq_data (520, 24)


In [None]:
# Column dtypes and non-null counts for the San Juan training frame.
sj_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 936 entries, 0 to 935
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   year                                   936 non-null    int64         
 1   weekofyear                             936 non-null    int64         
 2   week_start_date                        936 non-null    datetime64[ns]
 3   ndvi_ne                                936 non-null    float64       
 4   ndvi_nw                                936 non-null    float64       
 5   ndvi_se                                936 non-null    float64       
 6   ndvi_sw                                936 non-null    float64       
 7   precipitation_amt_mm                   936 non-null    float64       
 8   reanalysis_air_temp_k                  936 non-null    float64       
 9   reanalysis_avg_temp_k                  936 non-null    float64   

In [None]:
# Missing values left in each San Juan training column after preprocessing.
sj_data.isna().sum()

year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
total_cases

In [None]:
# Column dtypes and non-null counts for the Iquitos training frame.
iq_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 936 to 1455
Data columns (total 24 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   year                                   520 non-null    int64         
 1   weekofyear                             520 non-null    int64         
 2   week_start_date                        520 non-null    datetime64[ns]
 3   ndvi_ne                                520 non-null    float64       
 4   ndvi_nw                                520 non-null    float64       
 5   ndvi_se                                520 non-null    float64       
 6   ndvi_sw                                520 non-null    float64       
 7   precipitation_amt_mm                   520 non-null    float64       
 8   reanalysis_air_temp_k                  520 non-null    float64       
 9   reanalysis_avg_temp_k                  520 non-null    float64

In [None]:
# Missing values left in each Iquitos training column after preprocessing.
iq_data.isna().sum()

year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
total_cases

### Preprocess data for test data

In [None]:
# Preprocess the test features (no labels file is passed, so no total_cases column)
sj_test_data, iq_test_data = preprocess_data("dengue_features_test.csv")

In [None]:
# Report (rows, columns) for each city's test frame.
print(f"sj_test_data {sj_test_data.shape}")
print(f"iq_test_data {iq_test_data.shape}")

sj_test_data (260, 23)
iq_test_data (156, 23)


In [None]:
# Column dtypes and non-null counts for the San Juan test frame.
sj_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 260 entries, 0 to 259
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   year                                   260 non-null    int64         
 1   weekofyear                             260 non-null    int64         
 2   week_start_date                        260 non-null    datetime64[ns]
 3   ndvi_ne                                260 non-null    float64       
 4   ndvi_nw                                260 non-null    float64       
 5   ndvi_se                                260 non-null    float64       
 6   ndvi_sw                                260 non-null    float64       
 7   precipitation_amt_mm                   260 non-null    float64       
 8   reanalysis_air_temp_k                  260 non-null    float64       
 9   reanalysis_avg_temp_k                  260 non-null    float64   

In [None]:
# Missing values left in each San Juan test column after preprocessing.
sj_test_data.isna().sum()

year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
dtype: int64

In [None]:
# Column dtypes and non-null counts for the Iquitos test frame.
iq_test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156 entries, 260 to 415
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   year                                   156 non-null    int64         
 1   weekofyear                             156 non-null    int64         
 2   week_start_date                        156 non-null    datetime64[ns]
 3   ndvi_ne                                156 non-null    float64       
 4   ndvi_nw                                156 non-null    float64       
 5   ndvi_se                                156 non-null    float64       
 6   ndvi_sw                                156 non-null    float64       
 7   precipitation_amt_mm                   156 non-null    float64       
 8   reanalysis_air_temp_k                  156 non-null    float64       
 9   reanalysis_avg_temp_k                  156 non-null    float64 

In [None]:
# Missing values left in each Iquitos test column after preprocessing.
iq_test_data.isna().sum()

year                                     0
weekofyear                               0
week_start_date                          0
ndvi_ne                                  0
ndvi_nw                                  0
ndvi_se                                  0
ndvi_sw                                  0
precipitation_amt_mm                     0
reanalysis_air_temp_k                    0
reanalysis_avg_temp_k                    0
reanalysis_dew_point_temp_k              0
reanalysis_max_air_temp_k                0
reanalysis_min_air_temp_k                0
reanalysis_precip_amt_kg_per_m2          0
reanalysis_relative_humidity_percent     0
reanalysis_sat_precip_amt_mm             0
reanalysis_specific_humidity_g_per_kg    0
reanalysis_tdtr_k                        0
station_avg_temp_c                       0
station_diur_temp_rng_c                  0
station_max_temp_c                       0
station_min_temp_c                       0
station_precip_mm                        0
dtype: int64

# Prophet


## San Juan

In [None]:
# !pip install Prophet

In [None]:
# Prophet expects the date column to be called 'ds' and the target 'y'.
sj_data = sj_data.rename(columns={'week_start_date': 'ds', 'total_cases': 'y'})
sj_test_data = sj_test_data.rename(columns={'week_start_date': 'ds'})

In [None]:
# Configure the San Juan model: multiplicative seasonality plus three weather
# regressors. add_regressor returns the model itself, so the calls chain.
sj_model = (
    Prophet(
        changepoint_prior_scale=0.1,
        seasonality_prior_scale=10,
        seasonality_mode='multiplicative',
    )
    .add_regressor('precipitation_amt_mm')  # precipitation amount
    .add_regressor('station_max_temp_c')    # maximum temperature
    .add_regressor('station_min_temp_c')    # minimum temperature
)

# Fit on the San Juan training frame (sj_data)
sj_model.fit(sj_data)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnogeem3h/so0rlkm3.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnogeem3h/pqtp8f45.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31803', 'data', 'file=/tmp/tmpnogeem3h/so0rlkm3.json', 'init=/tmp/tmpnogeem3h/pqtp8f45.json', 'output', 'file=/tmp/tmpnogeem3h/prophet_model8inr7i_z/prophet_model-20240214121516.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
12:15:16 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
12:15:17 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7ef47591e7a0>

In [None]:
# In-sample forecast: run the fitted San Juan model over its own training rows.
sj_data_forecast = sj_model.predict(sj_data)

# Preview the date ('ds'), point forecast ('yhat') and uncertainty bounds.
sj_data_forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,1990-04-30,-1.024264,-36.059677,32.759
1,1990-05-07,1.202011,-37.034857,37.133089
2,1990-05-14,2.20528,-29.593952,36.653831
3,1990-05-21,3.826355,-31.491801,42.463551
4,1990-05-28,5.790908,-30.463901,41.140069


In [None]:
# Plot the in-sample San Juan forecast with Prophet's Plotly helper.
plot_plotly(sj_model, sj_data_forecast)

In [None]:
# Forecast the San Juan test period with the fitted model.
sj_test_data_forecast = sj_model.predict(sj_test_data)

# Preview the date ('ds'), point forecast ('yhat') and uncertainty bounds.
sj_test_data_forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()


Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2008-04-29,4.728549,-30.089841,38.186197
1,2008-05-06,-0.692178,-34.714541,36.55967
2,2008-05-13,6.395853,-29.213681,40.027284
3,2008-05-20,8.362958,-28.095331,46.056501
4,2008-05-27,8.562768,-28.756046,45.585058


In [None]:
# Plot the San Juan test-period forecast with Prophet's Plotly helper.
plot_plotly(sj_model, sj_test_data_forecast)

## Iquitos

In [None]:
# Prophet expects the date column to be called 'ds' and the target 'y'.
iq_data = iq_data.rename(columns={'week_start_date': 'ds', 'total_cases': 'y'})
iq_test_data = iq_test_data.rename(columns={'week_start_date': 'ds'})

In [None]:
# Configure the Iquitos model: multiplicative seasonality plus three weather
# regressors. add_regressor returns the model itself, so the calls chain.
iq_model = (
    Prophet(
        changepoint_prior_scale=0.05,
        seasonality_prior_scale=0.1,
        seasonality_mode='multiplicative',
    )
    .add_regressor('precipitation_amt_mm')  # precipitation amount
    .add_regressor('station_max_temp_c')    # maximum temperature
    .add_regressor('station_min_temp_c')    # minimum temperature
)

# Fit on the Iquitos training frame (iq_data)
iq_model.fit(iq_data)

INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnogeem3h/xvbf4gcv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpnogeem3h/issnm_on.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=31852', 'data', 'file=/tmp/tmpnogeem3h/xvbf4gcv.json', 'init=/tmp/tmpnogeem3h/issnm_on.json', 'output', 'file=/tmp/tmpnogeem3h/prophet_modeldub8dxck/prophet_model-20240214121540.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
12:15:40 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
12:15:40 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


<prophet.forecaster.Prophet at 0x7ef47589e6b0>

In [None]:
# In-sample forecast: run the fitted Iquitos model over its own training rows.
iq_data_forecast = iq_model.predict(iq_data)

# Preview the date ('ds'), point forecast ('yhat') and uncertainty bounds.
iq_data_forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2000-07-01,0.632055,-11.536609,12.566457
1,2000-07-08,0.782832,-11.105271,13.14715
2,2000-07-15,0.828214,-10.710967,13.921856
3,2000-07-22,0.234938,-11.752729,12.618452
4,2000-07-29,0.76406,-11.027811,12.567206


In [None]:
# Plot the in-sample Iquitos forecast with Prophet's Plotly helper.
plot_plotly(iq_model, iq_data_forecast)

In [None]:
# Forecast the Iquitos test period with the fitted model.
iq_test_data_forecast = iq_model.predict(iq_test_data)

# Preview the date ('ds'), point forecast ('yhat') and uncertainty bounds.
iq_test_data_forecast.loc[:, ['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2010-07-02,5.449388,-6.627354,17.046471
1,2010-07-09,5.339308,-6.739953,17.604816
2,2010-07-16,6.118309,-5.738638,18.492245
3,2010-07-23,1.659096,-11.162941,14.586389
4,2010-07-30,3.371967,-8.748472,15.173642


In [None]:
# Plot the Iquitos test-period forecast with Prophet's Plotly helper.
plot_plotly(iq_model, iq_test_data_forecast)

# Submission

In [None]:
# Work on a copy of the submission template so the loaded frame is left as-is.
df_submission = submission.copy(deep=True)
df_submission.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_cases
city,year,weekofyear,Unnamed: 3_level_1
sj,2008,18,0
sj,2008,19,0
sj,2008,20,0
sj,2008,21,0
sj,2008,22,0


In [None]:
# Fill the template with the forecasts: San Juan values first, then Iquitos.
# This relies on the template listing its rows in that same order.
df_submission["total_cases"] = np.concatenate(
    (
        sj_test_data_forecast["yhat"].to_numpy(),
        iq_test_data_forecast["yhat"].to_numpy(),
    )
)

df_submission.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_cases
city,year,weekofyear,Unnamed: 3_level_1
sj,2008,18,4.728549
sj,2008,19,-0.692178
sj,2008,20,6.395853
sj,2008,21,8.362958
sj,2008,22,8.562768


In [None]:
# Convert the forecasts to valid case counts. Counts cannot be negative, so
# clamp at zero first, then round to the nearest integer. A bare astype(int)
# truncates toward zero (4.73 -> 4) and would keep negatives (-1.5 -> -1).
df_submission["total_cases"] = (
    df_submission["total_cases"].clip(lower=0).round().astype(int)
)
df_submission.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_cases
city,year,weekofyear,Unnamed: 3_level_1
sj,2008,18,4
sj,2008,19,0
sj,2008,20,6
sj,2008,21,8
sj,2008,22,8
sj,2008,23,7
sj,2008,24,5
sj,2008,25,15
sj,2008,26,14
sj,2008,27,11


In [None]:
# Write the submission to "Submission_Prophet.csv"; the index levels are
# kept and written as the leading columns.
df_submission.to_csv('Submission_Prophet.csv', index=True)

Submission score is 25.8582