In [173]:
library(zoo)
library(dplyr)
library("scales")
library(glmnet)

In [174]:
# Load the training features, the training labels and the test features
# from CSV files in the working directory.
df_train_features <- read.csv("dengue_features_train.csv")
df_train_labels <- read.csv("dengue_labels_train.csv")
df_test_features <- read.csv("dengue_features_test.csv")


In [175]:
# Remove the week_start_date column from the features and the labels.
df_train_features$week_start_date <- NULL
df_train_labels$week_start_date <- NULL

# Shift the Kelvin temperature columns to Celsius in one vectorised step.
# The column names keep their "_k" suffix.
kelvin_cols <- c(
  "reanalysis_min_air_temp_k",
  "reanalysis_max_air_temp_k",
  "reanalysis_dew_point_temp_k",
  "reanalysis_air_temp_k"
)
df_train_features[kelvin_cols] <- df_train_features[kelvin_cols] - 273.15

# Split features and labels into one data frame per city ("sj" and "iq").
# which() discards rows whose city is NA, as subset() does.
df_train_features_sj <-
  df_train_features[which(df_train_features$city == "sj"), ]
df_train_features_iq <-
  df_train_features[which(df_train_features$city == "iq"), ]

df_train_labels_sj <- df_train_labels[which(df_train_labels$city == "sj"), ]
df_train_labels_iq <- df_train_labels[which(df_train_labels$city == "iq"), ]

# The city column is constant within each split, so remove it from the
# feature frames.
df_train_features_sj <- dplyr::select(df_train_features_sj, -city)
df_train_features_iq <- dplyr::select(df_train_features_iq, -city)

# Replace missing values with the mean of their column.
df_train_features_sj <- na.aggregate(df_train_features_sj)
df_train_features_iq <- na.aggregate(df_train_features_iq)

# Drop the correlated features from the San Juan training data only.
df_train_features_sj <- dplyr::select(
  df_train_features_sj,
  -reanalysis_avg_temp_k,
  -reanalysis_sat_precip_amt_mm,
  -reanalysis_specific_humidity_g_per_kg
)



In [176]:
# Min-max scale every feature column onto the range 0 to 1.
# apply() over columns turns each data frame into a numeric matrix.
df_train_features_sj <- apply(
  df_train_features_sj,
  MARGIN = 2,
  FUN = function(column) {
    bounds <- range(column)
    (column - bounds[1]) / (bounds[2] - bounds[1])
  }
)
df_train_features_iq <- apply(
  df_train_features_iq,
  MARGIN = 2,
  FUN = function(column) {
    bounds <- range(column)
    (column - bounds[1]) / (bounds[2] - bounds[1])
  }
)

city,year,weekofyear,total_cases
sj,1990,18,4
sj,1990,19,5
sj,1990,20,4
sj,1990,21,3
sj,1990,22,6
sj,1990,23,2
sj,1990,24,4
sj,1990,25,5
sj,1990,26,10
sj,1990,27,6


In [177]:
# Design matrix: the scaled San Juan features.
x <- as.matrix(df_train_features_sj)
# Response: the fourth column of the San Juan labels as a plain double vector.
y <- as.double(df_train_labels_sj[[4]])


In [178]:
# Ridge regression (alpha = 0) with the penalty chosen by cross-validation.

set.seed(1)  # fixes the random fold assignment so the result is reproducible

# Cross-validate the ridge path on the training data, scoring folds by
# mean absolute error.
cv.out <- cv.glmnet(x, y, alpha = 0, type.measure = "mae")

# lambda.min is the penalty with the lowest cross-validated error (MAE here),
# not a training-set MSE.
bestlam <- cv.out$lambda.min
bestlam

# Predict on the training matrix at the selected penalty. The cv.glmnet
# object carries the full-data fit, so predicting from it avoids relying on
# a separate `ridge_mod` object that this script never creates.
ridge_pred <- predict(cv.out, newx = x, s = bestlam)


In [182]:
# Mean absolute error of the ridge predictions against the San Juan training
# labels, written out inline as mean(abs(residuals)).
score <- mean(abs(df_train_labels_sj$total_cases - ridge_pred))

In [180]:
# Mean Absolute Error.
#
# Args:
#   error: numeric vector (or matrix) of residuals, i.e. actual - predicted.
#
# Returns: the mean of the absolute residuals as a single number.
mae <- function(error) {
  mean(abs(error))
}

# Pick the best negative binomial model over a grid of starting values and
# refit it on all available data.
#
# For every value in `grid`, a negative binomial GLM is fit on `train` with
# that value as `init.theta`, and scored by mean absolute error on `test`.
# The value with the lowest error is then used to refit on train + test.
#
# Args:
#   train: data frame used to fit each candidate model.
#   test:  data frame used to score each candidate; needs a `total_cases`
#          column plus the predictors used by the formula.
#
# Returns: the glm.nb fit on rbind(train, test) with the best starting value.
#
# NOTE(review): `form` (the model formula) and `grid` (the candidate values)
# are not parameters; they are looked up in the calling environment and must
# be defined before this function is called. `mae()` must be defined too.
get_bst_model <- function(train, test) {
  # Start with no winner so the search does not depend on globals named
  # best_score / best_alpha.
  best_alpha <- NULL
  best_score <- Inf

  # Step 2: Find the best hyper parameter, alpha
  for (i in grid) {
    model <- MASS::glm.nb(
      formula = form,
      data = train,
      init.theta = i
    )

    # predict() on a GLM defaults to the link (log) scale; ask for the
    # response scale so predictions are comparable with the observed counts.
    results <- predict(model, newdata = test, type = "response")
    score <- mae(test$total_cases - results)

    # Skip candidates whose score is NA instead of failing inside if ().
    if (!is.na(score) && score < best_score) {
      best_alpha <- i
      best_score <- score
      cat('\nbest score = ', best_score, '\twith alpha = ', best_alpha)
    }
  }

  if (is.null(best_alpha)) {
    stop("no candidate in `grid` produced a usable score", call. = FALSE)
  }

  # Step 3: refit on entire dataset
  combined <- rbind(train, test)
  MASS::glm.nb(
    formula = form,
    data = combined,
    init.theta = best_alpha
  )
}