In [2]:
library(zoo)
library(dplyr)
library("scales")
library(glmnet)
library(caret)

In [3]:
# Load the raw competition files from the working directory:
# training features, training labels, and the test-set features.
df_train_features <- read.csv("dengue_features_train.csv")
df_train_labels <- read.csv("dengue_labels_train.csv")
df_test_features <- read.csv("dengue_features_test.csv")


In [5]:
# Drop the week_start_date column from both training tables.
df_train_features[["week_start_date"]] <- NULL
df_train_labels[["week_start_date"]] <- NULL

# Convert the Kelvin temperature columns to Celsius (subtract 273.15).
# The column names keep their "_k" suffix.
kelvin_cols <- c(
  "reanalysis_min_air_temp_k",
  "reanalysis_max_air_temp_k",
  "reanalysis_dew_point_temp_k",
  "reanalysis_air_temp_k"
)
df_train_features[kelvin_cols] <- df_train_features[kelvin_cols] - 273.15

# Split features and labels by city ("sj" and "iq").
df_train_features_sj <- subset(df_train_features, city == "sj")
df_train_features_iq <- subset(df_train_features, city == "iq")

df_train_labels_sj <- subset(df_train_labels, city == "sj")
df_train_labels_iq <- subset(df_train_labels, city == "iq")

# The city column is constant within each split, so remove it from the features.
df_train_features_sj <- dplyr::select(df_train_features_sj, -city)
df_train_features_iq <- dplyr::select(df_train_features_iq, -city)

# Impute missing values with zoo::na.aggregate (column mean by default).
df_train_features_sj <- na.aggregate(df_train_features_sj)
df_train_features_iq <- na.aggregate(df_train_features_iq)

# Remove the features the analysis treats as correlated (San Juan data only).
df_train_features_sj <- dplyr::select(
  df_train_features_sj,
  -reanalysis_avg_temp_k,
  -reanalysis_sat_precip_amt_mm,
  -reanalysis_specific_humidity_g_per_kg
)



In [6]:
# Min-max scale every column to the range [0, 1].
# apply() coerces the data frame to a matrix, so both results are matrices.
scale_to_unit <- function(v) {
  bounds <- range(v)
  (v - bounds[1]) / (bounds[2] - bounds[1])
}
df_train_features_sj <- apply(df_train_features_sj, MARGIN = 2, FUN = scale_to_unit)
df_train_features_iq <- apply(df_train_features_iq, MARGIN = 2, FUN = scale_to_unit)

In [47]:
# Design matrix (San Juan features) and response.
# The response is column 4 of the labels table -- presumably total_cases;
# TODO confirm against the labels file.
x <- as.matrix(df_train_features_sj)
y <- as.matrix(df_train_labels_sj[, 4])

# Features and labels must describe the same rows, in the same order.
stopifnot(nrow(x) == nrow(y))

## 75% of the sample size
smp_size <- floor(0.75 * nrow(x))
smp_size_y <- smp_size

## set the seed to make the partition reproducible
set.seed(2)

# Draw ONE set of row indices and use it for both x and y. Sampling x and y
# independently (as before) pairs each feature row with an unrelated label,
# which silently breaks every model trained on the split.
train_ind <- sample(seq_len(nrow(x)), size = smp_size)
train_ind_y <- train_ind # kept as an alias for any later code using this name

x_train <- x[train_ind, , drop = FALSE]
x_test <- x[-train_ind, , drop = FALSE]

# y is a one-column matrix; indexing without drop = FALSE yields plain vectors.
y_train <- y[train_ind_y, ]
y_test <- y[-train_ind_y, ]



In [49]:
# Combine features and response into one matrix. cbind() only uses an
# argument's name as the column name when that argument is a vector; y is a
# matrix, so it is flattened first to make the "total_cases" name take effect.
s <- cbind(x, total_cases = as.vector(y))

In [52]:
# The response was bound after the features, so it is the last column of s.
# Using ncol(s) instead of a fixed position keeps this correct if the number
# of feature columns changes.
colnames(s)[ncol(s)] <- "total_cases"

In [63]:
# Randomly shuffle the rows
yourData <- s[sample(nrow(s)), ]

# Create 10 (approximately) equally sized folds: one fold id per row
folds <- cut(seq_len(nrow(yourData)), breaks = 10, labels = FALSE)

# Perform 10-fold cross-validation.
# The partitions are taken from the shuffled data (yourData); slicing the
# unshuffled matrix s would make the shuffle above pointless.
# NOTE: the loop only builds the train/test partitions; no model is fitted or
# scored here, so after the loop testData/trainData hold the last fold.
for (i in 1:10) {
  # Rows belonging to fold i form the test set; all other rows form the training set
  testIndexes <- which(folds == i)
  testData <- yourData[testIndexes, ]
  trainData <- yourData[-testIndexes, ]
}

In [9]:
# Ridge regression (alpha = 0)

set.seed(1)
# Tune lambda by cross-validation on the TRAINING rows only. Tuning on the
# full x/y would let the hold-out rows influence lambda and bias the score.
cv.out <- cv.glmnet(x_train, y_train, alpha = 0, type.measure = "mae")
bestlam <- cv.out$lambda.min

ridge_mod <- glmnet(x_train,
                    y_train,
                    alpha = 0,
                    lambda = bestlam)

ridge_pred <- predict(ridge_mod, s = bestlam, newx = x_test)

# Mean absolute error of a vector (or matrix) of residuals.
mae <- function(error) mean(abs(error))

score <- mae(y_test - ridge_pred)

# Report the MAE itself. Taking sqrt() of an MAE is not a meaningful metric
# (the square root belongs to RMSE, which is sqrt of the mean SQUARED error).
score

In [10]:
# Lasso regression (alpha = 1)

set.seed(1)
# Tune lambda by cross-validation on the TRAINING rows only. Tuning on the
# full x/y would let the hold-out rows influence lambda and bias the score.
cv.out <- cv.glmnet(x_train, y_train, alpha = 1, type.measure = "mae")

bestlam <- cv.out$lambda.min

lasso_mod <- glmnet(x_train,
                    y_train,
                    alpha = 1,
                    lambda = bestlam)
lasso_pred <- predict(lasso_mod, s = bestlam, newx = x_test)

# mae() is the helper defined in the ridge-regression cell.
score <- mae(y_test - lasso_pred)

# Report the MAE itself; sqrt() of an MAE is not a meaningful metric.
score

In [9]:
# k-nearest-neighbours regression (caret::knnreg) with k = 5 neighbours
knn_mod <- knnreg(x_train, y_train, k = 5)
knn_pred <- predict(knn_mod, newdata = x_test)

# mae() is the helper defined in the ridge-regression cell.
score <- mae(y_test - knn_pred)

# Report the MAE itself; sqrt() of an MAE is not a meaningful metric.
score