In [26]:
# Run configuration ----
# ModelName is the file-name prefix for the saved models and report CSVs.
ModelName        <- "wc_poissonglm_ofst_ecy"
# Input/output locations; both end with "/" because paths are built by
# plain string concatenation.
DataDir          <- "/home/kate/Research/Property/Data/"
ModelsDir        <- "/home/kate/Research/Property/Models/"
# When TRUE, a model file already on disk is loaded instead of refitted.
UseSavedIfExists <- FALSE

In [27]:
# Load shared helper functions into the session.
# NOTE(review): NormalizedWeightedGini is used below but not defined in this
# notebook, so it presumably comes from this file - confirm.
source('/home/kate/code/Utils/MyFunctions.R')

In [28]:
# Read the training and testing sets from DataDir (CSV with a header row).
training_dataset <- read.csv(
  file = paste0(DataDir, "property_wcf_training.csv"),
  header = TRUE
)
testing_dataset <- read.csv(
  file = paste0(DataDir, "property_wcf_testing.csv"),
  header = TRUE
)
# The prediction set is currently not loaded:
#prediction_dataset <- read.csv(paste(DataDir,"property_water_claims_non_cat_fs_v5.csv", sep = ""), header=TRUE)

In [29]:
# Model specification: response on the left, one predictor per line.
formula <- cova_ic_nc_water ~
  log_sqft +
  usagetype +
  log_property_age +
  log_water_risk_3_blk +
  cova_deductible +
  customer_cnt_active_policies

In [30]:
# Number of cross-validation folds. The modelling cell iterates over fold ids
# 0..kfold-1 and reads one "fold_<id>" column of training_dataset per fold.
kfold <- 5

In [31]:
# K-fold Poisson GLM: fit, score and summarise ----
#
# For every fold id k in 0..kfold-1 this cell
#   * fits a Poisson GLM (offset log_ecy) on the rows of training_dataset whose
#     column fold_<k> is > 0, or reloads it from ModelsDir when
#     UseSavedIfExists is TRUE and the .rds file exists;
#   * scores the full training and testing sets and stores the predictions in
#     columns poissonglm_ofst_ecy_<k>;
#   * adds pred / kfold to poissonglm_ofst_ecy_cv, so after the loop that
#     column holds the mean prediction over all fold models;
#   * records AIC, BIC, logLik and the normalized weighted Gini values.
# Results: models_attr (one row per fold) and models_coef (one row per
# coefficient per fold).
#
# NOTE(review): pred_train scores every training row, including the rows the
# fold model was fitted on, so poissonglm_ofst_ecy_cv and
# TrainNormalizedWeightedGini are not out-of-fold figures - confirm intended.

# Fold ids are zero-based. seq_len(kfold) - 1 states that explicitly; the
# previous `1:kfold-1` produced the same values only because `:` binds tighter
# than `-`.
fold_ids <- seq_len(kfold) - 1
fold_cols <- paste("fold", fold_ids, sep = "_")

# Fail early, with a clear message, when a fold indicator column is absent.
missing_fold_cols <- setdiff(fold_cols, names(training_dataset))
if (length(missing_fold_cols) > 0) {
  stop(
    "training_dataset is missing fold column(s): ",
    paste(missing_fold_cols, collapse = ", "),
    call. = FALSE
  )
}

# Prediction columns start at 0; the *_cv column is accumulated in the loop.
training_dataset$poissonglm_ofst_ecy_cv <- 0
testing_dataset$poissonglm_ofst_ecy_cv <- 0
for (fold in fold_ids) {
  training_dataset[[paste0("poissonglm_ofst_ecy_", fold)]] <- 0
  testing_dataset[[paste0("poissonglm_ofst_ecy_", fold)]] <- 0
}

# Per-fold results, preallocated (one slot per fold).
AIC_l <- vector("list", kfold)
BIC_l <- vector("list", kfold)
logLik_l <- vector("list", kfold)
NWG_Test <- vector("list", kfold)
NWG_Train <- vector("list", kfold)
coef_l <- vector("list", kfold)
folds_l <- vector("list", kfold)

for (fold in fold_ids) {
  i <- fold + 1 # one-based slot / reported fold number
  print(paste0("Fold ", fold, " of ", kfold, " currently processing..."))

  fold_flag <- training_dataset[[fold_cols[i]]]
  training_dataset_fold <- training_dataset[fold_flag > 0, ]
  # Kept for inspection only; nothing in this cell reads validation_dataset.
  validation_dataset <- training_dataset[fold_flag == 0, ]

  # Modeling: reuse the saved model only when explicitly allowed.
  ModelFile <- paste0(ModelsDir, ModelName, "_", fold, ".rds")
  if (UseSavedIfExists && file.exists(ModelFile)) {
    poissonglm <- readRDS(ModelFile)
  } else {
    poissonglm <- glm(
      formula,
      family = "poisson",
      data = training_dataset_fold,
      offset = log_ecy
    )
    saveRDS(poissonglm, ModelFile)
  }

  # Training prediction (whole training set)
  pred_train <- predict(poissonglm, training_dataset, type = "response")
  train_fold_NWG <- NormalizedWeightedGini(
    training_dataset$cova_ic_nc_water, pred_train, training_dataset$ecy
  )
  training_dataset[[paste0("poissonglm_ofst_ecy_", fold)]] <- pred_train
  training_dataset$poissonglm_ofst_ecy_cv <-
    training_dataset$poissonglm_ofst_ecy_cv + pred_train / kfold

  # Testing prediction
  pred_test <- predict(poissonglm, testing_dataset, type = "response")
  test_fold_NWG <- NormalizedWeightedGini(
    testing_dataset$cova_ic_nc_water, pred_test, testing_dataset$ecy
  )
  testing_dataset$poissonglm_ofst_ecy_cv <-
    testing_dataset$poissonglm_ofst_ecy_cv + pred_test / kfold
  testing_dataset[[paste0("poissonglm_ofst_ecy_", fold)]] <- pred_test

  # Model attributes
  folds_l[[i]] <- i
  AIC_l[[i]] <- AIC(poissonglm)
  BIC_l[[i]] <- BIC(poissonglm)
  logLik_l[[i]] <- as.numeric(logLik(poissonglm))
  NWG_Train[[i]] <- train_fold_NWG
  NWG_Test[[i]] <- test_fold_NWG

  # Model coefficients: summarise once and take the predictor names from the
  # row names of the coefficient matrix. The previous `[, 0]` selection was a
  # zero-column matrix and never produced a column.
  coef_table <- summary(poissonglm)$coefficients
  model_coef_fold <- data.frame(
    name = rownames(coef_table),
    fold = fold + 1,
    estimate = coef_table[, 1],
    StdError = coef_table[, 2],
    zValue = coef_table[, 3],
    valuePr = coef_table[, 4],
    row.names = NULL
  )
  coef_l[[i]] <- model_coef_fold
} # folds loop

# Main coefficient table: bind all folds once instead of growing it per fold.
models_coef <- if (length(coef_l) > 0) do.call(rbind, coef_l) else data.frame()

# Model attributes data frame (built once, after the loop).
models_fold_attr <- data.frame(
  fold = unlist(folds_l),
  AIC = unlist(AIC_l),
  BIC = unlist(BIC_l),
  logLik = unlist(logLik_l),
  TrainNormalizedWeightedGini = unlist(NWG_Train),
  TestNormalizedWeightedGini = unlist(NWG_Test)
)
models_attr <- models_fold_attr

[1] "Fold 0 of 5 currently processing..."
[1] "Fold 1 of 5 currently processing..."
[1] "Fold 2 of 5 currently processing..."
[1] "Fold 3 of 5 currently processing..."
[1] "Fold 4 of 5 currently processing..."


In [32]:
# Preview the first rows of the per-fold attribute table.
head(models_attr)

Unnamed: 0_level_0,fold,AIC,BIC,logLik,TrainNormalizedWeightedGini,TestNormalizedWeightedGini
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,92317.44,92462.31,-46146.72,0.3786799,0.4081385
2,2,92202.52,92347.39,-46089.26,0.378682,0.4089209
3,3,92297.82,92442.68,-46136.91,0.3787525,0.4088506
4,4,92269.46,92414.32,-46122.73,0.3788235,0.4091047
5,5,92432.24,92577.1,-46204.12,0.3782476,0.408287


In [33]:
# Save the per-fold attribute table as <ModelsDir><ModelName>_attr.csv
# (comma-separated, header row, no row names).
write.table(
  models_attr,
  file = paste0(ModelsDir, ModelName, "_attr.csv"),
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

In [34]:
# Preview the first rows of the per-fold coefficient table.
head(models_coef)

Unnamed: 0_level_0,name,fold,estimate,StdError,zValue,valuePr
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,(Intercept),1,-15.4860522,1.03826855,-14.915266,2.6222819999999997e-50
2,log_sqft,1,0.6986686,0.03119358,22.39783,4.132172e-111
3,usagetypePRIMARY,1,3.4031129,1.00012814,3.402677,0.0006672916
4,usagetypeRENTAL,1,3.0994353,1.00028965,3.098538,0.001944782
5,usagetypeSEASONAL,1,3.0556478,1.01841286,3.000402,0.002696236
6,usagetypeSECONDARY,1,1.9151446,1.15471427,1.658544,0.09720769


In [35]:
# Save the per-fold coefficient table as <ModelsDir><ModelName>_coef.csv
# (comma-separated, header row, no row names).
write.table(
  models_coef,
  file = paste0(ModelsDir, ModelName, "_coef.csv"),
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

Normalized weighted Gini (training set)

In [36]:
# Normalized weighted Gini of the fold-averaged prediction on the training
# set: observed response, prediction, then the ecy column.
NormalizedWeightedGini(
  training_dataset$cova_ic_nc_water,
  training_dataset$poissonglm_ofst_ecy_cv,
  training_dataset$ecy
)

Normalized weighted Gini (testing set)

In [37]:
# Normalized weighted Gini of the fold-averaged prediction on the testing
# set: observed response, prediction, then the ecy column.
NormalizedWeightedGini(
  testing_dataset$cova_ic_nc_water,
  testing_dataset$poissonglm_ofst_ecy_cv,
  testing_dataset$ecy
)

In [13]:
# Write the datasets, now carrying the prediction columns, back to DataDir.
# NOTE: the training and testing files are the same paths the notebook read
# from, so the input CSVs are overwritten in place.
write.table(
  training_dataset,
  paste0(DataDir, "property_wcf_training.csv"),
  sep = ",", col.names = TRUE, row.names = FALSE
)
write.table(
  testing_dataset,
  paste0(DataDir, "property_wcf_testing.csv"),
  sep = ",", col.names = TRUE, row.names = FALSE
)
# prediction_dataset is only written when it exists: the statement that loads
# it is commented out in the data-loading cell, and an unconditional write
# would stop with "object 'prediction_dataset' not found".
if (exists("prediction_dataset")) {
  write.table(
    prediction_dataset,
    paste0(DataDir, "property_water_claims_non_cat_fs.csv"),
    sep = ",", col.names = TRUE, row.names = FALSE
  )
}