In [None]:
library(MASS) #Efficient Statistical Computing
library(ggplot2) #Easy Data Visualization
library(readr) #Faster Data Inputs and Outputs
library(dplyr) #Easy Data Wrangling
library(statmod) #For Model Diagnostics

In [None]:
#setwd("......") #Please set your working directory to the folder where you have stored the data for today's session

In [None]:
#Read the Training and Test Data From the Current Working Directory
train <- read_csv(file = "./train.csv")
test <- read_csv(file = "./test.csv")

In [None]:
#Summary Statistics of All Variables
summary(train)

#Check Variable Types
str(train)

#Variable Names
names(train)

#Keep Only the Records With Positive Severity (Input to the Severity Models)
train_sev <- filter(train, severity > 0)

#Histogram of Claim Amounts
truehist(train$ClaimAmount)

#Histogram of Log of Claim Amounts (1 Is Added Before Taking the Log)
truehist(log(train$ClaimAmount + 1))

#Histogram of Non-Zero Claim Amounts
truehist(train[train$ClaimAmount > 0, ]$ClaimAmount)

#Histogram of Claim Counts
truehist(train$ClaimNb)

##Frequency Modelling

#Starting with Just Driver Age Band and Vehicle Age Band
#Poisson GLM With Log Link Fitted on the Training Data
glm_freq1 <- glm(
  formula = ClaimNb ~ DrivAgeBand + VehAgeBand,
  data = train,
  family = poisson(link = "log")
)
summary(glm_freq1)

#Quantile Residuals; Any NA or Infinite Entry Is Replaced by the Matching residuals() Value of the Same Model
resid <- qresiduals(glm_freq1)
resid <- ifelse(!is.finite(resid), residuals(glm_freq1), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on the Training Data, With a Horizontal Line at Zero
plot(predict(glm_freq1, train, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Claim Counts for the Test Data
test$ClaimNb_Predict1 <- predict(glm_freq1, test, type = "response")

#Adding Exposure
#Same Predictors as glm_freq1, With log(Exposure) Supplied as an Offset
glm_freq2 <- glm(
  formula = ClaimNb ~ DrivAgeBand + VehAgeBand,
  data = train,
  family = poisson(link = "log"),
  offset = log(Exposure)
)
summary(glm_freq2)

#Quantile Residuals of This Model
resid <- qresiduals(glm_freq2)
#The Fallback for NA or Infinite Entries Must Come From glm_freq2 Itself,
#Otherwise the Diagnostics Would Mix in Residuals of a Different Model
resid <- ifelse(is.na(resid) | is.infinite(resid), residuals(glm_freq2), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on the Training Data, With a Horizontal Line at Zero
plot(predict(glm_freq2, train, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Claim Counts for the Test Data
#NOTE(review): Presumably test Needs an Exposure Column for the Offset - Confirm
test$ClaimNb_Predict2 <- predict(glm_freq2, test, type = "response")

#Adding VehBrand
#Poisson GLM With Log Link, log(Exposure) Offset and Vehicle Brand as an Extra Predictor
glm_freq3 <- glm(
  formula = ClaimNb ~ DrivAgeBand + VehAgeBand + VehBrand,
  data = train,
  family = poisson(link = "log"),
  offset = log(Exposure)
)
summary(glm_freq3)

#Quantile Residuals of This Model
resid <- qresiduals(glm_freq3)
#The Fallback for NA or Infinite Entries Must Come From glm_freq3 Itself,
#Otherwise the Diagnostics Would Mix in Residuals of a Different Model
resid <- ifelse(is.na(resid) | is.infinite(resid), residuals(glm_freq3), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on the Training Data, With a Horizontal Line at Zero
plot(predict(glm_freq3, train, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Claim Counts for the Test Data
#NOTE(review): Presumably test Needs an Exposure Column for the Offset - Confirm
test$ClaimNb_Predict3 <- predict(glm_freq3, test, type = "response")

#Comparing All Models
#Observed Claim Counts in the Test Data, Then the Predictions of Each Frequency Model
test$ClaimNb %>% summary()
test$ClaimNb_Predict1 %>% summary()
test$ClaimNb_Predict2 %>% summary()
test$ClaimNb_Predict3 %>% summary()

##Severity Modelling

#Basic Model - Vehicle Age Band and Driver Age Band
#Gamma GLM With Log Link Fitted on the Positive-Severity Records (train_sev)
glm_sev1 <- glm(
  formula = severity ~ DrivAgeBand + VehAgeBand,
  data = train_sev,
  family = Gamma(link = "log")
)
summary(glm_sev1)

#Quantile Residuals of This Model
resid <- qresiduals(glm_sev1)
#The Fallback for NA or Infinite Entries Must Come From glm_sev1 Itself:
#A Model Fitted on train Has a Different Number of Rows Than train_sev,
#So Its Residuals Would Not Line Up With These Observations
resid <- ifelse(is.na(resid) | is.infinite(resid), residuals(glm_sev1), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on train_sev, With a Horizontal Line at Zero
plot(predict(glm_sev1, train_sev, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Severity for the Test Data
test$severity_predict1 <- predict(glm_sev1, test, type = "response")

#Add Claim Count as Weight
#Same Gamma GLM as glm_sev1, With ClaimNb as Prior Weights
#The Argument Is Spelled Out as weights Rather Than Relying on Partial Matching
glm_sev2 <- glm(
  formula = severity ~ DrivAgeBand + VehAgeBand,
  data = train_sev,
  family = Gamma(link = "log"),
  weights = ClaimNb
)
summary(glm_sev2)

#Quantile Residuals of This Model
resid <- qresiduals(glm_sev2)
#The Fallback for NA or Infinite Entries Must Come From glm_sev2 Itself:
#A Model Fitted on train Has a Different Number of Rows Than train_sev,
#So Its Residuals Would Not Line Up With These Observations
resid <- ifelse(is.na(resid) | is.infinite(resid), residuals(glm_sev2), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on train_sev, With a Horizontal Line at Zero
plot(predict(glm_sev2, train_sev, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Severity for the Test Data
test$severity_predict2 <- predict(glm_sev2, test, type = "response")

#Add Vehicle Brand
#Gamma GLM With Log Link, ClaimNb as Prior Weights and Vehicle Brand as an Extra Predictor
#The Argument Is Spelled Out as weights Rather Than Relying on Partial Matching
glm_sev3 <- glm(
  formula = severity ~ DrivAgeBand + VehAgeBand + VehBrand,
  data = train_sev,
  family = Gamma(link = "log"),
  weights = ClaimNb
)
summary(glm_sev3)

#Quantile Residuals of This Model
resid <- qresiduals(glm_sev3)
#The Fallback for NA or Infinite Entries Must Come From glm_sev3 Itself:
#A Model Fitted on train Has a Different Number of Rows Than train_sev,
#So Its Residuals Would Not Line Up With These Observations
resid <- ifelse(is.na(resid) | is.infinite(resid), residuals(glm_sev3), resid)

#Normal Q-Q Plot of the Residuals With a Reference Line
qqnorm(resid)
qqline(resid, col = "blue", lwd = 2)

#Residuals Against Fitted Values on train_sev, With a Horizontal Line at Zero
plot(predict(glm_sev3, train_sev, type = "response"), resid)
abline(h = 0, col = "red")

#Predicted Severity for the Test Data
test$severity_predict3 <- predict(glm_sev3, test, type = "response")

#Comparing All Severity Models
#Observed Severity in the Test Data: All Records, Then Only Records With Positive Severity
test$severity %>% summary()
test[test$severity > 0, ]$severity %>% summary()
#Predicted Severity From Each of the Three Severity Models
test$severity_predict1 %>% summary()
test$severity_predict2 %>% summary()
test$severity_predict3 %>% summary()

#Save the Test Data, Including the Prediction Columns, as a CSV File
test %>% write_csv("test_w_predictions.csv")