6_Haul_QA

###################
# Version Control #
###################
# R version 3.2.2 (2015-08-14): Fire Safety 
# platform       x86_64-w64-mingw32 (64-bit)
###############
# Script Info #
###############
# This is Script 6 of X
# This Script follows the guidlines outlined in Section 4.1 HH Data - Haul Summary Information
# from: Moriarty and Greenstreet 2016 
# The next step is to model missing data in the haul data of the surveys, and plot 
# lots of graphs to check all the haul stuff is believable now!

# AUTHOR: Meadhbh Moriarty, 2016
# REVIEWED BY: Nicola Walker (Cefas) Jonas Hentati Sundberg (SLU)
########################
# Check all -9 are ~NA #
########################
# All -9 should be NA as -9 represents a missing value in DATRAS
# Funtion to replace multiple -9 across lots of data tables
hauls<-hauls1
system.time(replace_function(hauls))
names(hauls)
summary(hauls)
# resturcture data table correctly
numCols <- c("SweepLngt", "HaulNo", "HaulDur", "ShootLat", "ShootLong", "HaulLat",
             "HaulLong", "Depth", "Netopening",  "Distance", "Warplngt", 
             "DoorSpread", "WingSpread", "GroundSpeed" , "SpeedWater")
hauls[, (numCols) := lapply(.SD, as.numeric), .SDcols = numCols]
##########################
# 4.1.1 Sample Location #
#########################
# check shoot and haul match ICES Stat Rec
summary(as.factor(hauls$Stratum))
# 37283 NA's
summary(as.factor(hauls$StatRec))
# 113 NA's
# ices<-read.csv("./Regional Boundaries Maps and Data/ICES_rectangles_statistics/Ices_rect_table.csv")
# names(ices)
# shoot longs that are NA should be -9 , silly DATRAS -9 thing messing stuff up here!
long_error<-hauls[is.na(hauls$ShootLong)]
list<-long_error$UniqueID
hauls$ShootLong[hauls$UniqueID%in%list]<--9
# check haul positions match ICES Stats
names(hauls)
lat<-hauls$ShootLat
hauls$check_StatRec<- ices.rect2(hauls$ShootLong, hauls$ShootLat)
summary(as.factor(hauls$check_StatRec))
summary(as.factor(hauls$Survey))
check<-subset(hauls, !(hauls$check_StatRec%in%hauls$StatRec))
check<-hauls[which(hauls$check_StatRec!=hauls$StatRec),]
check<-subset(hauls, is.na(hauls$Stratum),)
summary(as.factor(check$Survey))
summary(as.factor(hauls$check_StatRec))
# use Check StatRec col = mistakes in actual Stat Rec.
# some mis-matches - use check_StatRec for correct Stat Rec
# need to assign stratum based on survey!
# need maps
# New BaseMap
require(rgdal)
#load in basemap files for all maps
#set directory to working folder and insure subfolders are availible
europe<-readOGR("./Regional Boundaries Maps and Data/shapes//europe.dbf","europe")
contour_1000m<-readOGR("./Regional Boundaries Maps and Data/shapes//1000m.dbf","1000m")
NWW_boundary<-readOGR("./Regional Boundaries Maps and Data/shapes//NWW_boundary.dbf","NWW_boundary")
contour_200m<-readOGR(".//Regional Boundaries Maps and Data/shapes//200m.dbf","200m")
ospar<-readOGR("./Regional Boundaries Maps and Data/ospar_boundaries//OSPAR_inner_Boundary.dbf", "OSPAR_inner_Boundary")
spain<-readOGR("./Regional Boundaries Maps and Data/Sp-NGFS.WGS84//Sp_North.WGS84.dbf", "Sp_North.WGS84")
par(mai=c(0,0,0,0),bg='white' )
sco1<-readOGR("./Regional Boundaries Maps and Data/SWCQ1.WGS84//SWC_Q1.dbf", "SWC_Q1")
sco3<-readOGR("./Regional Boundaries Maps and Data/SWCQ4.WGS84//SWC_Q4.dbf", "SWC_Q4")
ices<-readOGR("./Regional Boundaries Maps and Data/ICES_rectangles_statistics/ICES_rectangles_statistics.dbf", "ICES_rectangles_statistics")
ire<-readOGR("./Regional Boundaries Maps and Data/IGFS.WGS84//IGFS.WGS84.dbf", "IGFS.WGS84")
ni<-readOGR("./Regional Boundaries Maps and Data/NI-IBTS.WGS84//NI_IBTS.WGS84.dbf", "NI_IBTS.WGS84")
evhoe<-readOGR("./Regional Boundaries Maps and Data/Fr-EVHOE.WGS84//EVHOE.WGS84.dbf", "EVHOE.WGS84")
rock<-readOGR("./Regional Boundaries Maps and Data/SWC-RockQ3.WGS84//SWC_Q3.dbf", "SWC_Q3")
summary(h$ShootLat)
summary(h$ShootLong)
#plot(contour_200m, add=TRUE, col="lightblue3")
#plot(contour_1000m, add=TRUE, col="lightblue4")
plot(NWW_boundary, border="white", xlim = c(-16,13), ylim = c(36, 62))
plot(europe,add=TRUE, xlim = c(-16, 13), ylim = c(36, 62), asp = 1, col=c('gray81'), 
     border='gray3')
# plot(contour_200m, add=TRUE, border="lightblue3")
# plot(contour_1000m, add=TRUE, border="lightblue4")
plot(ire, add=TRUE, lty=2, lwd=3, border="green")
plot(sco3, add=TRUE, lty=1, lwd=1, border="blue")
plot(evhoe, add=T, lty=1, lwd=1, border="red")
plot(ni, add=T, border="lightblue")
plot(rock, add=T, border="yellow")
dev.new()
plot(NWW_boundary, border="white", xlim = c(-16,13), ylim = c(36, 62))
plot(ices, add=TRUE)
plot(europe,add=TRUE, xlim = c(-16, 13), ylim = c(36, 62), asp = 1, col=c('gray81'), 
     border='gray3')
#text(ices$Centr_X, ices$Centr_Y, ices$ICESNAME)
plot(ospar, col="red", add=T, lwd=4)
check<-subset(hauls, !(hauls$check_StatRec%in%hauls$StatRec))
points(check$ShootLong, check$ShootLat, col="red")
dev.off()
# Station locations don't match because Spain uses Survey Strata - not a problem!
################
# 4.1.2  Depth #
################
hauls$EstDepth<-hauls$Depth
# 767 observations
# All observations with no depth have no haul positional data 
# estimate depth on shoot position is the best we can do
library(marmap)
summary(hauls$ShootLat)
summary(as.numeric(hauls$ShootLong))

# use NOAA website to get bathy map
papoue <- getNOAA.bathy(lon1 = -16, lon2 = 13,
                        lat1 = 36, lat2 = 62, resolution = .5)
summary(papoue)
# make a pretty map of all the stations
png(file="./Diagnostics/Diagnostic_plots/surveydepthmap.png")
blues <- c("lightsteelblue4", "lightsteelblue3",
           "lightsteelblue2", "lightsteelblue1")
greys <- c(grey(0.99), grey(0.95), grey(0.85))
plot(papoue, image = TRUE, land = FALSE, lwd = 0.03,
     bpal = list(c(0, max(papoue), greys),
                 c(min(papoue), 0, blues)))
plot(papoue, n = 1, lwd = 0.04, add = TRUE)
cols<-heat_hcl(13, c = c(80, 30), l=c(30,90), power=c(1/5, 1.5))
hauls$Survey<-(as.factor(hauls$Survey))
points(hauls$ShootLong, hauls$ShootLat, pch=19, 
       cex=0.3, col=cols[hauls$Survey])
dev.off()
# get 
hauls$ShootLong<-as.numeric(hauls$ShootLong)
NOAA_Depth<-get.depth(papoue, x=hauls$ShootLong, y=hauls$ShootLat, locator=FALSE)

hauls$HaulLong[hauls$UniqueID=="BTS-VIIa/2006/3/COR/80/BT4A"]<--5.368430

hauls$EstDepth<-(NOAA_Depth$depth*-1)

hauls$DepthNew<-hauls$Depth
hauls$DepthNew[is.na(hauls$Depth)]<-hauls$EstDepth[is.na(hauls$Depth)] 
summary(hauls$DepthNew)
# make a graph of all the difference between estimated and recorded depths
png(file = "./Diagnostics/Diagnostic_plots/depth_differences-09-29-2016.png", bg = "transparent")
plot(hauls$Depth, hauls$EstDepth, pc=19, xlab="Recorded Depth (m)",
     ylab="Estimated point depth (m)", ylim=c(0,800))
abline(a=0, b=1, col="red")
dev.off()
write.csv(papoue, "./Diagnostics/Diagnostic_plots/Bathy_map_09-29-2016.csv")

# add some diagnostics to check if the differences in depth are acceptable 
png(file = "./Diagnostics/Diagnostic_plots/map_depth_differences.png", bg = "transparent")
plot(papoue, image = TRUE, land = FALSE, lwd = 0.03,
     bpal = list(c(0, max(papoue), greys),
                 c(min(papoue), 0, blues)))
plot(papoue, n = 1, lwd = 0.04, add = TRUE)
cols<-heat_hcl(13, c = c(80, 30), l=c(30,90), power=c(1/5, 1.5))
hauls$Survey<-(as.factor(hauls$Survey))
hauls$Diff_dep<-sqrt((hauls$Depth-hauls$EstDepth)^2)
summary(hauls$Diff_dep)
radius<-sqrt(hauls$Diff_dep/pi)
symbols(hauls$ShootLong, hauls$ShootLat, circles=radius, inches=0.2, fg="white",
        bg=cols[hauls$Survey], xlab="Longitude", ylab="Latitude", add=T)
legend.text<-c(1,10,100,1000, 2000)
x<-c(9,9,9,9,9)
y<-c(50,49.8, 49.5, 48.5,47.5)
symbols(x,y, circles=sqrt(legend.text/pi), inches=0.2, fg="black",
        bg="grey", add=T)
text(9.9, 50, "1m")
text(9.9, 49.8, "10m")
text(9.9, 49.5, "100m")
text(9.9, 48.5, "1000m")
text(9.9, 47.5, "2000m")
dev.off()

png(file = "./Diagnostics/Diagnostic_plots/box_plot_depth_differences_survey_2-8-16.png", bg = "transparent")
plot(hauls$Survey,hauls$Diff_dep, col=cols[hauls$Survey], pch=19)
dev.off()
######################
# 4.1.3 Sweep Length #
######################
summary(as.factor(hauls$SweepLngt))
# Sweep Lenght Values are sometimes incorrect or missing, 
# delete incorrect values 
hauls$EstSweepLngt<-hauls$SweepLngt
hauls$EstSweepLngt[hauls$SweepLngt>121&hauls$Gear=="GOV"]<-"-9"
hauls$EstSweepLngt[is.na(hauls$SweepLngt)]<-"-9"
summary(as.factor(hauls$EstSweepLngt))

# find out extent of issue
sweepsummary<-ddply(hauls, c("Survey","Country", "Year", "Quarter", "EstSweepLngt"),
                    summarise, N=length(EstSweepLngt))
summary(as.factor(hauls$Survey))
# BTS Surveys don't record a sweep - should be NA
# ROT surveys - no sweep - should be NA
# NCT surveys - no sweeps 
hauls$EstSweepLngt[hauls$Survey=="BTS"]<-NA
hauls$EstSweepLngt[hauls$Survey=="BTS-VIIa"]<-NA
hauls$EstSweepLngt[hauls$Survey=="NIGFS"]<-NA
hauls$EstSweepLngt[hauls$Survey=="PT-IBTS"]<-NA

# in 1983/1984 no country recorded sweep, but in 1985+ countries 
# reported using "Recommended Sweeps" so gonna apply the 60/110 rule to these 
# years
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Year=="1983" & hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Year=="1983" &hauls$DepthNew>75]<-110
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Year=="1984" &hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Year=="1984" & hauls$DepthNew>75]<-110
# Denmark: Apply standard as in Manual
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="DEN" & hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="DEN" & hauls$DepthNew>75]<-110
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Year<2005 &
                     hauls$Quarter=="3" & hauls$Country=="DEN"]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Year>2004 &
                     hauls$Quarter=="3" & hauls$Country=="DEN"& 
                     hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="3"  
                   & hauls$Country=="DEN" & hauls$Year>2004 &
                     hauls$DepthNew>75]<-110

# Germany next - Q1 and Q 3 rules differ
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"& 
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="GFR" & hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="GFR" & hauls$DepthNew>75]<-110
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Year<2004 &
                     hauls$Quarter=="3" & hauls$Country=="GFR"]<-60
#Netherlands next only 2 years missing data in quarter 1
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"&
                     hauls$Quarter=="1" & hauls$Country=="NED"]<-60
#Norway next
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"&
                     hauls$Quarter=="3" & hauls$Country=="NOR"]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="NOR" & hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="NOR" & hauls$DepthNew>75]<-110
# Sweden
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"&
                     hauls$Quarter=="3" & hauls$Country=="SWE"]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="SWE" & hauls$DepthNew<76]<-60
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Quarter=="1" &
                     hauls$Country=="SWE" & hauls$DepthNew>75]<-110
# Scotland
hauls$EstSweepLngt[hauls$Survey=="NS-IBTS"&
                     hauls$EstSweepLngt=="-9"&
                     hauls$Quarter=="1" & hauls$Country=="SCO"]<-60

# SWC Survey
hauls$EstSweepLngt[hauls$Survey=="SWC-IBTS"&
                     hauls$EstSweepLngt=="-9"& hauls$Year<2011]<-60

# Rockall
hauls$EstSweepLngt[hauls$Survey=="ROCKALL"&
                     hauls$EstSweepLngt=="-9"& hauls$Year<2011]<-60
# Check all new sweeps
sweepsummary<-ddply(hauls, c("Survey","Country", "Year", "Quarter", "EstSweepLngt"),
                    summarise, N=length(EstSweepLngt))
#Assign cat of short or long to sweep length
summary(as.factor(hauls$EstSweepLngt))
hauls$EstSweepCat<-hauls$EstSweepLngt
hauls$EstSweepCat[hauls$EstSweepLngt<61]<-"short"
hauls$EstSweepCat[hauls$EstSweepLngt==97|hauls$EstSweepLngt==110|
                    hauls$EstSweepLngt==100|hauls$EstSweepLngt==120]<-"long"


sweepcatsummary<-ddply(hauls, c("Survey","Country", "Year", "Quarter", "EstSweepCat"),
                       summarise, N=length(EstSweepCat))
#######################
# 4.1.4 Haul Duration #
#######################
# Check range >66 and <13 not within bounds
summary(hauls$HaulDur)
######################
# 4.1.5 Ground Speed #
######################
### Draw some plots to look at data
#make a plot to look at the current groundspeed*time against distance 
# if perfect we expect an intercept of 0 and a slope of 1
png(file = "./Diagnostics/Diagnostic_plots/distance_speed_time_differences_29-09-2016.png", bg = "transparent")
par(xpd=FALSE)
plot(hauls$GroundSpeed*1852/60*hauls$HaulDur, hauls$Distance, 
     pch=19, col="black", cex=0.5, xlab="Speed X Time", ylab="Distance")
abline(a=0, b=1, col="lightgrey", lwd=2)
dev.off()
png(file="groundspeed_diagnostics.png", bg="transparent")
plot(hauls$HaulDur, hauls$GroundSpeed, pch=19, xlab="Time", ylab="Speed (knots)")
abline(h=6, col="red")
abline(h=2, col="red")
dev.off()
# Change the confidence interval fill color
png(file = "./Diagnostics/Diagnostic_plots/distance_speed_time_differences_with_CI_29-09-2016.png", bg = "transparent")
p1<-ggplot(hauls, aes(x=hauls$GroundSpeed*1852/60*hauls$HaulDur,
                      y=hauls$Distance)) + 
  geom_point(shape=18, color="black")+
  geom_smooth(method=lm,  linetype="dashed",
              color="lightgrey", fill="darkgrey", se=TRUE, fullrange=FALSE, level=.95)
p1 + scale_color_grey()+theme_classic()
dev.off()
# check ground speed
png(file="./Diagnostics/Diagnostic_plots/groundspeed_boxplot_29-09-2016.png", bg="transparent")
plot(hauls$Survey, hauls$GroundSpeed, pch=19, xlab="Survey", 
     ylab="GroundSpeed (knots)", col="lightgrey")
dev.off()
# outliers in Groundspeed found
check<-hauls[hauls$GroundSpeed<3|hauls$GroundSpeed>5, ]
png(file="./Diagnostics/Diagnostic_plots/groundspeed_distance_comparision_29-09-2016.png", bg="transparent")
plot(check$GroundSpeed*check$HaulDur*1852/60, check$Distance, pch=19, col="black")
dev.off()
# In Ns-IBTS 1995 it seems some of the french ground speeds are in the Speed Water Column
# and the Speed water is in the Ground Speed Col.
hauls$GroundSpeed[hauls$UniqueID=="NS-IBTS/1995/1/THA/2/GOV"]<-4
hauls$SpeedWater[hauls$UniqueID=="NS-IBTS/1995/1/THA/2/GOV"]<-2
hauls$GroundSpeed[hauls$UniqueID=="NS-IBTS/1995/1/THA/3/GOV"]<-4
hauls$SpeedWater[hauls$UniqueID=="NS-IBTS/1995/1/THA/3/GOV"]<-2
hauls$GroundSpeed[hauls$UniqueID=="NS-IBTS/1995/1/THA/6/GOV"]<-4
hauls$GroundSpeed[hauls$UniqueID=="NS-IBTS/1995/1/THA/9/GOV"]<-3.8
hauls$SpeedWater[hauls$UniqueID=="NS-IBTS/1995/1/THA/9/GOV"]<-1.9
hauls$GroundSpeed[hauls$UniqueID=="NS-IBTS/1995/1/THA/11/GOV"]<-3.9
hauls$SpeedWater[hauls$UniqueID=="NS-IBTS/1995/1/THA/11/GOV"]<-1.9

# in three cases the ground speed is recorded as zero - change to NA
list<-c('SP_ARSA/1995/1/CDS/23/BAK', 'SP_ARSA/1995/1/CDS/25/BAK',
        'SP_ARSA/1995/1/CDS/30/BAK')

hauls$GroundSpeed[hauls$UniqueID%in%list]<-NA
# really fast ground speed check outlat long distance matches but in others it was way out
#check<-hauls[hauls$GroundSpeed<3|hauls$GroundSpeed>5 


# GROUNDSPEED NOT RECORDED BUT DISTANCE AND DURATION RECORDED
hauls$Realised_groundspeed<-hauls$Distance/hauls$HaulDur/1852*60
summary(hauls$Estimated_groundspeed)
png(file="./Diagnostics/Diagnostic_plots/groundspeed_predictedVdistance_divided_by_time_29-09-2016.png", bg="transparent")
plot(hauls$Realised_groundspeed, hauls$GroundSpeed, 
     pch=19, col="lightgrey",
     xlim=c(1,6), ylim=c(1,6))
abline(0,1, col="black")
dev.off()
png(file = "./Diagnostics/Diagnostic_plots/RecordedVRealisedGS_29-09-2016.png", bg = "transparent")
plot(hauls$GroundSpeed, hauls$Distance/hauls$HaulDur/1852*60,
     pch=19, col="black", cex=1, xlab="Recorded Groundspeed (knots)", 
     ylab="Realised Groundspeed (knots)", xlim=c(0,9), ylim=c(0,9))
abline(a=0, b=1, col="lightgrey", lwd=2)
draw.circle(4,7.5, 1.2, border="red", lwd=2)
abline(h=6, col="lightgrey", lwd=1, lty=2)
abline(v=6, col="lightgrey", lwd=1, lty=2)
abline(h=2, col="lightgrey", lwd=1, lty=2)
abline(h=1, col="lightgrey", lwd=1, lty=2)
draw.circle(8.4, 8.4, 0.1, border="red", lty=4, lwd=2)
dev.off()
# no surprises here!
# using model with r squared of ~0.74 and good AIC score, which makes sense in 
# mechanical terms
gs_model1<-lm(GroundSpeed~Quarter:Ship:Gear, data=hauls)
# pretty decent model here - problem is it won't work for all ships as I have some 
# ships with no data at all on Ground speed to inform model
summary(hauls$GroundSpeed)
needSpeed <- hauls[is.na(hauls$GroundSpeed),]
needSpeedShip <- unique(needSpeed$Ship)
withSpeedShip <- unique(hauls$Ship[!is.na(hauls$GroundSpeed)])
needSpeedGear<-unique(needSpeed$Gear)
# ID ships that have some missing GroundSpeed records
canSpeedShip <- needSpeedShip[needSpeedShip %in% withSpeedShip]
canSpeed <- hauls[hauls$Ship %in% canSpeedShip, ]
# Split the data into the two types: only a few missing and completely missing
canSpeedNA <- canSpeed[is.na(canSpeed$GroundSpeed) & is.na(canSpeed$Distance),]
canSpeedOK <- canSpeed[!is.na(canSpeed$GroundSpeed),]

gs_model2<-lm(GroundSpeed~Quarter:Gear, data=hauls)

AIC(gs_model1, gs_model2)
anova(gs_model1,gs_model2)
#predictedGS<-predict(gs_model2, hauls, allow.new.levels=F)
# allmissing observations
canSpeed<-subset(canSpeed, !canSpeed$Gear=="ROT",)
canSpeed$predicted_groundspeed_gs1<-predict(gs_model1, canSpeed , allow.new.levels=T)
summary(canSpeed$predicted_groundspeed_gs1)
plot(canSpeed$predicted_groundspeed_gs1,canSpeed$GroundSpeed, pch=19, col='grey' )
gs5_dat<-subset(hauls, !hauls$Gear=="ROT",)
gs5_dat$predicted_groundspeed_gs2<-predict(gs_model2, gs5_dat , allow.new.levels=T)
summary(gs5_dat$predicted_groundspeed_gs2)
summary(canSpeed$predicted_groundspeed_gs1)
# attach predictions to the hauls dataset
list<-canSpeed$UniqueID

hauls$predicted_groundspeed_gs1[hauls$UniqueID%in%list]<-canSpeed$predicted_groundspeed_gs1[hauls$UniqueID%in%list]
list<-gs5_dat$UniqueID
hauls$predicted_groundspeed_gs2[hauls$UniqueID%in%list]<-gs5_dat$predicted_groundspeed_gs2[hauls$UniqueID%in%list]

# If real data available use that
hauls$GroundSpeed_Used<-hauls$GroundSpeed
hauls$GroundSpeed_Used[hauls$GroundSpeed_Used<2]<-NA
hauls$GroundSpeed_Used[hauls$GroundSpeed_Used>6]<-NA
hauls$GroundSpeed_Used[hauls$UniqueID=="IE-IGFS/2015/4/CEXP/100/GOV"]<-NA
hauls$GroundSpeed_Quality_Code[!is.na(hauls$GroundSpeed_Used)]<-"Recorded_Groundspeed"
# If gear is ROT then no data ever available
hauls$GroundSpeed_Quality_Code[hauls$Gear=="ROT"]<-"Manual_Speed"
hauls$GroundSpeed_Used[hauls$Gear=="ROT"]<-4
# If ground speed available for Ship and Gear - use that
hauls$GroundSpeed_Quality_Code[is.na(hauls$GroundSpeed_Used)]<-"model_1"
hauls$GroundSpeed_Used[is.na(hauls$GroundSpeed_Used)]<-hauls$predicted_groundspeed_gs1[is.na(hauls$GroundSpeed_Used)]
# if ground speed available for Gear
summary(hauls$predicted_groundspeed_gs2)
hauls$GroundSpeed_Quality_Code[is.na(hauls$GroundSpeed_Used)]<-"model_2"
hauls$GroundSpeed_Used[is.na(hauls$GroundSpeed_Used)]<-hauls$predicted_groundspeed_gs2[is.na(hauls$GroundSpeed_Used)]
# If no model fits use Manual speed
hauls$GroundSpeed_Quality_Code[is.na(hauls$GroundSpeed_Used)]<-"Manual_Speed"
hauls$GroundSpeed_Used[is.na(hauls$GroundSpeed_Used)]<-4

summary(hauls$GroundSpeed_Used)
summary(as.factor(hauls$GroundSpeed_Quality_Code))

hauls[, GroundSpeed_meters_per_min := GroundSpeed_Used * 1852 / 60]
summary(hauls$GroundSpeed_meters_per_min)
summary(hauls$GroundSpeed_Used[hauls$Country=="SPA"])
# Spain within expected bounds
summary(hauls$GroundSpeed_Used[hauls$Country=="POR"])
# Portugal within expected bounds
summary(hauls$GroundSpeed_Used[hauls$Gear=="BT7"])
summary(hauls$GroundSpeed_Used[hauls$Gear=="BT8"])
summary(hauls$GroundSpeed_Used[hauls$Gear=="BT4A"])
summary(hauls$GroundSpeed_Used[hauls$Gear=="GOV"])
summary(hauls$GroundSpeed_Used[hauls$Gear=="ROT"])
#get equation for gs1
formula(gs_model1)
summary(gs_model1)
coeffs=(coefficients(gs_model1))
formula(gs_model2)
coeffs=(coefficients(gs_model2))
# draw a graph showing predicted v actual ground speeds.
png(file="groundspeed_predictedVactual_29-09-2016.png", bg="transparent")
predict<-predict(gs_model1)
gs<-subset(hauls, !is.na(hauls$GroundSpeed),)
plot(predict, gs$GroundSpeed, xlab="Predicted GroundSpeed (knots)",
     ylab="Recorded Groundspeed (knots)", pch=19)
predict1<-predict(gs_model2)
points(predict1, gs$GroundSpeed, pch=19, col="lightgrey")
abline(0,1, col="red")
dev.off()
########################
# 4.1.6 Towed Distance #
########################
## Calculate haversine distance with shoot and haul coordinates ##
hauls[, LatLongDistance := earth.dist(long1 = ShootLong,
                                      lat1 = ShootLat,
                                      long2 = HaulLong,
                                      lat2 = HaulLat) * 1000]

## RAW DISTANCE ##
hauls[!is.na(Distance), c("newDist", "qualityDistance") :=
        list(Distance, "rawDistance")]

## HAVERSINE DISTANCE ##
# if haul and shoot coordinates are the same (i.e., equal to zero) then leave as NA
# Also ingore hauls with funny implied speed.
hauls[is.na(Distance) & !is.na(LatLongDistance) & LatLongDistance > 1 &
        LatLongDistance/HaulDur>61.73 & LatLongDistance/HaulDur<185.2 ,
      c("newDist", "qualityDistance") :=
        list(LatLongDistance, "LatLongDistance")]

## DURATION X SPEED ##
# HaulDur x GroundSpeed
hauls[, SpeedTimeDist := hauls$GroundSpeed_meters_per_min*HaulDur]

hauls[ !is.na(SpeedTimeDist) &is.na(newDist)&
         SpeedTimeDist/HaulDur>61.73 & SpeedTimeDist/HaulDur<185.2,
       c("newDist", "qualityDistance") :=
         list( SpeedTimeDist, "SpeedHaulDur")]

# Hauls that don't have raw distance, shoot/haul coordinates, or GroundSpeed can be estimated
# using linear models to predict GroundSpeed. Two different types of missing data are found:
# 1) ships that have no GroundSpeed records and we need to estimate from other ships, and
# 2) ships that have only a few missing GroundSpeed records and we can use ship as a factor in the lm
# Model for estimating Ground Speed has been reviewed based on feedback 
# from the IBTS working Group (V.T)
summary(as.numeric(hauls$newDist))

# check new distances relationships
png(file="distance_speed_time_differences.png", bg = "transparent")
par(xpd=FALSE)
plot(hauls$SpeedTimeDist, hauls$newDist, 
     pch=19, col="black", cex=0.5, xlab="Speed X Time", ylab="Distance")
abline(a=0, b=1, col="lightgrey", lwd=2)
dev.off()
# Check distances within bounds
png(file="distance_speed_time_differences_bounds.png", bg = "transparent")
par(xpd=FALSE)
plot(hauls$SpeedTimeDist, hauls$newDist, 
     pch=19, col="black", cex=0.5, xlab="Speed X Time", ylab="Distance")
abline(a=0, b=1, col="lightgrey", lwd=2)
abline(a=0, b=.5, col="red", lwd=2, lty=2)
abline(a=0, b=1.5, col="red", lwd=2, lty=2)
dev.off()
############################

plot(hauls$HaulDur, hauls$newDist, pch=19, cex=0.5, xlab="Time", ylab="Distance")
# perfect speed - 4 knots, bounds 2- 6 knots
x<-c(13:66)
points(x, x*4*1852/60, type="l", col="red", lwd=3)
points(x, x*2*1852/60, type="l", col="red", lty=2, lwd=2)
points(x, x*6*1852/60, type="l", col="red", lty=2, lwd=2)
# Set up extra check col
hauls$manual_speed_distance<-hauls$HaulDur*4*1852/60
# distance checks required as some of them are really odd.
hauls$LatLongDistance[hauls$LatLongDistance==0]<-NA
outside_bounds<-subset(hauls, hauls$newDist/hauls$manual_speed_distance>1.50|
                         hauls$newDist/hauls$manual_speed_distance<.5, )

points(outside_bounds$HaulDur,outside_bounds$newDist, col="red", pch=19)
# only 127 to check :)
# if haversine is available use that
list<-outside_bounds$UniqueID
hauls$newDist[hauls$UniqueID%in%list]<-hauls$LatLongDistance[hauls$UniqueID%in%list]
hauls$qualityDistance[hauls$UniqueID%in%list]<-"LatLongDistance"
# recheck 
outside_bounds<-subset(hauls, hauls$newDist/hauls$manual_speed_distance>1.50|
                         hauls$newDist/hauls$manual_speed_distance<.5, )

points(outside_bounds$HaulDur,outside_bounds$newDist, col="red", pch=19)
# still not gone, use speed*time
list<-outside_bounds$UniqueID
hauls$newDist[hauls$UniqueID%in%list]<-hauls$SpeedTimeDist[hauls$UniqueID%in%list]
hauls$qualityDistance[hauls$UniqueID%in%list]<-"SpeedHaulDur"
# recheck # all within bounds now
# next refine estimates
# is distance used within 20% of lat long distance 
check__dist_haversine_match<-subset(hauls, hauls$newDist/hauls$LatLongDistance>1.2|
                                      hauls$newDist/hauls$LatLongDistance<.8,)
points(check_haversine_match$HaulDur,check_haversine_match$newDist, col="red", pch=19)
# so 1433 points more than 20% away - all rest are fine
check_dist_speed_match<-subset(check__dist_haversine_match, 
                               check__dist_haversine_match$newDist/check__dist_haversine_match$SpeedTimeDist>1.2|
                                 check__dist_haversine_match$newDist/check__dist_haversine_match$SpeedTimeDist<.8,)
# 121 not okay with speed
points(check_speed_match$HaulDur,check_speed_match$newDist, col="yellow", pch=19)

check_lat_speed_match<-subset(check_dist_speed_match, 
                              check_dist_speed_match$LatLongDistance/check_dist_speed_match$SpeedTimeDist<1.2&
                                check_dist_speed_match$LatLongDistance/check_dist_speed_match$SpeedTimeDist>.8,)
# so 45 of the haversine and speed X time are within 20% - use the lat long rather than the 
# recorded distance
list<-check_lat_speed_match$UniqueID
hauls$newDist[hauls$UniqueID%in%list]<-hauls$LatLongDistance[hauls$UniqueID%in%list]
hauls$qualityDistance[hauls$UniqueID%in%list]<-"LatLongDistance"
# So whats left
check_no_match<-subset(check_dist_speed_match, 
                       check_dist_speed_match$LatLongDistance/check_dist_speed_match$SpeedTimeDist>1.2|
                         check_dist_speed_match$LatLongDistance/check_dist_speed_match$SpeedTimeDist<.8,)

points(check_no_match$HaulDur,check_no_match$newDist, col="green", pch=19)
# Does value lie within +/- 25% of Man Speed
check_within_bounds<-subset(check_no_match, 
                            check_no_match$newDist/check_no_match$manual_speed_distance>1.25|
                              check_no_match$newDist/check_no_match$manual_speed_distance<.75, )
# check all distances available?
summary(hauls$newDist)
# problem with some lat long distances not available
# Use manual speed 
hauls$qualityDistance[is.na(hauls$newDist)]<-"SpeedHaulDur"
hauls$newDist[is.na(hauls$newDist)]<-hauls$SpeedTimeDist[is.na(hauls$newDist)]
# all vaules within accepted bounds.
png(file="distances_cleaned_29-09-2016.png", bg = "transparent")
plot(hauls$HaulDur, hauls$newDist, pch=19, cex=0.5, xlab="Time", ylab="Distance")
# perfect speed - 4 knots, bounds 2- 6 knots
x<-c(13:66)
points(x, x*4*1852/60, type="l", col="red", lwd=3)
points(x, x*2*1852/60, type="l", col="red", lty=2, lwd=2)
points(x, x*6*1852/60, type="l", col="red", lty=2, lwd=2)
dev.off()
# all distances in newDist are withing acceptable ranges the best estimate is applied in
# each situation
write.csv(hauls, "Working_hauls_file_29_09_2016.csv")
# remove all the intermediate files 
summary(as.factor(hauls$qualityDistance))
#############################################
# 4.1.7 Otter Trawl Net Geometry Parameters #
#############################################
#######################
# 4.1.7.1 Wing Spread #
#######################
###########################################
# 4.1.7.1.1 The GOV Otter Trawl (Model 1) #
###########################################
# remove outlier from irish data
summary(hauls$WingSpread[hauls$Country=="IRL"])
# 38 too big
hauls$WingSpread[hauls$Country=="IRL"&hauls$WingSpread==38]<-NA
#Centered data for model
str(hauls)
# Add Sweep Cats - See Est Sweep Cats above.
hauls[!is.na(Year), "Yearfac":=
        as.factor(Year)]
hauls[!is.na(Quarter), "Qfac":=
        as.factor(Quarter)]
hauls[!is.na(Ship), "Shipfac":=
        as.factor(Ship)]
summary(as.factor(hauls$EstSweepCat[hauls$Gear=="GOV"]))
options(show.signif.stars = T)
library(lmerTest)
#Center data to give model stability 
hauls[!is.na(DepthNew), c("meanDepth") :=
        mean(DepthNew)]
hauls[!is.na(WingSpread), c("meanWingSpread") :=
        mean(WingSpread)]
hauls[!is.na(Netopening), c("meanNetopening") :=
        mean(Netopening)]
hauls[!is.na(DoorSpread), c("meanDoorSpread"):=
        mean(DoorSpread)]
hauls[!is.na(Depth), c("DepthCenter") :=
        Depth-meanDepth]
hauls[!is.na(DepthNew), c("LogDepthCenter") :=
        log(DepthNew)-log(meanDepth)]
hauls[!is.na(WingSpread), c("WingSpreadCenter") :=
        WingSpread-meanWingSpread]
hauls[!is.na(DoorSpread), c("DoorSpreadCenter") :=
        DoorSpread-meanDoorSpread]
hauls[!is.na(WingSpread), c("LogWingSpreadCenter") :=
        log(WingSpread)-log(meanWingSpread)]
hauls[!is.na(DoorSpread), c("LogDoorSpreadCenter") :=
        log(DoorSpread)-log(meanDoorSpread)]
hauls[!is.na(Netopening), c("NetopeningCenter") :=
        Netopening-meanNetopening]
hauls[!is.na(Netopening), c("LogNetopeningCenter") :=
        log(Netopening)-log(meanNetopening)]
hauls$SweepCat<-as.factor(hauls$SweepLngt)
summary(as.factor(hauls$Ship))

train<-subset(hauls, Gear=="GOV"& (!is.na(WingSpread)) & (!is.na(DoorSpread))
              &(!is.na(Netopening)),)

ws_model1<- lmer(log(WingSpread) ~ LogDepthCenter:EstSweepCat
           + (1|Ship:EstSweepCat),
           data=train, REML=FALSE) 

r.squaredGLMM(ws_model1)
summary(train$Ship)
cols<-rainbow(6)

data<-predict(ws_model1)
summary(as.factor(train$Survey))
summary(train$Survey)
cols<-rainbow(13)
png(file="GOV_Wingspreads_model1_29-09-2016.png", bg="transparent")
plot(train$DepthNew, train$WingSpread, col=cols[as.factor(train$Survey)], 
     pch=15, xlab="Depth (m)", ylab="WingSpread(m)")
points(train$DepthNew, exp(data), pch=21, col=cols[as.factor(train$Survey)],
       bg="black")
lines(lowess(train$WingSpread~train$DepthNew), col="black", lwd=2)
lines(lowess(exp(data)~train$DepthNew), col="red", lwd=3, lty=2)
legend(300, 40, levels(as.factor(hauls$Survey[hauls$Gear=="GOV"])), 
       col=cols, pch=15, ncol=3, cex=1, bty="o")
dev.off()
# set up user data using selected model
# subset all GOV gear
hauls$EstSweepCat<-as.factor(hauls$EstSweepCat)
hauls$Ship<-as.factor(hauls$Ship)
the_gov<-subset(hauls, Gear=="GOV")
summary(the_gov$WingSpread)
summary(the_gov$LogDoorSpreadCenter)
str(the_gov)
# 30442 obs
the_gov$mod1_wingspread_gov<-exp(predict(ws_model1, the_gov, allow.new.levels=T))
summary(the_gov$mod1_wingspread_gov)
# If real values are available use these
hauls$Use_WingSpread[!is.na(hauls$WingSpread)&
                       hauls$Gear=="GOV"]<-hauls$WingSpread[!is.na(hauls$WingSpread)&
                                                              hauls$Gear=="GOV"]
hauls$QualityWing[!is.na(hauls$WingSpread)]<-"raw_wingspread"

list<-the_gov$UniqueID
hauls$mod1_wingspread_gov[hauls$UniqueID%in%list]<-the_gov$mod1_wingspread_gov[the_gov$UniqueID%in%list]

hauls$Use_WingSpread[is.na(hauls$WingSpread)&
                       hauls$Gear=="GOV"]<-hauls$mod1_wingspread_gov[is.na(hauls$WingSpread)&
                                                                       hauls$Gear=="GOV"]
hauls$QualityWing[is.na(hauls$WingSpread)&!is.na(hauls$mod1_wingspread_gov)&
                    hauls$Gear=="GOV"]<-"mod1_wing_gov"
#check wing speads 
summary((hauls$Use_WingSpread[hauls$Gear=="GOV"]))
summary(hauls$Use_WingSpread)
# get model outputs
formula(ws_model1)
summary(ws_model1)
coeffs=(coefficients(ws_model1))
#####################################
# 4.1.7.1.2 The ROT Trawl (Model 2) #
#####################################
summary(hauls$WingSpread[hauls$Gear=="ROT"])
# 2333 estimated values needed
# Matt sent me some trail wingspreads to help to model these data
# but data now available in DATRAS file 20 values available
# lets look at these data first
png(file = "wingspread_ROT_29-09-2016.png", bg = "transparent")
plot(hauls$DepthNew[hauls$Gear=="ROT"],hauls$WingSpread[hauls$Gear=="ROT"],
     pch=19, xlab="Depth (m)",
     ylab="Wingspread (m)" )
abline(lm(hauls$WingSpread[!is.na(hauls$WingSpread)&hauls$Gear=="ROT"]~
            hauls$DepthNew[!is.na(hauls$WingSpread)&hauls$Gear=="ROT"]),
       col="lightgrey")
dev.off()
# given how data poor the situation is this must be kept very simple
ws_dat<-subset(hauls, !is.na(hauls$WingSpread) & hauls$Gear=="ROT", )
# need doorspread first
# Great we can keep this really simple and have a really strong
# Adjusted R sqd of 0.952
# has the best AIC score (-153.47.6)
hauls$mod2_wingspread_rot[hauls$Gear=="ROT"]<-exp(0.3798356+0.6489731*log(hauls$Use_DoorSpread[hauls$Gear=="ROT"]))
# set up user values for doorspread
#RAW WINGSPREAD
hauls$Use_WingSpread[!is.na(hauls$WingSpread)&hauls$Gear=="ROT"]<-hauls$WingSpread[!is.na(hauls$WingSpread)&hauls$Gear=="ROT"]
hauls$QualityWing[!is.na(hauls$WingSpread)&hauls$Gear=="ROT"]<-"raw_wingspread"
hauls$Use_WingSpread[is.na(hauls$WingSpread)&hauls$Gear=="ROT"]<-hauls$mod2_wingspread_rot[is.na(hauls$WingSpread)&hauls$Gear=="ROT"]
hauls$QualityWing[is.na(hauls$WingSpread)&hauls$Gear=="ROT"]<-"model_wingspread_rot"

summary(hauls$mod2_wingspread_rot[hauls$Gear=="ROT"])
summary(hauls$Use_WingSpread[hauls$Gear=="ROT"])
#########################################
# 4.1.7.1.3 The BAK Trawl (Model 3 & 4) #
#########################################
# set up data set for model selection
spain<-hauls[!is.na(hauls$DoorSpread) & !is.na(hauls$WingSpread)
              & !is.na(hauls$Netopening)&hauls$Country=="SPA", ]
plot(spain$WingSpread, spain$DoorSpread, pch=19, col=cols[as.factor(spain$Gear)])
plot(spain$DepthNew, spain$WingSpread, pch=19,col=cols[as.factor(spain$Gear)])
# 80 observations in dataset.
door_dat<-hauls[!is.na(spain$DoorSpread),]
wing_dat<-spain[!is.na(spain$WingSpread),]
summary(wing_dat$DoorSpread)

model3<-lm(log(WingSpread) ~ LogDoorSpreadCenter:SweepCat,data=wing_dat)
summary(model3)
predict<-exp(predict(model3))
col1<-c( "darkred","darkgreen")
points(wing_dat$DepthNew, predict, col=col1[as.factor(wing_dat$Gear)], pch=19)
# first I need all the centered and logged data for hauls
str(hauls)
#Get Means
hauls$meanDepth<-mean(hauls$DepthNew)
mean(hauls$WingSpread[!is.na(hauls$WingSpread)])
hauls$meanWingSpread[!is.na(hauls$WingSpread)]<-13.82127
mean(hauls$Netopening[!is.na(hauls$Netopening)])
hauls$meanNetopening[!is.na(hauls$Netopening)]<-3.526697
mean(hauls$DoorSpread[!is.na(hauls$DoorSpread)])
hauls$meanDoorSpread[!is.na(hauls$DoorSpread)]<-59.74916
# Get Centered data
hauls$DepthCenter<-hauls$DepthNew-hauls$meanDepth
hauls$WingSpreadCenter<-hauls$WingSpread-hauls$meanWingSpread
hauls$DoorSpreadCenter<-hauls$DoorSpread-hauls$meanDoorSpread
hauls$LogDoorSpreadCenter<-hauls$Netopening-hauls$meanNetopening
# Log and Center the Data
hauls$LogDepthCenter<-log(hauls$DepthNew)-log(hauls$meanDepth)
hauls$LogWingSpreadCenter<-log(hauls$WingSpread)-log(hauls$meanWingSpread)
hauls$LogDoorSpreadCenter<-log(hauls$DoorSpread)-log(hauls$meanDoorSpread)
hauls$LogNetopeningCenter<-log(hauls$Netopening)-log(hauls$meanNetopening)
hauls$SweepCat<-as.factor(hauls$SweepLngt)
summary(as.factor(hauls$Ship))

# model3: model3<-lm(log(WingSpread) ~ LogDoorSpreadCenter:SweepCat,data=alldat)
wing_dat<-subset(hauls, Country=="SPA"& !is.na(DoorSpread),)
model3<-lm(log(WingSpread) ~ LogDoorSpreadCenter:SweepCat,
           data=wing_dat)
summary(model3)
wing_dat_len <- nrow(wing_dat)
wing_dat$predict_lm_wing_dat<-exp(predict(model3, wing_dat, allow.new.levels=T))
summary(wing_dat$predict_lm_wing_dat)
plot(wing_dat$DepthNew,wing_dat$predict_lm_wing_dat,
     pch=19, col=cols[as.factor(wing_dat$Gear)])
# model2: fm2<-lm(log(WingSpread) ~ LogDepthCenter:SweepCat:Gear,data=alldat)
wing_dat2<-subset(hauls, Country=="SPA",)
model4<-lm(log(WingSpread) ~ LogDepthCenter:SweepCat,data=wing_dat2)

summary(model4)
wing_dat_len <- nrow(wing_dat2)
wing_dat2$predict_lm_wing_dat<-exp(predict(model4, wing_dat2, allow.new.levels=T))
summary(wing_dat2$predict_lm_wing_dat)
plot(wing_dat2$DepthNew,wing_dat2$predict_lm_wing_dat,
     pch=19, col=cols[as.factor(wing_dat2$Gear)])
list<-wing_dat2$UniqueID
formula(model4)
summary(model4)
coeffs=(coefficients(model4))
# predict results for spains wing spread data
hauls$Use_WingSpread[!is.na(hauls$WingSpread)&hauls$Country=="SPA"]<-hauls$WingSpread[!is.na(hauls$WingSpread)&hauls$Country=="SPA"]
hauls$QualityWing[!is.na(hauls$WingSpread)&hauls$Country=="SPA"]<-"raw_wingspread"

list<-wing_dat$UniqueID
hauls$mod1_wingspread_spa[hauls$UniqueID%in%list]<-wing_dat$predict_lm_wing_dat[wing_dat$UniqueID%in%list]
list<-wing_dat2$UniqueID
hauls$mod2_wingspread_spa[hauls$Country=="SPA"]<-wing_dat2$predict_lm_wing_dat[wing_dat$UniqueID%in%list]

hauls$Use_WingSpread[is.na(hauls$WingSpread)&hauls$Country=="SPA"]<-hauls$mod1_wingspread_spa[is.na(hauls$WingSpread)&hauls$Country=="SPA"]
hauls$QualityWing[is.na(hauls$WingSpread)&!is.na(hauls$mod1_wingspread_spa)&
                    hauls$Country=="SPA"]<-"mod3_wingspread_spa"

hauls$Use_WingSpread[is.na(hauls$Use_WingSpread)&
                       hauls$Country=="SPA"]<-hauls$mod2_wingspread_spa[is.na(hauls$Use_WingSpread)&hauls$Country=="SPA"]
hauls$QualityWing[is.na(hauls$QualityWing)&
                    hauls$Country=="SPA"]<-"mod4_wingspread_spa"
summary(hauls$Use_WingSpread[hauls$Country=="SPA"])
summary(as.factor(hauls$QualityWing[hauls$Country=="SPA"]))
png(file="WingSpread_data_spain_col_29-09-2016.png", bg="transparent")
cols<- rainbow(3)
plot(hauls$DepthNew[hauls$Country=="SPA"], hauls$Use_WingSpread[hauls$Country=="SPA"],
     col=cols[as.factor(hauls$QualityWing[hauls$Country=="SPA"])], pch=19,
     xlab="Depth (m)", ylab="WingSpread (m)")

legend(5, 35, levels(as.factor(hauls$QualityWing[hauls$Country=="SPA"])), 
       col=cols, 
       pch=15, ncol=1, cex=1, bty="n")
dev.off()
##########################
# 4.1.7.1.4 The NCT Gear #
##########################
# WingSpread = 15.1
hauls$WingSpread[hauls$Gear=="NCT"]<-15.1
hauls$Use_WingSpread[hauls$Gear=="NCT"]<-15.1
hauls$QualityWing[hauls$Gear=="NCT"]<-"mean_wingspread"
#######################
# 4.1.8.1 Door Spread #
#######################
###########################################
# 4.1.8.1.1 The GOV Otter Trawl (Model 1) #
###########################################
# for model election set up training data set
train<-subset(hauls, Gear=="GOV"& (!is.na(WingSpread)) & (!is.na(DoorSpread))
              &(!is.na(Netopening)),)

model1_ds<- lmer(log(DoorSpread) ~ LogDepthCenter:EstSweepCat
           + (1|Ship:EstSweepCat), 
           data=train, REML=FALSE)
summary(model1_ds)
r.squaredGLMM(model1_ds)
cols<-rainbow(13)
data<-predict(model1_ds)
summary(as.factor(train$Survey))
png(file="GOV_Doorspreads_model1.png", bg="transparent")
plot(train$DepthNew, train$DoorSpread, col=cols[as.factor(train$Survey)], 
     pch=15, xlab="Depth (m)", ylab="WingSpread(m)")
points(train$DepthNew, exp(data), pch=21, col=cols[as.factor(train$Survey)],
       bg="black")
lines(lowess(train$DoorSpread~train$DepthNew), col="black", lwd=2)
lines(lowess(exp(data)~train$DepthNew), col="lightgrey", lwd=3, lty=2)
legend(530, 80, levels(as.factor(hauls$Survey[hauls$Gear=="GOV"])), 
       col=cols, pch=15, ncol=2, cex=.9, bty="o")
dev.off()
# note - less compex limear model can't explain as much variance as mixed model
# the random effects are accounting for quite a lot of variance
# set up user data using selected mode # subset all GOV gear
the_gov<-subset(hauls, Gear=="GOV")
# 30538 obs
the_gov$mod1_doorspread_gov<-exp(predict(model1_ds, the_gov, allow.new.levels=T))
summary(the_gov$mod1_doorspread_gov)
# If real values are available use these
# predict results for spains net opening data
hauls$Use_DoorSpread[!is.na(hauls$DoorSpread)&
                       hauls$Gear=="GOV"]<-hauls$DoorSpread[!is.na(hauls$DoorSpread)&
                                                              hauls$Gear=="GOV"]
hauls$QualityDoor[!is.na(hauls$DoorSpread)]<-"raw_doorspread"

list<-the_gov$UniqueID
hauls$mod1_doorspread_gov[hauls$UniqueID%in%list]<-the_gov$mod1_doorspread_gov[the_gov$UniqueID%in%list]

hauls$Use_DoorSpread[is.na(hauls$DoorSpread)&
                       hauls$Gear=="GOV"]<-hauls$mod1_doorspread_gov[is.na(hauls$DoorSpread)&
                                                                       hauls$Gear=="GOV"]
hauls$QualityDoor[is.na(hauls$DoorSpread)&!is.na(hauls$mod1_doorspread_gov)&
                    hauls$Gear=="GOV"]<-"mod1_door_gov"
#check door spreads
summary((hauls$Use_DoorSpread[hauls$Gear=="GOV"]))
summary(as.factor(hauls$QualityDoor[hauls$Gear=="GOV"]))
summary(hauls$Use_DoorSpread)
summary(model1_ds)
coeffs=coefficients(model1_ds);coeffs
#####################################
# 4.1.8.1.2 The ROT Trawl (Model 2) #
#####################################
summary(hauls$DoorSpread[hauls$Gear=="ROT"])
# Doorspread can be sorted first
# DoorSpread only has 955 missing values to be estimated
png(file = "doorspread_ROT_29-09-2016.png", bg = "transparent")
plot(hauls$Depth[hauls$Gear=="ROT"], hauls$DoorSpread[hauls$Gear=="ROT"], 
     pch=19, xlab="Depth (m)",
     ylab="Door Spread (m)")
dev.off()
x<-hauls$Depth[hauls$Gear=="ROT"]
y<-hauls$DoorSpread[hauls$Gear=="ROT"]

plot(y~x,type="n")
m = lm(y~x)
wx = par("usr")[1:2]
new.x = seq(wx[1],wx[2],len=100)
pred = predict(m, new=data.frame(x=new.x), interval="conf", level=.95)
lines(new.x,pred[,"fit"],lwd=2)
lines(new.x,pred[,"lwr"],lty=3)
lines(new.x,pred[,"upr"],lty=3)
points(x,y,pch=16,col="steelblue")
# raw data looks good - no worrying outliers
corrhaul_rot<-subset(hauls, Gear=="ROT",
                     select=c(Year, Depth, Distance,
                              DoorSpread))
summary(corrhaul_rot)
require(corrgram)
png(file = "corrhaul_ROT_29-09-2016.png", bg = "transparent")
corrgram(corrhaul_rot, order="PCA", lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Hauls Data NI") 
dev.off()
# the only variable available to estimate doorspread is depth for ROT ship
# no sweep, no changes in gear , no wing or netopening
model2<-lm(log(DoorSpread[hauls$Gear=="ROT"])~log(Depth[hauls$Gear=="ROT"]), data=hauls)
AIC(model2)
summary(model2)
coeff=coefficients(model2); coeff
png(file = "logged_doorspread_ROT_29-09-2016.png", bg = "transparent")
plot(log(hauls$DepthNew[hauls$Gear=="ROT"]), log(hauls$DoorSpread[hauls$Gear=="ROT"]), pch=19, xlab="logged Depth (m)",
     ylab="logged Doorspread (m)")
abline(a=(2.558093), b=(0.261106) , col="red", lwd=2)
dev.off()

hauls$mod_doorspread_rot[hauls$Gear=="ROT"]<-exp(coeff[1]+coeff[2]*log(hauls$DepthNew[hauls$Gear=="ROT"]))
# set up user values for doorspread
hauls$Use_DoorSpread[!is.na(hauls$DoorSpread)&
                       hauls$Gear=="ROT"]<-hauls$DoorSpread[!is.na(hauls$DoorSpread)&
                                                              hauls$Gear=="ROT"]
hauls$QualityDoor[!is.na(hauls$DoorSpread)&hauls$Gear=="ROT"]<-"raw_doorspread"
hauls$Use_DoorSpread[is.na(hauls$DoorSpread)&hauls$Gear=="ROT"]<-hauls$mod_doorspread_rot[is.na(hauls$DoorSpread)&hauls$Gear=="ROT"]
hauls$QualityDoor[is.na(hauls$DoorSpread)&hauls$Gear=="ROT"]<-"model_doorspread_rot"

summary(hauls$mod_doorspread_rot[hauls$Gear=="ROT"])
summary(hauls$Use_DoorSpread[hauls$Gear=="ROT"])
summary(hauls$Use_DoorSpread)
summary(as.factor(hauls$QualityDoor))
#########################################
# 4.1.8.1.3 The BAK Trawl (Model 3 & 4) #
#########################################
# 2 models selected - model 3 and model 4
# model3: dm3<-lm(log(DoorSpread) ~ LogWingSpreadCenter:SweepCat:Gear,data=spain)
door_dat<-subset(hauls, Country=="SPA"& !is.na(WingSpread),)
dm3<-lm(log(DoorSpread) ~ LogWingSpreadCenter:SweepCat,data=door_dat)
summary(dm3)
door_dat_len <- nrow(door_dat)
door_dat$predict_lm_door_dat<-exp(predict(dm3, door_dat, allow.new.levels=T))
plot(door_dat$DepthNew,(door_dat$predict_lm_door_dat),
     pch=19, col=cols[as.factor(door_dat$Gear)])
# model2: dm4<-lm(log(DoorSpread) ~ LogDepthCenter:SweepCat:Gear,data=spain)
door_dat2<-subset(hauls, Country=="SPA",)
dm4<-lm(log(DoorSpread) ~ LogDepthCenter:SweepCat, data=door_dat2)
summary(dm4)
door_dat_len <- nrow(door_dat2)
door_dat2$predict_lm_door_dat<-exp(predict(dm4, door_dat2, allow.new.levels=T))
summary(door_dat2$predict_lm_door_dat)
plot(door_dat2$DepthNew,door_dat2$predict_lm_door_dat,
     pch=19, col=cols[as.factor(door_dat2$Gear)])
formula(dm4)
summary(dm4)
coeffs=(coefficients(dm4))
coeffs
# predict results for spains door spread data
hauls$Use_DoorSpread<-hauls$DoorSpread
hauls$QualityDoor[!is.na(hauls$DoorSpread)]<-"raw_doorspread"

list<-door_dat2$UniqueID
hauls$mod2_doorspread_spa[hauls$Country=="SPA"]<-door_dat2$predict_lm_door_dat[door_dat$UniqueID%in%list]
list<-door_dat$UniqueID
hauls$mod1_doorspread_spa[hauls$UniqueID%in%list]<-door_dat$predict_lm_door_dat[door_dat$UniqueID%in%list]


hauls$Use_DoorSpread[is.na(hauls$DoorSpread)&
                       hauls$Country=="SPA"]<-hauls$mod1_doorspread_spa[is.na(hauls$DoorSpread)&
                                                                          hauls$Country=="SPA"]
hauls$QualityDoor[is.na(hauls$DoorSpread)&!is.na(hauls$mod1_doorspread_spa)]<-"mod1_doorpread_spa"

hauls$Use_DoorSpread[is.na(hauls$Use_DoorSpread)&
                       hauls$Country=="SPA"]<-hauls$mod2_doorspread_spa[is.na(hauls$Use_DoorSpread)&hauls$Country=="SPA"]
hauls$QualityDoor[is.na(hauls$DoorSpread)&is.na(hauls$mod1_doorspread_spa)&
                    hauls$Country=="SPA"]<-"mod2_doorpread_spa"

summary(hauls$Use_DoorSpread[hauls$Country=="SPA"])
summary(as.factor(hauls$QualityDoor[hauls$Country=="SPA"]))
png(file="DoorSpread_data_spain_col.png", bg="transparent")
plot(hauls$DepthNew[hauls$Country=="SPA"], hauls$Use_DoorSpread[hauls$Country=="SPA"],
     col=cols[as.factor(hauls$QualityDoor[hauls$Country=="SPA"])], pch=19,
     xlab="Depth (m)", ylab="DoorSpread (m)")
legend(400, 75, levels(as.factor(hauls$QualityDoor[hauls$Country=="SPA"])), 
       col=cols, 
       pch=15, ncol=1, cex=1, bty="n")
dev.off()
##########################
# 4.1.8.1.4 The NCT Gear #
##########################
# No sensors on this gear so mean is applied as supplied by ICES 2010
# DoorSpread 45.7m
hauls$DoorSpread[hauls$Gear=="NCT"]<- 45.7
# set up user values
hauls$Use_DoorSpread[hauls$Gear=="NCT"]<-45.7
hauls$QualityDoor[hauls$Gear=="NCT"]<-"mean_doorspread"
#######################
# 4.1.9.1 Net Opening #
#######################
###########################################
# 4.1.9.1.1 The GOV Otter Trawl (Model 1) #
###########################################
###################
# Net Opening GOV #
###################
plot(train$DepthNew,train$Netopening )
plot(train$WingSpread,train$Netopening )

cols<-rainbow(6)
data<-predict(lm5)
summary(as.factor(train$Survey))
png(file="GOV_Netopening_model1.png", bg="transparent")
plot(train$DepthNew, train$Netopening, col=cols[as.factor(train$Survey)], 
     pch=15, xlab="Depth (m)", ylab="Netopening(m)")
points(train$DepthNew, exp(data), pch=21, col=cols[as.factor(train$Survey)],
       bg="black")
lines(lowess(train$Netopening~train$DepthNew), col="black", lwd=2)
lines(lowess(exp(data)~train$DepthNew), col="lightgrey", lwd=3, lty=2)
legend(600, 9, levels(as.factor(hauls$Survey[hauls$Gear=="GOV"])), 
       col=cols, pch=15, ncol=1, cex=1, bty="n")
dev.off()
# note - less compex model can't explain as much variance as mixed model
# the random effects are accounting for quite a lot of variance
# set up user data using selected model
# subset all GOV gear
the_gov<-subset(hauls, Gear=="GOV")
# 30885 obs
nm1<- lmer(log(Netopening) ~ LogDepthCenter:EstSweepCat
           + (1|Ship:EstSweepCat), 
           data=train, REML=FALSE)
summary(nm1)
r.squaredGLMM(nm1)
formula(nm1)
coeffs=coefficients(nm1);coeffs
the_gov$mod1_net_gov<-exp(predict(nm1, the_gov, allow.new.levels=T))
summary(the_gov$mod1_net_gov)
# If real values are available use these
# predict results for spains net opening data
hauls$Use_Netopening[!is.na(hauls$Netopening)&
                       hauls$Gear=="GOV"]<-hauls$Netopening[!is.na(hauls$Netopening)&
                                                              hauls$Gear=="GOV"]
hauls$QualityNet[!is.na(hauls$Netopening)]<-"raw_netopening"

list<-the_gov$UniqueID

hauls$mod1_net_gov[hauls$UniqueID%in%list]<-the_gov$mod1_net_gov[the_gov$UniqueID%in%list]

hauls$Use_Netopening[is.na(hauls$Netopening)&
                       hauls$Gear=="GOV"]<-hauls$mod1_net_gov[is.na(hauls$Netopening)&
                                                                hauls$Gear=="GOV"]
hauls$QualityNet[is.na(hauls$Netopening)&!is.na(hauls$mod1_net_gov)&
                   hauls$Gear=="GOV"]<-"mod1_net_gov"

#check door spreads
summary((hauls$Use_Netopening[hauls$Gear=="GOV"]))
summary(as.factor(hauls$QualityNet[hauls$Gear=="GOV"]))

summary((hauls$Use_Netopening))
summary(as.factor(hauls$QualityNet))
summary((hauls$Use_DoorSpread))
summary(as.factor(hauls$QualityDoor))
summary((hauls$Use_WingSpread))
summary(as.factor(hauls$QualityWing))
#####################################
# 4.1.9.1.2 The ROT Trawl (Model 2) #
#####################################
# Not enough data to model this attribute - either use mean value or don't do 
# vol estimates with ROT gear - similar to Beam gears!
# Matt sent me the Standard Gear Specs- IBTS Report 2013
hauls$Use_Netopening[!is.na(hauls$Netopening)&
                       hauls$Gear=="ROT"]<-hauls$Netopening[!is.na(hauls$Netopening)&
                                                              hauls$Gear=="ROT"]
hauls$QualityNet[!is.na(hauls$Netopening)&
                   hauls$Gear=="ROT"]<-"raw_netopening"
hauls$Use_Netopening[is.na(hauls$Netopening)&hauls$Gear=="ROT"]<-3
hauls$QualityNet[is.na(hauls$Netopening)&hauls$Gear=="ROT"]<-"mean_netopening"
summary(hauls$Use_Netopening[hauls$Gear=="ROT"])
#########################################
# 4.1.9.1.3 The BAK Trawl (Model 3 & 4) #
#########################################
# door with no net 43 obs and wing 148 obs - less than 5% of the data needing estimated
# use model 3 it explains about 21 % of variation - poor in comparsion to Wing
# and Door but better than using a straight mean.
spain<-hauls[hauls$Country=="SPA", ]

net_model3<-lm(log(Netopening) ~ LogDepthCenter:SweepCat, data=spain)
summary(net_model3)
formula(net_model3)
coeffs=(coefficients(net_model3))

spain_len <- nrow(spain)
spain$predict_lm_net_dat<-exp(predict(net_model3, spain, allow.new.levels=T))
plot(spain$DepthNew,(spain$predict_lm_net_dat),
     pch=19, col=cols[as.factor(spain$Gear)])
# predict results for spains net opening data
hauls$Use_Netopening[!is.na(hauls$Netopening)&
                       hauls$Country=="SPA"]<-hauls$Netopening[!is.na(hauls$Netopening)&
                                                                 hauls$Country=="SPA"]
hauls$QualityNet[!is.na(hauls$Netopening)]<-"raw_netopening"

list<-spain$UniqueID
hauls$mod1_netopening_spa[hauls$UniqueID%in%list]<-spain$predict_lm_net_dat[spain$UniqueID%in%list]
summary(hauls$mod1_netopening_spa[hauls$Country=="SPA"])
hauls$Use_Netopening[is.na(hauls$Netopening)&
                       hauls$Country=="SPA"]<-hauls$mod1_netopening_spa[is.na(hauls$Netopening)&
                                                                          hauls$Country=="SPA"]
hauls$QualityNet[is.na(hauls$Netopening)&!is.na(hauls$mod1_netopening_spa)]<-"mod1_net_spa"

summary(hauls$Use_Netopening[hauls$Country=="SPA"])
summary(as.factor(hauls$QualityNet[hauls$Country=="SPA"]))
png(file="NetOpening_data_spain_col.png", bg="transparent")
plot(hauls$DepthNew[hauls$Country=="SPA"], hauls$Use_Netopening[hauls$Country=="SPA"],
     col=cols[as.factor(hauls$QualityNet[hauls$Country=="SPA"])], pch=19,
     xlab="Depth (m)", ylab="Net Opening (m)")
legend(0, 4, levels(as.factor(hauls$QualityNet[hauls$Country=="SPA"])), 
       col=cols, 
       pch=15, ncol=1, cex=1, bty="n")
dev.off()
summary(hauls$Use_Netopening[hauls$Country=="SPA"])
summary(hauls$Use_Netopening)
##########################
# 4.1.9.1.4 The NCT Gear #
##########################
# netopening 4.6m
hauls$Netopening[hauls$Gear=="NCT"]<-4.6
hauls$Use_Netopening[hauls$Gear=="NCT"]<-4.6
hauls$QualityNet[hauls$Gear=="NCT"]<-"mean_netopening"
summary(hauls$Use_Netopening[hauls$Gear=="NCT"])
#####################################################
# Beam Trawl DoorSpread, WingSpread, and Netopening #
#####################################################
# Beams have a set "wing spread" so set wing and door as the set width
# net opening will be set to NA
summary(as.factor(hauls$Gear))
# Netherlands
# DoorSpread 8m (BTS manual 2009)
hauls$DoorSpread[hauls$Gear=="BT8"]<- 8
# WingSpread = 8
hauls$WingSpread[hauls$Gear=="BT8"]<-8
# netopening NA
hauls$Netopening[hauls$Gear=="BT8"]<-.8
# Germany 
# DoorSpread 7.2m (BTS manual 2009)
hauls$DoorSpread[hauls$Gear=="BT7"]<-7.2
# WingSpread = 7.2
hauls$WingSpread[hauls$Gear=="BT7"]<-7.2
# netopening NA
hauls$Netopening[hauls$Gear=="BT7"]<-.6
# UK  (BTS manual 2009)
# DoorSpread 7.2m (BTS manual 2009)
hauls$DoorSpread[hauls$Gear=="BT4A"]<-4
# WingSpread = 7.2
hauls$WingSpread[hauls$Gear=="BT4A"]<-4
# netopening NA
hauls$Netopening[hauls$Gear=="BT4A"]<-.525
# setup user values 
hauls$Use_DoorSpread[hauls$Gear=="BT4A"]<-4
hauls$QualityDoor[hauls$Gear=="BT4A"]<-"raw_doorspread"
hauls$Use_WingSpread[hauls$Gear=="BT4A"]<-4
hauls$QualityWing[hauls$Gear=="BT4A"]<-"raw_wingspread"
hauls$Use_Netopening[hauls$Gear=="BT4A"]<-.525
hauls$QualityNet[hauls$Gear=="BT4A"]<-"raw_netopening"

hauls$Use_DoorSpread[hauls$Gear=="BT7"]<-7.2
hauls$QualityDoor[hauls$Gear=="BT7"]<-"raw_doorspread"
hauls$Use_WingSpread[hauls$Gear=="BT7"]<-7.2
hauls$QualityWing[hauls$Gear=="BT7"]<-"raw_wingspread"
hauls$Use_Netopening[hauls$Gear=="BT7"]<-.6
hauls$QualityNet[hauls$Gear=="BT7"]<-"raw_netopening"

hauls$Use_DoorSpread[hauls$Gear=="BT8"]<-8
hauls$QualityDoor[hauls$Gear=="BT8"]<-"raw_doorspread"
hauls$Use_WingSpread[hauls$Gear=="BT8"]<-8
hauls$QualityWing[hauls$Gear=="BT8"]<-"raw_wingspread"
hauls$Use_Netopening[hauls$Gear=="BT8"]<-.8
hauls$QualityNet[hauls$Gear=="BT8"]<-"raw_netopening"

summary(hauls$Use_WingSpread[hauls$Gear=="BT4A"])
########################################
# so at this point some of the results haven't transfered over - code reviewed
# and changes made
write.csv(hauls, "hauls_monster_file_29-09-2016.csv")
###########################################################
# 4.1.8 Calculation of the Area/Volume Swept by the Trawl #
###########################################################
summary(hauls$Use_WingSpread)
summary(hauls$newDist)

plot(hauls$DepthNew[hauls$Country=="IRL"], hauls$Use_WingSpread[hauls$Country=="IRL"])

hauls$SweptArea_wing_m_sqrd<-hauls$Use_WingSpread*hauls$newDist
hauls$SweptArea_wing_km_sqrd<-hauls$SweptArea_wing_m_sqrd/1000/1000
summary(hauls$SweptArea_wing_km_sqrd)
summary(hauls$Survey)
cols<-topo.colors(13)
plot(hauls$HaulDur,hauls$newDist, pch=19, col=cols[hauls$Survey])

plot(hauls$HaulDur,hauls$SweptArea_wing_km_sqrd, pch=19, col=cols[hauls$Survey])

16.05/68.12
16.05/0.23
hauls[, c("QualityWing_SweptArea") := list(paste0(qualityDistance, hauls$QualityWing)),]  
check_speed<-hauls$newDist/hauls$HaulDur

summary(check_speed)
185.20/1854*60

hauls[, c("Wing/Door(Ratio)"):= list(hauls$Use_WingSpread/hauls$Use_DoorSpread),]
# Save "raw" files - before all estimated data is added
write.csv(HH, "Raw_Combined_Data_29-09-2016.csv")
write.csv(HH_baka, "Raw_baka_files_29-09-2016.csv")
write.csv(HH_beams, "Raw_beams_files_29-09-2016.csv")
write.csv(HH_gov, "Raw_gov_files_29-09-2016.csv")
write.csv(HH_nct, "Raw_nct_files_29-09-2016.csv")
write.csv(HH_rot, "Raw_rot_files_29-09-2016.csv")
write.csv(HH_rot1, "Raw_rot1_files_29-09-2016.csv")
# remove from R environment
# Save monster HH Chron File
write.csv(hauls, "final_full_cleaned_hauls_29-09-2016.csv")

# take a list of haul IDs for use in the HL observation selection 
list<-unique(hauls$New_UniqueID)
list1<-unique(HL$New_UniqueID)
length(list)-length(list1)
remove<-subset(HL, !New_UniqueID%in%list,)
HL1<-subset(HL, New_UniqueID%in%list,)
list1<-unique(HL1$New_UniqueID)