Skip to content

Commit

Permalink
Add a link check mode (#50)
Browse files Browse the repository at this point in the history
* Add a linkcheck runmode.

* Don't use the expander if runmode is linkcheck.

* Progress on broken link checking.

* Implement linkcheck results.

* Annotate results with links.

* Better logging, run the pageworker.

* Fix import case.

* Fix tests.

* Swap import case.

* Improve test coverage.

* Even more tests.
  • Loading branch information
Matir committed Oct 3, 2018
1 parent e2d1425 commit be01969
Show file tree
Hide file tree
Showing 10 changed files with 434 additions and 17 deletions.
15 changes: 11 additions & 4 deletions main.go
Expand Up @@ -101,10 +101,15 @@ func main() {
case ss.RunModeDotProduct:
dpexpander := filter.NewDotProductExpander(words)
expander = dpexpander
case ss.RunModeLinkCheck:
// No expander needed
default:
panic("Unknown run mode!")
}
expander.SetAddCount(queue.GetAddCount())

if expander != nil {
expander.SetAddCount(queue.GetAddCount())
}

headerExpander := filter.NewHeaderExpander(settings.OptionalHeader.Header())
headerExpander.SetAddCount(queue.GetAddCount())
Expand All @@ -121,9 +126,11 @@ func main() {
// filter paths after expansion
logging.Debugf("Starting expansion and filtering...")
workChan := queue.GetWorkChan()
workChan = expander.Expand(workChan)
workChan = headerExpander.Expand(workChan)
workChan = extensionExpander.Expand(workChan)
if expander != nil {
workChan = expander.Expand(workChan)
workChan = headerExpander.Expand(workChan)
workChan = extensionExpander.Expand(workChan)
}
workChan = filter.RunFilter(workChan)

logging.Logf(logging.LogDebug, "Creating results manager...")
Expand Down
222 changes: 222 additions & 0 deletions results/linkcheck.go
@@ -0,0 +1,222 @@
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package results

import (
"encoding/csv"
"fmt"
"github.com/Matir/webborer/logging"
"html/template"
"io"
"os"
"sort"
)

// LinkCheckResultsManager is a results manager that reports broken links.
// Run gathers every result into resMap and then, through writerImpl, emits
// each link whose target was fetched with a status code of 400 or above.
type LinkCheckResultsManager struct {
// Embedded base manager. NOTE(review): start() and done(), which Run calls,
// are presumably provided by this type; confirm against its definition.
baseResultsManager
// Destination handed to the CSV or HTML writer that init constructs.
writer io.Writer
// Optional file handle; Run closes it after flushing when it is non-nil.
fp *os.File
// Output format: "csv", "html", or "text" (which init rewrites to "csv").
format string
// Results keyed by the string form of their URL; created by init and
// filled by Run.
resMap map[string]*Result
// Number of lookups made by linkIsBroken that found no entry in resMap.
missing int
// Format-specific output implementation selected by init.
writerImpl linkCheckWriter
// Passed to writerImpl.writeHeader when Run starts the report.
baseURL string
}

// init prepares the manager for use: it allocates a fresh result map and picks
// the output writer that matches rm.format. The "text" format is treated as an
// alias for "csv" and rm.format is rewritten accordingly. Any other
// unrecognised format yields an error (the result map is still allocated).
func (rm *LinkCheckResultsManager) init() error {
	rm.resMap = map[string]*Result{}

	// Plain text output is produced by the CSV writer.
	if rm.format == "text" {
		rm.format = "csv"
	}

	switch rm.format {
	case "html":
		rm.writerImpl = newLinkCheckHTMLWriter(rm.writer)
		return nil
	case "csv":
		rm.writerImpl = newLinkCheckCSVWriter(rm.writer)
		return nil
	}
	return fmt.Errorf("Invalid format: %s", rm.format)
}

// Run starts a background goroutine that drains resChan, then writes the
// broken link report once the channel has been closed.
//
// Every result is stored in rm.resMap under the string form of its URL. After
// the channel closes, each stored page is visited in sorted URL order and its
// links are visited in sorted order, so the report is reproducible between
// runs. A link is reported when linkIsBroken says its target was fetched with
// a broken status code. The first broken link of a page is preceded by a
// group entry naming that source page.
//
// When the goroutine finishes, the writer is flushed, rm.fp is closed if set,
// and rm.done is called.
func (rm *LinkCheckResultsManager) Run(resChan <-chan *Result) {
	rm.start()
	go func() {
		defer func() {
			rm.writerImpl.flush()
			if rm.fp != nil {
				rm.fp.Close()
			}
			rm.done()
		}()

		// Collect every result first: a link can only be judged once the
		// result for its target has arrived.
		var keys []string
		seen := make(map[string]struct{})
		for res := range resChan {
			key := res.URL.String()
			// A URL delivered more than once must appear in the report only
			// once; the most recent result wins.
			if _, dup := seen[key]; !dup {
				seen[key] = struct{}{}
				keys = append(keys, key)
			}
			rm.resMap[key] = res
		}
		sort.Strings(keys)
		rm.writerImpl.writeHeader(rm.baseURL)
		count := 0

		for _, resKey := range keys {
			res := rm.resMap[resKey]

			// Map iteration order is random; sort for deterministic output.
			links := make([]string, 0, len(res.Links))
			for link := range res.Links {
				links = append(links, link)
			}
			sort.Strings(links)

			groupWritten := false
			for _, link := range links {
				if !rm.linkIsBroken(link) {
					continue
				}
				if !groupWritten {
					groupWritten = true
					// The group heading names the page holding the links,
					// not the broken destination.
					rm.writerImpl.writeGroup(resKey)
				}
				rm.writerImpl.writeBrokenLink(resKey, link, LinkTypes[res.Links[link]])
				count++
			}
		}

		rm.writerImpl.writeFooter(count)
	}()
}

// codeIsBroken reports whether an HTTP status code marks a link as broken.
// Any code of 400 or above, i.e. every 4xx and 5xx status, counts as broken.
func codeIsBroken(code int) bool {
	const lowestBrokenStatus = 400
	return code >= lowestBrokenStatus
}

// linkIsBroken reports whether url has a recorded result with a broken status
// code. A url with no entry in rm.resMap is tallied in rm.missing and treated
// as not broken.
func (rm *LinkCheckResultsManager) linkIsBroken(url string) bool {
	target, found := rm.resMap[url]
	if found {
		return codeIsBroken(target.Code)
	}
	rm.missing++
	return false
}

// linkCheckWriter is the output back end of the link check report. One
// implementation exists per output format.
type linkCheckWriter interface {
	// writeHeader starts the report for the given base URL.
	writeHeader(baseURL string)
	// writeFooter ends the report, given the number of broken links found.
	writeFooter(count int)
	// writeGroup starts a group of broken links that share one source.
	writeGroup(src string)
	// writeBrokenLink records one broken link from src to dst of type ltype.
	writeBrokenLink(src, dst, ltype string)
	// flush pushes any buffered output to the underlying destination.
	flush()
}

// linkCheckCSVWriter emits the link check report as CSV: a title row followed
// by one row per broken link.
type linkCheckCSVWriter struct {
	csvWriter *csv.Writer
}

// newLinkCheckCSVWriter returns a CSV report writer that encodes to writer.
func newLinkCheckCSVWriter(writer io.Writer) *linkCheckCSVWriter {
	out := new(linkCheckCSVWriter)
	out.csvWriter = csv.NewWriter(writer)
	return out
}

// writeHeader emits the column title row. The base URL argument is ignored.
// The error returned by the CSV encoder is not checked.
func (w *linkCheckCSVWriter) writeHeader(_ string) {
	columns := []string{"Source URL", "Destination URL", "Type"}
	w.csvWriter.Write(columns)
}

// writeFooter does nothing: the CSV report has no footer.
func (w *linkCheckCSVWriter) writeFooter(count int) {}

// writeGroup does nothing: every CSV row already carries its source URL.
func (w *linkCheckCSVWriter) writeGroup(src string) {}

// writeBrokenLink emits one row holding the source, destination and link
// type. The error returned by the CSV encoder is not checked.
func (w *linkCheckCSVWriter) writeBrokenLink(src, dst, ltype string) {
	row := []string{src, dst, ltype}
	w.csvWriter.Write(row)
}

// flush pushes rows buffered by the CSV encoder to the underlying writer.
func (w *linkCheckCSVWriter) flush() {
	w.csvWriter.Flush()
}

// linkCheckHTMLTemplates holds every fragment of the HTML report. The
// templates are constants, so they are parsed once at package initialisation
// rather than on every call; this also guarantees a usable template whenever
// a fragment is rendered.
var linkCheckHTMLTemplates = template.Must(template.New("linkCheckHTMLWriter").Parse(
	`{{define "HEAD"}}<html><head><title>webborer: linkCheck for {{.BaseURL}}</title></head><body><h1>webborer: linkCheck for {{.BaseURL}}</h1><table>{{end}}` +
		`{{define "FOOTER"}}</table><p>Total Broken Links Found: <b>{{.Count}}</b></html>{{end}}` +
		`{{define "GROUP"}}<tr class='source'><td colspan='2'><a href='{{.Link}}'>{{.Link}}</a></td></tr>{{end}}` +
		`{{define "LINK"}}<tr class='broken'><td><a href='{{.Dest}}'>{{.Dest}}</a></td><td>{{.LType}}</td></tr>{{end}}`))

// linkCheckHTMLWriter emits the link check report as an HTML table.
type linkCheckHTMLWriter struct {
	writer io.Writer
}

// newLinkCheckHTMLWriter returns an HTML report writer that writes to writer.
func newLinkCheckHTMLWriter(writer io.Writer) *linkCheckHTMLWriter {
	return &linkCheckHTMLWriter{writer}
}

// render executes the named report fragment with data and writes the result
// to the underlying writer. A failure is logged as a warning instead of being
// returned, because the linkCheckWriter methods have no error results.
func (w *linkCheckHTMLWriter) render(name string, data interface{}) {
	if err := linkCheckHTMLTemplates.ExecuteTemplate(w.writer, name, data); err != nil {
		logging.Logf(logging.LogWarning, "Error writing template output: %s", err.Error())
	}
}

// writeHeader opens the document and the results table, titled with baseURL.
func (w *linkCheckHTMLWriter) writeHeader(baseURL string) {
	w.render("HEAD", struct {
		BaseURL string
	}{
		baseURL,
	})
}

// writeFooter closes the results table and prints the total number of broken
// links.
func (w *linkCheckHTMLWriter) writeFooter(count int) {
	w.render("FOOTER", struct {
		Count int
	}{
		count,
	})
}

// writeGroup writes a table row that links to src, heading the broken links
// that follow.
func (w *linkCheckHTMLWriter) writeGroup(src string) {
	w.render("GROUP", struct {
		Link string
	}{
		src,
	})
}

// writeBrokenLink writes a table row that links to dst and shows its link
// type. src is not printed here.
func (w *linkCheckHTMLWriter) writeBrokenLink(src, dst, ltype string) {
	w.render("LINK", struct {
		Dest  string
		LType string
	}{
		dst,
		ltype,
	})
}

// flush does nothing: each fragment is written to the underlying writer as
// soon as it is rendered.
func (w *linkCheckHTMLWriter) flush() {
}
98 changes: 98 additions & 0 deletions results/linkcheck_test.go
@@ -0,0 +1,98 @@
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package results

import (
"bytes"
"net/http"
"strings"
"testing"
)

// TestInitLinkCheck verifies that init rejects an unset format, selects the
// right writer for "html", "csv" and "text", allocates the result map, and
// rewrites the "text" format to "csv".
func TestInitLinkCheck(t *testing.T) {
	lcrm := &LinkCheckResultsManager{writer: &bytes.Buffer{}}

	// initAs sets the format and requires init to succeed.
	initAs := func(format string) {
		lcrm.format = format
		if err := lcrm.init(); err != nil {
			t.Error("Did not expect an error.")
		}
	}
	// expectCSV requires the currently selected writer to be the CSV one.
	expectCSV := func() {
		if _, isCSV := lcrm.writerImpl.(*linkCheckCSVWriter); !isCSV {
			t.Error("Expected a CSV writer.")
		}
	}

	// No format has been set yet, so init must fail.
	if lcrm.init() == nil {
		t.Error("Expected error for missing format.")
	}

	initAs("html")
	if _, isHTML := lcrm.writerImpl.(*linkCheckHTMLWriter); !isHTML {
		t.Error("Expected an HTML writer.")
	}
	if lcrm.resMap == nil {
		t.Error("Expected resMap to be initialized.")
	}

	initAs("csv")
	expectCSV()

	initAs("text")
	expectCSV()
	if lcrm.format != "csv" {
		t.Error("Expected text format to become csv.")
	}
}

// TestCodeIsBroken checks one healthy and one broken status code.
func TestCodeIsBroken(t *testing.T) {
	cases := []struct {
		code    int
		broken  bool
		failure string
	}{
		{http.StatusOK, false, "StatusOK is not broken."},
		{http.StatusNotFound, true, "StatusNotFound should be broken."},
	}
	for _, tc := range cases {
		if codeIsBroken(tc.code) != tc.broken {
			t.Error(tc.failure)
		}
	}
}

// exerciseLinkCheckWriter drives a writer through one complete report: a
// header, one group, one broken link with an empty link type, a footer, and a
// final flush.
func exerciseLinkCheckWriter(w linkCheckWriter) {
	const (
		base        = "http://localhost/"
		source      = "src"
		destination = "borked"
		linkType    = ""
		total       = 55
	)
	w.writeHeader(base)
	w.writeGroup(source)
	w.writeBrokenLink(source, destination, linkType)
	w.writeFooter(total)
	w.flush()
}

func TestCSVWriter(t *testing.T) {
buf := &bytes.Buffer{}
w := newLinkCheckCSVWriter(buf)
exerciseLinkCheckWriter(w)
out := buf.String()
if !strings.Contains(out, "src,borked,") {
t.Error("Expected src,borked,")
}
}

func TestHTMLWriter(t *testing.T) {
buf := &bytes.Buffer{}
w := newLinkCheckHTMLWriter(buf)
exerciseLinkCheckWriter(w)
out := buf.String()
if !strings.Contains(out, "<a href='src'>src</a>") {
t.Error("Expected link to src!")
}
if !strings.Contains(out, "<a href='borked'>borked</a>") {
t.Error("Expected link to borked!")
}
}

0 comments on commit be01969

Please sign in to comment.