Skip to content

Commit

Permalink
Improve code diff highlight, fix incorrect rendered diff result (go-g…
Browse files Browse the repository at this point in the history
…itea#19958)

Use Unicode placeholders to replace HTML tags and HTML entities first, then do diff, then recover the HTML tags and HTML entities. Now the code diff with highlight has stable behavior, and won't emit broken tags.
  • Loading branch information
wxiaoguang authored and Sysoev, Vladimir committed Aug 10, 2022
1 parent def2d1d commit 0222243
Show file tree
Hide file tree
Showing 5 changed files with 379 additions and 378 deletions.
8 changes: 5 additions & 3 deletions modules/highlight/highlight.go
Expand Up @@ -40,9 +40,11 @@ var (
// NewContext loads custom highlight map from local config
func NewContext() {
once.Do(func() {
keys := setting.Cfg.Section("highlight.mapping").Keys()
for i := range keys {
highlightMapping[keys[i].Name()] = keys[i].Value()
if setting.Cfg != nil {
keys := setting.Cfg.Section("highlight.mapping").Keys()
for i := range keys {
highlightMapping[keys[i].Name()] = keys[i].Value()
}
}

// The size 512 is simply a conservative rule of thumb
Expand Down
312 changes: 21 additions & 291 deletions services/gitdiff/gitdiff.go
Expand Up @@ -15,7 +15,6 @@ import (
"io"
"net/url"
"os"
"regexp"
"sort"
"strings"
"time"
Expand All @@ -40,7 +39,7 @@ import (
"golang.org/x/text/transform"
)

// DiffLineType represents the type of a DiffLine.
// DiffLineType represents the type of DiffLine.
type DiffLineType uint8

// DiffLineType possible values.
Expand All @@ -51,7 +50,7 @@ const (
DiffLineSection
)

// DiffFileType represents the type of a DiffFile.
// DiffFileType represents the type of DiffFile.
type DiffFileType uint8

// DiffFileType possible values.
Expand Down Expand Up @@ -100,12 +99,12 @@ type DiffLineSectionInfo struct {
// BlobExcerptChunkSize represent max lines of excerpt
const BlobExcerptChunkSize = 20

// GetType returns the type of a DiffLine.
// GetType returns the type of DiffLine.
func (d *DiffLine) GetType() int {
return int(d.Type)
}

// CanComment returns whether or not a line can get commented
// CanComment returns whether a line can get commented
func (d *DiffLine) CanComment() bool {
return len(d.Comments) == 0 && d.Type != DiffLineSection
}
Expand Down Expand Up @@ -191,287 +190,13 @@ var (
codeTagSuffix = []byte(`</span>`)
)

var (
unfinishedtagRegex = regexp.MustCompile(`<[^>]*$`)
trailingSpanRegex = regexp.MustCompile(`<span\s*[[:alpha:]="]*?[>]?$`)
entityRegex = regexp.MustCompile(`&[#]*?[0-9[:alpha:]]*$`)
)

// shouldWriteInline represents combinations where we manually write inline changes
func shouldWriteInline(diff diffmatchpatch.Diff, lineType DiffLineType) bool {
if true &&
diff.Type == diffmatchpatch.DiffEqual ||
diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd ||
diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel {
return true
}
return false
}

func fixupBrokenSpans(diffs []diffmatchpatch.Diff) []diffmatchpatch.Diff {
// Create a new array to store our fixed up blocks
fixedup := make([]diffmatchpatch.Diff, 0, len(diffs))

// semantically label some numbers
const insert, delete, equal = 0, 1, 2

// record the positions of the last type of each block in the fixedup blocks
last := []int{-1, -1, -1}
operation := []diffmatchpatch.Operation{diffmatchpatch.DiffInsert, diffmatchpatch.DiffDelete, diffmatchpatch.DiffEqual}

// create a writer for insert and deletes
toWrite := []strings.Builder{
{},
{},
}

// make some flags for insert and delete
unfinishedTag := []bool{false, false}
unfinishedEnt := []bool{false, false}

// store stores the provided text in the writer for the typ
store := func(text string, typ int) {
(&(toWrite[typ])).WriteString(text)
}

// hasStored returns true if there is stored content
hasStored := func(typ int) bool {
return (&toWrite[typ]).Len() > 0
}

// stored will return that content
stored := func(typ int) string {
return (&toWrite[typ]).String()
}

// empty will empty the stored content
empty := func(typ int) {
(&toWrite[typ]).Reset()
}

// pop will remove the stored content appending to a diff block for that typ
pop := func(typ int, fixedup []diffmatchpatch.Diff) []diffmatchpatch.Diff {
if hasStored(typ) {
if last[typ] > last[equal] {
fixedup[last[typ]].Text += stored(typ)
} else {
fixedup = append(fixedup, diffmatchpatch.Diff{
Type: operation[typ],
Text: stored(typ),
})
}
empty(typ)
}
return fixedup
}

// Now we walk the provided diffs and check the type of each block in turn
for _, diff := range diffs {

typ := delete // flag for handling insert or delete typs
switch diff.Type {
case diffmatchpatch.DiffEqual:
// First check if there is anything stored
if hasStored(insert) || hasStored(delete) {
// There are two reasons for storing content:
// 1. Unfinished Entity <- Could be more efficient here by not doing this if we're looking for a tag
if unfinishedEnt[insert] || unfinishedEnt[delete] {
// we look for a ';' to finish an entity
idx := strings.IndexRune(diff.Text, ';')
if idx >= 0 {
// if we find a ';' store the preceding content to both insert and delete
store(diff.Text[:idx+1], insert)
store(diff.Text[:idx+1], delete)

// and remove it from this block
diff.Text = diff.Text[idx+1:]

// reset the ent flags
unfinishedEnt[insert] = false
unfinishedEnt[delete] = false
} else {
// otherwise store it all on insert and delete
store(diff.Text, insert)
store(diff.Text, delete)
// and empty this block
diff.Text = ""
}
}
// 2. Unfinished Tag
if unfinishedTag[insert] || unfinishedTag[delete] {
// we look for a '>' to finish a tag
idx := strings.IndexRune(diff.Text, '>')
if idx >= 0 {
store(diff.Text[:idx+1], insert)
store(diff.Text[:idx+1], delete)
diff.Text = diff.Text[idx+1:]
unfinishedTag[insert] = false
unfinishedTag[delete] = false
} else {
store(diff.Text, insert)
store(diff.Text, delete)
diff.Text = ""
}
}

// If we've completed the required tag/entities
if !(unfinishedTag[insert] || unfinishedTag[delete] || unfinishedEnt[insert] || unfinishedEnt[delete]) {
// pop off the stack
fixedup = pop(insert, fixedup)
fixedup = pop(delete, fixedup)
}

// If that has left this diff block empty then shortcut
if len(diff.Text) == 0 {
continue
}
}

// check if this block ends in an unfinished tag?
idx := unfinishedtagRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedTag[insert] = true
unfinishedTag[delete] = true
} else {
// otherwise does it end in an unfinished entity?
idx = entityRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedEnt[insert] = true
unfinishedEnt[delete] = true
}
}

// If there is an unfinished component
if idx != nil {
// Store the fragment
store(diff.Text[idx[0]:], insert)
store(diff.Text[idx[0]:], delete)
// and remove it from this block
diff.Text = diff.Text[:idx[0]]
}

// If that hasn't left the block empty
if len(diff.Text) > 0 {
// store the position of the last equal block and store it in our diffs
last[equal] = len(fixedup)
fixedup = append(fixedup, diff)
}
continue
case diffmatchpatch.DiffInsert:
typ = insert
fallthrough
case diffmatchpatch.DiffDelete:
// First check if there is anything stored for this type
if hasStored(typ) {
// if there is prepend it to this block, empty the storage and reset our flags
diff.Text = stored(typ) + diff.Text
empty(typ)
unfinishedEnt[typ] = false
unfinishedTag[typ] = false
}

// check if this block ends in an unfinished tag
idx := unfinishedtagRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedTag[typ] = true
} else {
// otherwise does it end in an unfinished entity
idx = entityRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedEnt[typ] = true
}
}

// If there is an unfinished component
if idx != nil {
// Store the fragment
store(diff.Text[idx[0]:], typ)
// and remove it from this block
diff.Text = diff.Text[:idx[0]]
}

// If that hasn't left the block empty
if len(diff.Text) > 0 {
// if the last block of this type was after the last equal block
if last[typ] > last[equal] {
// store this blocks content on that block
fixedup[last[typ]].Text += diff.Text
} else {
// otherwise store the position of the last block of this type and store the block
last[typ] = len(fixedup)
fixedup = append(fixedup, diff)
}
}
continue
}
}

// pop off any remaining stored content
fixedup = pop(insert, fixedup)
fixedup = pop(delete, fixedup)

return fixedup
}

func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineType) DiffInline {
func diffToHTML(lineWrapperTags []string, diffs []diffmatchpatch.Diff, lineType DiffLineType) string {
buf := bytes.NewBuffer(nil)
match := ""

diffs = fixupBrokenSpans(diffs)

// restore the line wrapper tags <span class="line"> and <span class="cl">, if necessary
for _, tag := range lineWrapperTags {
buf.WriteString(tag)
}
for _, diff := range diffs {
if shouldWriteInline(diff, lineType) {
if len(match) > 0 {
diff.Text = match + diff.Text
match = ""
}
// Chroma HTML syntax highlighting is done before diffing individual lines in order to maintain consistency.
// Since inline changes might split in the middle of a chroma span tag or HTML entity, make we manually put it back together
// before writing so we don't try insert added/removed code spans in the middle of one of those
// and create broken HTML. This is done by moving incomplete HTML forward until it no longer matches our pattern of
// a line ending with an incomplete HTML entity or partial/opening <span>.

// EX:
// diffs[{Type: dmp.DiffDelete, Text: "language</span><span "},
// {Type: dmp.DiffEqual, Text: "c"},
// {Type: dmp.DiffDelete, Text: "lass="p">}]

// After first iteration
// diffs[{Type: dmp.DiffDelete, Text: "language</span>"}, //write out
// {Type: dmp.DiffEqual, Text: "<span c"},
// {Type: dmp.DiffDelete, Text: "lass="p">,</span>}]

// After second iteration
// {Type: dmp.DiffEqual, Text: ""}, // write out
// {Type: dmp.DiffDelete, Text: "<span class="p">,</span>}]

// Final
// {Type: dmp.DiffDelete, Text: "<span class="p">,</span>}]
// end up writing <span class="removed-code"><span class="p">,</span></span>
// Instead of <span class="removed-code">lass="p",</span></span>

m := trailingSpanRegex.FindStringSubmatchIndex(diff.Text)
if m != nil {
match = diff.Text[m[0]:m[1]]
diff.Text = strings.TrimSuffix(diff.Text, match)
}
m = entityRegex.FindStringSubmatchIndex(diff.Text)
if m != nil {
match = diff.Text[m[0]:m[1]]
diff.Text = strings.TrimSuffix(diff.Text, match)
}
// Print an existing closing span first before opening added/remove-code span so it doesn't unintentionally close it
if strings.HasPrefix(diff.Text, "</span>") {
buf.WriteString("</span>")
diff.Text = strings.TrimPrefix(diff.Text, "</span>")
}
// If we weren't able to fix it then this should avoid broken HTML by not inserting more spans below
// The previous/next diff section will contain the rest of the tag that is missing here
if strings.Count(diff.Text, "<") != strings.Count(diff.Text, ">") {
buf.WriteString(diff.Text)
continue
}
}
switch {
case diff.Type == diffmatchpatch.DiffEqual:
buf.WriteString(diff.Text)
Expand All @@ -485,7 +210,10 @@ func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineT
buf.Write(codeTagSuffix)
}
}
return DiffInlineWithUnicodeEscape(template.HTML(buf.String()))
for range lineWrapperTags {
buf.WriteString("</span>")
}
return buf.String()
}

// GetLine gets a specific line by type (add or del) and file line number
Expand Down Expand Up @@ -597,10 +325,12 @@ func (diffSection *DiffSection) GetComputedInlineDiffFor(diffLine *DiffLine) Dif
return DiffInlineWithHighlightCode(diffSection.FileName, language, diffLine.Content)
}

diffRecord := diffMatchPatch.DiffMain(highlight.Code(diffSection.FileName, language, diff1[1:]), highlight.Code(diffSection.FileName, language, diff2[1:]), true)
diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord)

return diffToHTML(diffSection.FileName, diffRecord, diffLine.Type)
hcd := newHighlightCodeDiff()
diffRecord := hcd.diffWithHighlight(diffSection.FileName, language, diff1[1:], diff2[1:])
// it seems that Gitea doesn't need the line wrapper of Chroma, so do not add them back
// if the line wrappers are still needed in the future, it can be added back by "diffToHTML(hcd.lineWrapperTags. ...)"
diffHTML := diffToHTML(nil, diffRecord, diffLine.Type)
return DiffInlineWithUnicodeEscape(template.HTML(diffHTML))
}

// DiffFile represents a file diff.
Expand Down Expand Up @@ -1289,7 +1019,7 @@ func readFileName(rd *strings.Reader) (string, bool) {
if char == '"' {
fmt.Fscanf(rd, "%q ", &name)
if len(name) == 0 {
log.Error("Reader has no file name: %v", rd)
log.Error("Reader has no file name: reader=%+v", rd)
return "", true
}

Expand All @@ -1311,7 +1041,7 @@ func readFileName(rd *strings.Reader) (string, bool) {
}
}
if len(name) < 2 {
log.Error("Unable to determine name from reader: %v", rd)
log.Error("Unable to determine name from reader: reader=%+v", rd)
return "", true
}
return name[2:], ambiguity
Expand Down

0 comments on commit 0222243

Please sign in to comment.