Skip to content

Commit

Permalink
Improved infrastructure for word mode support.
Browse files Browse the repository at this point in the history
  • Loading branch information
JanX2 committed May 2, 2011
1 parent bd44cad commit 2f0066b
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 37 deletions.
3 changes: 2 additions & 1 deletion DiffMatchPatch.h
Expand Up @@ -158,8 +158,9 @@ typedef enum {
- (NSMutableArray *)diff_computeFromOldString:(NSString *)text1 andNewString:(NSString *)text2 checkLines:(BOOL)checklines deadline:(NSTimeInterval)deadline;
- (NSMutableArray *)diff_lineModeFromOldString:(NSString *)text1 andNewString:(NSString *)text2 deadline:(NSTimeInterval)deadline;
// Line-mode encoder for a pair of texts. Presumably returns the two encoded
// texts plus the array of unique lines, like its word-mode sibling -- confirm
// against the implementation in DiffMatchPatch.m.
- (NSArray *)diff_linesToCharsForFirstString:(NSString *)text1 andSecondString:(NSString *)text2;
- (NSArray *)diff_linesToWordsForFirstString:(NSString *)text1 andSecondString:(NSString *)text1;
// Word-mode encoder for a pair of texts. The result array holds the encoded
// text1, the encoded text2, and the array of unique word strings (whose
// zeroth element is intentionally blank).
- (NSArray *)diff_wordsToCharsForFirstString:(NSString *)text1 andSecondString:(NSString *)text2;
- (NSString *)diff_linesToCharsMungeOfText:(NSString *)text lineArray:(NSMutableArray *)lineArray lineHash:(NSMutableDictionary *)lineHash;
- (NSString *)diff_wordsToCharsMungeOfText:(NSString *)text wordArray:(NSMutableArray *)wordArray wordHash:(NSMutableDictionary *)wordHash;
- (void)diff_chars:(NSArray *)diffs toLines:(NSMutableArray *)lineArray;
- (NSMutableArray *)diff_bisectOfOldString:(NSString *)text1 andNewString:(NSString *)text2 deadline:(NSTimeInterval)deadline;
- (NSMutableArray *)diff_bisectSplitOfOldString:(NSString *)text1 andNewString:(NSString *)text2 x:(NSUInteger)x y:(NSUInteger)y deadline:(NSTimeInterval)deadline;
Expand Down
47 changes: 32 additions & 15 deletions DiffMatchPatch.m
Expand Up @@ -643,6 +643,23 @@ - (NSString *)diff_linesToCharsMungeOfText:(NSString *)text
(CFMutableDictionaryRef)lineHash)) autorelease];
}

/**
 * Encode a text word by word. Every distinct word (or boundary between
 * words) is represented by a single Unicode character, and the text is
 * returned as the string of those characters.
 * @param text NSString to encode.
 * @param wordArray NSMutableArray of unique strings, handed to the CF helper.
 * @param wordHash Map of strings to indices, handed to the CF helper.
 * @return Encoded string (autoreleased).
 */
- (NSString *)diff_wordsToCharsMungeOfText:(NSString *)text
                                 wordArray:(NSMutableArray *)wordArray
                                  wordHash:(NSMutableDictionary *)wordHash;
{
  // The CF helper hands back an owned string ("Create" in its name), so the
  // reference is balanced here before it is returned to the caller.
  CFStringRef encodedText =
      diff_wordsToCharsMungeCFStringCreate((CFStringRef)text,
                                           (CFMutableArrayRef)wordArray,
                                           (CFMutableDictionaryRef)wordHash);
  NSString *result = NSMakeCollectable(encodedText);
  return [result autorelease];
}

/**
* Find the 'middle snake' of a diff, split the problem in two
* and return the recursively constructed diff.
Expand Down Expand Up @@ -886,26 +903,26 @@ - (NSArray *)diff_linesToCharsForFirstString:(NSString *)text1
* encoded text2 and the NSMutableArray of unique strings. The zeroth element
* of the NSArray of unique strings is intentionally blank.
*/
- (NSArray *)diff_linesToWordsForFirstString:(NSString *)text1
- (NSArray *)diff_wordsToCharsForFirstString:(NSString *)text1
andSecondString:(NSString *)text2;
{
NSMutableArray *lineArray = [NSMutableArray array]; // NSString objects
NSMutableDictionary *lineHash = [NSMutableDictionary dictionary]; // keys: NSString, values:NSNumber
// e.g. [lineArray objectAtIndex:4] == "Hello\n"
// e.g. [lineHash objectForKey:"Hello\n"] == 4
NSMutableArray *wordArray = [NSMutableArray array]; // NSString objects
NSMutableDictionary *wordHash = [NSMutableDictionary dictionary]; // keys: NSString, values:NSNumber
// e.g. [wordArray objectAtIndex:4] == "Hello"
// e.g. [wordHash objectForKey:"Hello"] == 4

// "\x00" is a valid character, but various debuggers don't like it.
// So we'll insert a junk entry to avoid generating a nil character.
[lineArray addObject:@""];

NSString *words1 = (NSString *)diff_linesToWordsMungeCFStringCreate((CFStringRef)text1,
(CFMutableArrayRef)lineArray,
(CFMutableDictionaryRef)lineHash);
NSString *words2 = (NSString *)diff_linesToWordsMungeCFStringCreate((CFStringRef)text2,
(CFMutableArrayRef)lineArray,
(CFMutableDictionaryRef)lineHash);

NSArray *result = [NSArray arrayWithObjects:words1, words2, lineArray, nil];
[wordArray addObject:@""];

NSString *words1 = NSMakeCollectable(diff_wordsToCharsMungeCFStringCreate((CFStringRef)text1,
(CFMutableArrayRef)wordArray,
(CFMutableDictionaryRef)wordHash));
NSString *words2 = NSMakeCollectable(diff_wordsToCharsMungeCFStringCreate((CFStringRef)text2,
(CFMutableArrayRef)wordArray,
(CFMutableDictionaryRef)wordHash));
NSArray *result = [NSArray arrayWithObjects:words1, words2, wordArray, nil];

[words1 release];
[words2 release];
Expand Down
20 changes: 2 additions & 18 deletions DiffMatchPatchCFUtilities.c
Expand Up @@ -523,7 +523,7 @@ CFStringRef diff_linesToCharsMungeCFStringCreate(CFStringRef text, CFMutableArra
* @param lineHash Map of strings to indices.
* @return Encoded CFStringRef.
*/
CFStringRef diff_linesToWordsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash) {
CFStringRef diff_wordsToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash) {

CFStringRef token;
CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);
Expand All @@ -532,10 +532,9 @@ CFStringRef diff_linesToWordsMungeCFStringCreate(CFStringRef text, CFMutableArra

//CFLocaleRef currentLocale = CFLocaleCopyCurrent();

CFOptionFlags options = kCFStringTokenizerUnitWord;
CFOptionFlags options = kCFStringTokenizerUnitWordBoundary;
CFRange tokenizerRange = CFRangeMake(0, textLength);

// The locale parameter is ignored for tokenizing by words
CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, options, NULL);

//CFRelease(currentLocale);
Expand All @@ -545,31 +544,16 @@ CFStringRef diff_linesToWordsMungeCFStringCreate(CFStringRef text, CFMutableArra

// Walk the text, pulling out a substring for each word (or boundary between words).
CFRange tokenRange;
CFIndex prevTokenEnd = 0;
while (mask != kCFStringTokenizerTokenNone) {
tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);

if (tokenRange.location > prevTokenEnd) {
token = diff_CFStringCreateJavaSubstring(text, prevTokenEnd, tokenRange.location);
diff_linesMungeHelper(token, tokenArray, tokenHash, chars);
CFRelease(token);
}

token = diff_CFStringCreateSubstring(text, tokenRange.location, tokenRange.length);
diff_linesMungeHelper(token, tokenArray, tokenHash, chars);
CFRelease(token);

prevTokenEnd = tokenRange.location + tokenRange.length;

mask = CFStringTokenizerAdvanceToNextToken(tokenizer);
}

if (prevTokenEnd <= textLength - 1) {
token = diff_CFStringCreateJavaSubstring(text, prevTokenEnd, textLength);
diff_linesMungeHelper(token, tokenArray, tokenHash, chars);
CFRelease(token);
}

CFRelease(tokenizer);

return chars;
Expand Down
2 changes: 1 addition & 1 deletion DiffMatchPatchCFUtilities.h
Expand Up @@ -33,7 +33,7 @@ CFArrayRef diff_halfMatchCreate(CFStringRef text1, CFStringRef text2, const floa
CFArrayRef diff_halfMatchICreate(CFStringRef longtext, CFStringRef shorttext, CFIndex i);

CFStringRef diff_linesToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef lineArray, CFMutableDictionaryRef lineHash);
CFStringRef diff_linesToWordsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash);
CFStringRef diff_wordsToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash);

CFIndex diff_cleanupSemanticScore(CFStringRef one, CFStringRef two);

Expand Down
53 changes: 51 additions & 2 deletions Tests/DiffMatchPatchTest.m
Expand Up @@ -185,7 +185,7 @@ - (void)test_diff_linesToCharsTest {
[dmp release];
}

- (void)test_diff_linesToWordsTest {
- (void)test_diff_wordsToCharsTest {
DiffMatchPatch *dmp = [DiffMatchPatch new];
NSArray *result;

Expand All @@ -196,11 +196,60 @@ - (void)test_diff_linesToWordsTest {
[tmpVector addObject:@" "];
[tmpVector addObject:@"beta"];
[tmpVector addObject:@"\n"];
result = [dmp diff_linesToWordsForFirstString:@"alpha beta alpha\n" andSecondString:@"beta alpha beta\n"];
result = [dmp diff_wordsToCharsForFirstString:@"alpha beta alpha\n" andSecondString:@"beta alpha beta\n"];
STAssertEqualObjects(@"\001\002\003\002\001\004", [result objectAtIndex:0], @"Convert words down to characters #1");
STAssertEqualObjects(@"\003\002\001\002\003\004", [result objectAtIndex:1], @"Convert words down to characters #2");
STAssertEqualObjects(tmpVector, (NSArray *)[result objectAtIndex:2], @"Convert words down to characters #3");

[tmpVector removeAllObjects];
[tmpVector addObject:@""];
[tmpVector addObject:@"alpha"];
[tmpVector addObject:@"\r"];
[tmpVector addObject:@" "];
[tmpVector addObject:@"beta"];
[tmpVector addObject:@"\r\n"];
result = [dmp diff_wordsToCharsForFirstString:@"" andSecondString:@"alpha\r beta\r \r \r\n"];
STAssertEqualObjects(@"", [result objectAtIndex:0], @"Convert words down to characters #4");
STAssertEqualObjects(@"\001\002\003\004\002\003\002\003\005", [result objectAtIndex:1], @"Convert words down to characters #5");
STAssertEqualObjects(tmpVector, (NSArray *)[result objectAtIndex:2], @"Convert words down to characters #6");

[tmpVector removeAllObjects];
[tmpVector addObject:@""];
[tmpVector addObject:@"a"];
[tmpVector addObject:@"b"];
result = [dmp diff_wordsToCharsForFirstString:@"a" andSecondString:@"b"];
STAssertEqualObjects(@"\001", [result objectAtIndex:0], @"Convert words down to characters #7");
STAssertEqualObjects(@"\002", [result objectAtIndex:1], @"Convert words down to characters #8");
STAssertEqualObjects(tmpVector, (NSArray *)[result objectAtIndex:2], @"Convert words down to characters #9");

// More than 256 to reveal any 8-bit limitations.
unichar n = 300;
[tmpVector removeAllObjects];
NSMutableString *words = [NSMutableString string];
NSMutableString *chars = [NSMutableString string];

[words appendString:@" "];

NSString *currentWord;
unichar i;
for (unichar x = 1; x < n + 1; x++) {
i = x + 1;
currentWord = [NSString stringWithFormat:@"%d ", (int)x];
[tmpVector addObject:[NSString stringWithFormat:@"%d", (int)x]];
[words appendString:currentWord];
[chars appendString:[NSString stringWithFormat:@"%C\001", i]];
}
STAssertEquals((NSUInteger)n, tmpVector.count, @"Convert words down to characters #10");
STAssertEquals((NSUInteger)n, chars.length/2, @"Convert words down to characters #11");
[tmpVector insertObject:@"" atIndex:0];
[tmpVector insertObject:@" " atIndex:1];
[chars insertString:@"\001" atIndex:0];
result = [dmp diff_wordsToCharsForFirstString:words andSecondString:@""];
NSMutableString *charsCmp = [result objectAtIndex:0];
STAssertEqualObjects(chars, charsCmp, @"Convert words down to characters #12");
STAssertEqualObjects(@"", [result objectAtIndex:1], @"Convert words down to characters #13");
STAssertEqualObjects(tmpVector, (NSArray *)[result objectAtIndex:2], @"Convert words down to characters #14");

[dmp release];
}

Expand Down

0 comments on commit 2f0066b

Please sign in to comment.