Skip to content
  • 11 commits
  • 14 files changed
  • 0 commit comments
  • 1 contributor
View
20 NSString+JXRemoving.h
@@ -0,0 +1,20 @@
+//
+// NSString+JXRemoving.h
+// string-splitter
+//
+// Created by Jan on 11.01.12.
+// Copyright 2012 geheimwerk.de. All rights reserved.
+//
+
+#import <Foundation/Foundation.h>
+
+
+@interface NSString (Removing) // Helpers that strip prefixes, suffixes and runs of unwanted characters.
+
+- (NSString *)jx_stringByRemovingPrefix:(NSString *)prefix; // Receiver without its leading `prefix`; the receiver itself when it does not start with `prefix`.
+- (NSString *)jx_stringByRemovingSuffix:(NSString *)suffix; // Receiver without its trailing `suffix`; the receiver itself when it does not end with `suffix`.
+- (NSString *)jx_stringByRemovingSurroundingWhitespace; // Trims whitespace and newline characters from both ends.
+- (NSString *)jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:(NSCharacterSet *)collapsibleCharacterSet
+ intoString:(NSString *)replacementString; // Drops leading/trailing characters of the set and joins the remaining pieces with `replacementString`.
+
+@end
View
61 NSString+JXRemoving.m
@@ -0,0 +1,61 @@
+//
+// NSString+JXRemoving.m
+// string-splitter
+//
+// Created by Jan on 11.01.12.
+// Copyright 2012 geheimwerk.de. All rights reserved.
+//
+
+#import "NSString+JXRemoving.h"
+
+// Based on OmniFoundation/NSString-OFReplacement
+
+@implementation NSString (Removing)
+
+// Returns the receiver with a leading `prefix` cut off. When the receiver does
+// not start with `prefix` (or the match is empty) the receiver itself is returned.
+- (NSString *)jx_stringByRemovingPrefix:(NSString *)prefix
+{
+    NSRange prefixRange = [self rangeOfString:prefix options:NSAnchoredSearch];
+    BOOL startsWithPrefix = (prefixRange.length != 0) && (prefixRange.location == 0);
+
+    if (startsWithPrefix) {
+        return [self substringFromIndex:prefixRange.length];
+    }
+    return [[self retain] autorelease];
+}
+
+// Returns the receiver with a trailing `suffix` cut off, or the receiver itself
+// when -hasSuffix: reports no match.
+- (NSString *)jx_stringByRemovingSuffix:(NSString *)suffix
+{
+    if ([self hasSuffix:suffix]) {
+        NSUInteger keptLength = [self length] - [suffix length];
+        return [self substringToIndex:keptLength];
+    }
+    return [[self retain] autorelease];
+}
+
+// Trims whitespace and newline characters from both ends of the receiver.
+- (NSString *)jx_stringByRemovingSurroundingWhitespace
+{
+    NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];
+    return [self stringByTrimmingCharactersInSet:blanks];
+}
+
+// Splits the receiver at runs of characters from `collapsibleCharacterSet` and
+// joins the pieces with `replacementString`. Runs at the very start or end
+// produce no piece, so they simply disappear.
+- (NSString *)jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:(NSCharacterSet *)collapsibleCharacterSet
+                                                              intoString:(NSString *)replacementString
+{
+    NSUInteger totalLength = [self length];
+    if (totalLength == 0) {
+        return @""; // Nothing to scan.
+    }
+
+    NSMutableString *result = [[NSMutableString alloc] initWithCapacity:totalLength];
+
+    // The scanner skips collapsible characters before every scan, which is
+    // what swallows both the surrounding and the inner runs.
+    NSScanner *scanner = [[NSScanner alloc] initWithString:self];
+    [scanner setCharactersToBeSkipped:collapsibleCharacterSet];
+
+    NSString *chunk;
+    BOOL needsSeparator = NO;
+    while ([scanner scanUpToCharactersFromSet:collapsibleCharacterSet intoString:&chunk]) {
+        if (chunk == nil) {
+            continue;
+        }
+        if (needsSeparator) {
+            [result appendString:replacementString];
+        }
+        needsSeparator = YES;
+        [result appendString:chunk];
+    }
+
+    [scanner release];
+    return [result autorelease];
+}
+
+
+@end
View
26 readability.xcodeproj/project.pbxproj
@@ -16,6 +16,9 @@
3DC9BE5414F7D3D4007062BC /* KBWebArchiver.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BE5214F7D3D4007062BC /* KBWebArchiver.m */; };
3DC9BE7214F93ECC007062BC /* JXReadabilityDocument.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BE7114F93ECC007062BC /* JXReadabilityDocument.m */; };
3DC9BEC414FA5F12007062BC /* NSXMLNode+HTMLUtilities.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */; };
+ 3DDBC67F151DCB9600D8CF54 /* htmls.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC67E151DCB9600D8CF54 /* htmls.m */; };
+ 3DDBC6AA151E306D00D8CF54 /* NSString+ReplaceExtensions.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */; };
+ 3DDBC6AE151E349900D8CF54 /* NSString+JXRemoving.m in Sources */ = {isa = PBXBuildFile; fileRef = 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@@ -49,6 +52,12 @@
3DC9BE7114F93ECC007062BC /* JXReadabilityDocument.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = JXReadabilityDocument.m; path = readability/JXReadabilityDocument.m; sourceTree = "<group>"; };
3DC9BEC214FA5F12007062BC /* NSXMLNode+HTMLUtilities.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSXMLNode+HTMLUtilities.h"; path = "readability/NSXMLNode+HTMLUtilities.h"; sourceTree = "<group>"; };
3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSXMLNode+HTMLUtilities.m"; path = "readability/NSXMLNode+HTMLUtilities.m"; sourceTree = "<group>"; };
+ 3DDBC67D151DCB9600D8CF54 /* htmls.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = htmls.h; path = readability/htmls.h; sourceTree = "<group>"; };
+ 3DDBC67E151DCB9600D8CF54 /* htmls.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = htmls.m; path = readability/htmls.m; sourceTree = "<group>"; };
+ 3DDBC6A8151E306D00D8CF54 /* NSString+ReplaceExtensions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "NSString+ReplaceExtensions.h"; path = "readability/NSString+ReplaceExtensions.h"; sourceTree = "<group>"; };
+ 3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "NSString+ReplaceExtensions.m"; path = "readability/NSString+ReplaceExtensions.m"; sourceTree = "<group>"; };
+ 3DDBC6AC151E349900D8CF54 /* NSString+JXRemoving.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "NSString+JXRemoving.h"; sourceTree = "<group>"; };
+ 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "NSString+JXRemoving.m"; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -69,6 +78,7 @@
children = (
3DC9BE6E14F93EB8007062BC /* Classes */,
3DC9BEBE14FA5CC3007062BC /* Categories */,
+ 3DDBC675151DCB5200D8CF54 /* Other Sources */,
3D706B8E14F6ABDD008ACC2E /* readability */,
3DC9BE5514F7D3DF007062BC /* Third Party */,
3D706B9F14F6AC59008ACC2E /* Configs */,
@@ -155,10 +165,23 @@
3DC9BEC314FA5F12007062BC /* NSXMLNode+HTMLUtilities.m */,
3DACF75C15051755003A6BF7 /* NSString+Counting.h */,
3DACF75D15051755003A6BF7 /* NSString+Counting.m */,
+ 3DDBC6AC151E349900D8CF54 /* NSString+JXRemoving.h */,
+ 3DDBC6AD151E349900D8CF54 /* NSString+JXRemoving.m */,
+ 3DDBC6A8151E306D00D8CF54 /* NSString+ReplaceExtensions.h */,
+ 3DDBC6A9151E306D00D8CF54 /* NSString+ReplaceExtensions.m */,
);
name = Categories;
sourceTree = "<group>";
};
+ 3DDBC675151DCB5200D8CF54 /* Other Sources */ = {
+ isa = PBXGroup;
+ children = (
+ 3DDBC67D151DCB9600D8CF54 /* htmls.h */,
+ 3DDBC67E151DCB9600D8CF54 /* htmls.m */,
+ );
+ name = "Other Sources";
+ sourceTree = "<group>";
+ };
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@@ -216,6 +239,9 @@
3DC9BE7214F93ECC007062BC /* JXReadabilityDocument.m in Sources */,
3DC9BEC414FA5F12007062BC /* NSXMLNode+HTMLUtilities.m in Sources */,
3DACF75E15051755003A6BF7 /* NSString+Counting.m in Sources */,
+ 3DDBC67F151DCB9600D8CF54 /* htmls.m in Sources */,
+ 3DDBC6AA151E306D00D8CF54 /* NSString+ReplaceExtensions.m in Sources */,
+ 3DDBC6AE151E349900D8CF54 /* NSString+JXRemoving.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
View
3 readability/JXReadabilityDocument.h
@@ -48,6 +48,9 @@
@property (nonatomic, retain) NSMutableDictionary *options;
+@property (nonatomic, readonly) NSString *title;
+@property (nonatomic, readonly) NSString *shortTitle;
+
- (id)initWithXMLDocument:(NSXMLDocument *)aDoc copyDocument:(BOOL)doCopy;
- (id)initWithXMLDocument:(NSXMLDocument *)aDoc; // Same as above with doCopy == NO
View
69 readability/JXReadabilityDocument.m
@@ -21,6 +21,7 @@
#import "JXReadabilityDocument.h"
+#import "htmls.h"
#import "NSString+Counting.h"
#import "NSXMLNode+HTMLUtilities.h"
@@ -41,10 +42,6 @@
NSString * const divToPElementsTagNamesString = @"a|blockquote|dl|div|img|ol|p|pre|table|ul";
-// Original XPath: @".//%@". Alternative XPath: @".//*[matches(name(),'%@','i')]"
-NSString * const tagNameXPath = @".//*[lower-case(name())='%@']";
-
-
NSSet * stringSetForListStringDelimitedBy(NSString *listString, NSString *delimiter);
@@ -68,11 +65,6 @@ - (id)initWithNode:(NSXMLNode *)aNode;
}
-@interface JXReadabilityDocument (Private)
-- (NSArray *)tagsIn:(NSXMLNode *)node withNames:(NSString *)firstTagName, ... NS_REQUIRES_NIL_TERMINATION;
-- (NSArray *)reverseTagsIn:(NSXMLNode *)node withNames:(NSString *)firstTagName, ... NS_REQUIRES_NIL_TERMINATION;
-@end
-
@implementation JXReadabilityDocument
@synthesize input;
@@ -147,40 +139,17 @@ - (void)dealloc
}
-- (NSArray *)tagsIn:(NSXMLNode *)node withNames:(NSString *)firstTagName, ...
+- (NSString *)title;
{
- NSMutableArray *tags = [NSMutableArray array];
-
- va_list tag_names;
- va_start (tag_names, firstTagName);
- for (NSString *tagName = firstTagName; tagName != nil; tagName = va_arg(tag_names, NSString *)) {
- NSArray *foundNodes = [node nodesForXPath:[NSString stringWithFormat:tagNameXPath, tagName]
- error:NULL];
- //foundNodes = [[foundNodes reverseObjectEnumerator] allObjects];
- [tags addObjectsFromArray:foundNodes];
- }
- va_end (tag_names);
-
- return tags;
+ return getTitleInDocument(self.html);
}
-- (NSArray *)reverseTagsIn:(NSXMLNode *)node withNames:(NSString *)firstTagName, ...
+- (NSString *)shortTitle;
{
- NSMutableArray *tags = [NSMutableArray array];
-
- va_list tag_names;
- va_start (tag_names, firstTagName);
- for (NSString *tagName = firstTagName; tagName != nil; tagName = va_arg(tag_names, NSString *)) {
- NSArray *foundNodes = [node nodesForXPath:[NSString stringWithFormat:tagNameXPath, tagName]
- error:NULL];
- foundNodes = [[foundNodes reverseObjectEnumerator] allObjects];
- [tags addObjectsFromArray:foundNodes];
- }
- va_end (tag_names);
-
- return tags;
+ return shortenTitleInDocument(self.html);
}
+
- (void)debug:(id)a
{
/*if ([(NSNumber *)[self.options objectForKey:@"debug"] boolValue]) */NSLog(@"%@", a);
@@ -221,7 +190,7 @@ - (void)transformMisusedDivsIntoParagraphs
{
NSArray *nodes;
- nodes = [self tagsIn:self.html withNames:@"div", nil];
+ nodes = [self.html tagsWithNames:@"div", nil];
for (NSXMLNode *elem in nodes) {
// Transform <div>s that do not contain other block elements into <p>s
NSXMLNode *elemNextSibling = [elem nextSibling];
@@ -245,7 +214,7 @@ - (void)transformMisusedDivsIntoParagraphs
NSXMLElement *p;
NSString *s;
- nodes = [self tagsIn:self.html withNames:@"div", nil];
+ nodes = [self.html tagsWithNames:@"div", nil];
for (NSXMLElement *elem in nodes) { // div tags always are elements
NSXMLNode *firstTextNode = [elem lxmlTextNode];
@@ -390,7 +359,7 @@ - (NSXMLDocument *)getArticleForCandidates:(NSDictionary *)candidates andBestCan
// Things like preambles, content split by ads that we removed, etc.
float siblingScoreThreshold = MAX(10.0, ([[bestCandidate objectForKey:@"contentScore"] floatValue] * 0.2));
- NSXMLDocument *output = [[[NSXMLDocument alloc] initWithXMLString:@"<html><body /></html>"
+ NSXMLDocument *output = [[[NSXMLDocument alloc] initWithXMLString:@"<html><head><title /></head><body /></html>"
options:NSXMLDocumentTidyHTML
error:NULL] autorelease];
[output setDocumentContentKind:NSXMLDocumentXHTMLKind];
@@ -490,7 +459,7 @@ - (NSDictionary *)scoreParagraphs
NSMutableDictionary *candidates = [NSMutableDictionary dictionary];
#if 0
- for (NSXMLNode *node in [self tagsIn:self.html withNames:@"div", nil]) {
+ for (NSXMLNode *node in [self.html tagsWithNames:@"div", nil]) {
[self debug:[node readabilityDescription]];
}
#endif
@@ -501,7 +470,7 @@ - (NSDictionary *)scoreParagraphs
NSMutableArray *ordered = [NSMutableArray array];
HashableElement *hashableParent, *hashableGrandParent;
- for (NSXMLElement *elem in [self tagsIn:self.html withNames:@"p", @"pre", @"td", nil]) {
+ for (NSXMLElement *elem in [self.html tagsWithNames:@"p", @"pre", @"td", nil]) {
parentNode = (NSXMLElement *)[elem parent];
if (parentNode == nil) continue;
grandParentNode = (NSXMLElement *)[parentNode parent];
@@ -587,13 +556,13 @@ - (NSXMLDocument *)sanitizeArticle:(NSXMLDocument *)node forCandidates:(NSDictio
NSNumber *minTextLengthNum = [self.options objectForKey:@"minTextLength"];
NSUInteger minLen = (minTextLengthNum != nil) ? [minTextLengthNum unsignedIntegerValue] : TEXT_LENGTH_THRESHOLD;
- for (NSXMLElement *header in [self tagsIn:node withNames:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", nil]) {
+ for (NSXMLElement *header in [node tagsWithNames:@"h1", @"h2", @"h3", @"h4", @"h5", @"h6", nil]) {
if ([self classWeight:header] < 0 || [self getLinkDensity:header] > 0.33) {
[header detach];
}
}
- for (NSXMLElement *elem in [self tagsIn:node withNames:@"form", @"iframe", @"textarea", nil]) {
+ for (NSXMLElement *elem in [node tagsWithNames:@"form", @"iframe", @"textarea", nil]) {
[elem detach];
}
@@ -616,7 +585,7 @@ - (NSXMLDocument *)sanitizeArticle:(NSXMLDocument *)node forCandidates:(NSDictio
#endif
// Conditionally clean <table>s, <ul>s, and <div>s
- for (NSXMLElement *el in [self reverseTagsIn:node withNames:@"table", @"ul", @"div", nil]) {
+ for (NSXMLElement *el in [node tagsWithNames:@"table", @"ul", @"div", nil]) {
hashableEl = [HashableElement elementForNode:el];
if (CFDictionaryContainsValue(allowed, hashableEl)) continue;
@@ -791,7 +760,7 @@ - (NSXMLDocument *)sanitizeArticle:(NSXMLDocument *)node forCandidates:(NSDictio
//[self debug:[NSString stringWithFormat:@"Allowing %@", [el readabilityDescription]]];
BOOL yesBool = YES;
- for (NSXMLElement *desnode in [self tagsIn:el withNames:@"table", @"ul", @"div", nil]) {
+ for (NSXMLElement *desnode in [el tagsWithNames:@"table", @"ul", @"div", nil]) {
CFDictionarySetValue(allowed, [HashableElement elementForNode:desnode], (void *)yesBool);
}
}
@@ -847,13 +816,13 @@ - (NSXMLDocument *)summaryXMLDocument;
}
// Delete non-content nodes
- nodes = [self tagsIn:self.html withNames:@"noscript", @"script", @"style", nil];
+ nodes = [self.html tagsWithNames:@"noscript", @"script", @"style", nil];
for (NSXMLNode *i in nodes) {
[i detach];
}
// Add readability CSS ID to body tag
- nodes = [self tagsIn:self.html withNames:@"body", nil];
+ nodes = [self.html tagsWithNames:@"body", nil];
for (NSXMLNode *i in nodes) {
[i addCSSName:@"readabilityBody" toAttributeWithName:@"id"];
}
@@ -872,6 +841,10 @@ - (NSXMLDocument *)summaryXMLDocument;
if (bestCandidate != nil) {
article = [self getArticleForCandidates:candidates
andBestCandidate:bestCandidate];
+
+ NSXMLElement *titleNode = [[article nodesForXPath:@"/html/head/title"
+ error:NULL] objectAtIndex:0];
+ [titleNode setStringValue:[self title]];
}
else {
if (ruthless) {
View
2 readability/NSString+Counting.h
@@ -11,5 +11,7 @@
@interface NSString (Counting)
- (NSUInteger)countOccurancesOfString:(NSString *)needle;
+- (NSUInteger)countSubstringsWithOptions:(NSStringEnumerationOptions)opts;
+- (BOOL)countOfSubstringsWithOptions:(NSStringEnumerationOptions)opts isAtLeast:(NSUInteger)lowerBound;
@end
View
32 readability/NSString+Counting.m
@@ -30,4 +30,36 @@ - (NSUInteger)countOccurancesOfString:(NSString *)needle;
return count;
}
+
+// Counts the substrings (words, lines, sentences, ... depending on `opts`)
+// that -enumerateSubstringsInRange:options:usingBlock: reports for the receiver.
+- (NSUInteger)countSubstringsWithOptions:(NSStringEnumerationOptions)opts
+{
+    NSUInteger receiverLength = self.length;
+    if (receiverLength == 0) {
+        return 0;
+    }
+
+    // Only the number of callbacks matters, so the substrings need not be built.
+    NSStringEnumerationOptions countingOptions = opts | NSStringEnumerationSubstringNotRequired;
+    NSRange wholeString = NSMakeRange(0, receiverLength);
+
+    __block NSUInteger tally = 0;
+    [self enumerateSubstringsInRange:wholeString
+                             options:countingOptions
+                          usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
+                              tally += 1;
+                          }];
+
+    return tally;
+}
+
+// Reports whether enumerating the receiver with `opts` yields at least
+// `lowerBound` substrings. Enumeration stops as soon as the bound is reached,
+// so this is cheaper than comparing against -countSubstringsWithOptions:.
+//
+// A bound of 0 is satisfied by every string, including the empty one.
+- (BOOL)countOfSubstringsWithOptions:(NSStringEnumerationOptions)opts isAtLeast:(NSUInteger)lowerBound
+{
+    // "At least zero" is always true; answering here also avoids walking the
+    // whole string, because the early-stop test below can never fire for 0.
+    if (lowerBound == 0) return YES;
+
+    NSUInteger receiverLength = self.length;
+    if (receiverLength == 0) return NO;
+
+    __block NSUInteger count = 0;
+
+    [self enumerateSubstringsInRange:NSMakeRange(0, receiverLength)
+                             options:(opts | NSStringEnumerationSubstringNotRequired)
+                          usingBlock:^(NSString *substring, NSRange substringRange, NSRange enclosingRange, BOOL *stop) {
+                              count++;
+                              if (count >= lowerBound) *stop = YES;
+                          }];
+
+    return (count >= lowerBound);
+}
+
@end
View
13 readability/NSString+ReplaceExtensions.h
@@ -0,0 +1,13 @@
+//
+// NSString+ReplaceExtensions.h
+// readability
+//
+// Created by Georg Fritzsche on 17.09.10.
+// http://stackoverflow.com/questions/3733980/replace-multiple-groups-of-characters-in-an-nsstring
+//
+
+#import <Foundation/Foundation.h>
+
+@interface NSString (ReplaceExtensions) // Bulk literal replacement driven by a dictionary.
+- (NSString *)stringByReplacingStringsFromDictionary:(NSDictionary *)dict; // Copy of the receiver in which each key of `dict` is replaced by its value, one key after another in dictionary enumeration order.
+@end
View
27 readability/NSString+ReplaceExtensions.m
@@ -0,0 +1,27 @@
+//
+// NSString+ReplaceExtensions.m
+// readability
+//
+// Created by Georg Fritzsche on 17.09.10.
+// http://stackoverflow.com/questions/3733980/replace-multiple-groups-of-characters-in-an-nsstring
+//
+
+#import "NSString+ReplaceExtensions.h"
+
+@implementation NSString (ReplaceExtensions)
+
+// Returns a copy of the receiver in which every occurrence of each key of
+// `dict` has been replaced by the value stored under that key. The keys are
+// applied one after another in the dictionary's enumeration order, each pass
+// working on the result of the previous one.
+- (NSString *)stringByReplacingStringsFromDictionary:(NSDictionary *)dict
+{
+    NSMutableString *working = [self mutableCopy];
+
+    for (NSString *needle in dict) {
+        NSString *substitute = [dict objectForKey:needle];
+        // Recomputed on every pass: earlier replacements change the length.
+        NSRange everything = NSMakeRange(0, [working length]);
+
+        [working replaceOccurrencesOfString:needle
+                                 withString:substitute
+                                    options:0
+                                      range:everything];
+    }
+
+    return [working autorelease];
+}
+
+@end
View
5 readability/NSXMLNode+HTMLUtilities.h
@@ -8,8 +8,13 @@
#import <Foundation/Foundation.h>
+extern NSString * const tagNameXPath;
+
@interface NSXMLNode (HTMLUtilities)
+- (NSArray *)tagsWithNames:(NSString *)firstTagName, ... NS_REQUIRES_NIL_TERMINATION;
+- (NSArray *)reverseTagsWithNames:(NSString *)firstTagName, ... NS_REQUIRES_NIL_TERMINATION;
+
- (void)addCSSName:(NSString *)cssName toAttributeWithName:(NSString *)attributeName;
- (NSString *)cssNamesForAttributeWithName:(NSString *)attributeName;
View
40 readability/NSXMLNode+HTMLUtilities.m
@@ -8,8 +8,48 @@
#import "NSXMLNode+HTMLUtilities.h"
+
+// Original XPath: @".//%@". Alternative XPath: @".//*[matches(name(),'%@','i')]"
+NSString * const tagNameXPath = @".//*[lower-case(name())='%@']";
+
+
@implementation NSXMLNode (HTMLUtilities)
+// Collects every descendant element whose lower-cased name equals one of the
+// nil-terminated tag names. Results are grouped by tag name in argument order.
+- (NSArray *)tagsWithNames:(NSString *)firstTagName, ...
+{
+    NSMutableArray *matches = [NSMutableArray array];
+
+    va_list remainingNames;
+    va_start(remainingNames, firstTagName);
+
+    NSString *currentName = firstTagName;
+    while (currentName != nil) {
+        NSString *query = [NSString stringWithFormat:tagNameXPath, currentName];
+        NSArray *hits = [self nodesForXPath:query error:NULL];
+        [matches addObjectsFromArray:hits];
+
+        currentName = va_arg(remainingNames, NSString *);
+    }
+
+    va_end(remainingNames);
+
+    return matches;
+}
+
+// Same lookup as -tagsWithNames:, except that the nodes found for each tag
+// name are appended in reverse order. The tag names themselves are still
+// processed in argument order.
+- (NSArray *)reverseTagsWithNames:(NSString *)firstTagName, ...
+{
+    NSMutableArray *matches = [NSMutableArray array];
+
+    va_list remainingNames;
+    va_start(remainingNames, firstTagName);
+
+    NSString *currentName = firstTagName;
+    while (currentName != nil) {
+        NSString *query = [NSString stringWithFormat:tagNameXPath, currentName];
+        NSArray *hits = [self nodesForXPath:query error:NULL];
+        NSArray *reversedHits = [[hits reverseObjectEnumerator] allObjects];
+        [matches addObjectsFromArray:reversedHits];
+
+        currentName = va_arg(remainingNames, NSString *);
+    }
+
+    va_end(remainingNames);
+
+    return matches;
+}
+
+
- (void)addCSSName:(NSString *)cssName toAttributeWithName:(NSString *)attributeName;
{
if ([self kind] == NSXMLElementKind) {
View
14 readability/htmls.h
@@ -0,0 +1,14 @@
+//
+// htmls.h
+// readability
+//
+// Created by Jan on 24.03.12.
+// Copyright (c) 2012 geheimwerk.de. All rights reserved.
+//
+
+#import <Foundation/Foundation.h>
+
+NSString * lxmlCSSToXPath(NSString *cssExpr); // Turns a simple selector (tag, tag#id, #id, tag.class, .class) into a descendant-or-self:: XPath; nil for anything else.
+void addMatch(NSMutableSet *collection, NSString *text, NSString *orig); // Adds normalized `text` to `collection` when it has >= 15 characters, >= 2 words and occurs in `orig` (double quotes ignored).
+NSString * getTitleInDocument(NSXMLDocument *doc); // Normalized lxmlText of the first title node, or @"[no-title]" when the document has none.
+NSString * shortenTitleInDocument(NSXMLDocument *doc); // Shortened variant of the title; @"" when there is no title node, the full normalized title when the result is not between 16 and 149 characters.
View
240 readability/htmls.m
@@ -0,0 +1,240 @@
+//
+// htmls.m
+// readability
+//
+// Created by Jan on 24.03.12.
+// Copyright (c) 2012 geheimwerk.de. All rights reserved.
+//
+
+#import "htmls.h"
+
+#import "NSString+Counting.h"
+#import "NSXMLNode+HTMLUtilities.h"
+#import "NSString+JXRemoving.h"
+#import "NSString+ReplaceExtensions.h"
+
+NSString * normalizeEntities(NSString *curTitle); // Not declared in htmls.h: replaces dash, quote and no-break-space variants with plain ASCII.
+NSString * normTitle(NSString *title); // Not declared in htmls.h: collapses whitespace runs to single spaces, trims the ends, then calls normalizeEntities().
+
+/**
+ * Translates a small subset of CSS selectors into an XPath expression rooted
+ * at `descendant-or-self::`.
+ *
+ * Supported forms (trailing whitespace is tolerated):
+ *   - `tag`                 -> descendant-or-self::tag
+ *   - `tag#id`  / `#id`     -> descendant-or-self::tag[@id = 'id']
+ *   - `tag.cls` / `.cls`    -> descendant-or-self::tag[contains(concat(' ', normalize-space(@class), ' '), ' cls ')]
+ * A missing tag name becomes the wildcard `*`.
+ *
+ * @param cssExpr The CSS selector to translate.
+ * @return The XPath string, or nil when `cssExpr` is nil or uses an
+ *         unsupported selector form.
+ */
+NSString * lxmlCSSToXPath(NSString *cssExpr) {
+    static NSString * const prefix = @"descendant-or-self::";
+
+    static NSRegularExpression *elRe = nil;
+    static NSRegularExpression *idRe = nil;
+    static NSRegularExpression *classRe = nil;
+    static dispatch_once_t compileOnce;
+
+    // dispatch_once makes the lazy compilation safe when the first calls
+    // arrive from several threads at once; the plain BOOL flag was not.
+    dispatch_once(&compileOnce, ^{
+        elRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w+)\\s*$" options:0 error:NULL];
+        idRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w*)#(\\w+)\\s*$" options:0 error:NULL];
+        classRe = [[NSRegularExpression alloc] initWithPattern:@"^(\\w*)\\.(\\w+)\\s*$" options:0 error:NULL];
+    });
+
+    // NSRegularExpression does not accept a nil subject string.
+    if (cssExpr == nil) return nil;
+
+    NSRange cssExprRange = NSMakeRange(0, cssExpr.length);
+    NSTextCheckingResult *match;
+
+    // Bare element name.
+    match = [elRe firstMatchInString:cssExpr options:0 range:cssExprRange];
+    if (match != nil) {
+        NSString *tagName = [cssExpr substringWithRange:[match rangeAtIndex:1]];
+        return [NSString stringWithFormat:@"%@%@", prefix, tagName];
+    }
+
+    // [tag]#id
+    match = [idRe firstMatchInString:cssExpr options:0 range:cssExprRange];
+    if (match != nil) {
+        NSRange tagRange = [match rangeAtIndex:1];
+        BOOL hasTag = (tagRange.location != NSNotFound) && (tagRange.length != 0);
+        NSString *tagName = hasTag ? [cssExpr substringWithRange:tagRange] : @"*";
+        NSString *idValue = [cssExpr substringWithRange:[match rangeAtIndex:2]];
+
+        return [NSString stringWithFormat:@"%@%@[@id = '%@']", prefix, tagName, idValue];
+    }
+
+    // [tag].class -- matches the class as a whole word inside @class.
+    match = [classRe firstMatchInString:cssExpr options:0 range:cssExprRange];
+    if (match != nil) {
+        NSRange tagRange = [match rangeAtIndex:1];
+        BOOL hasTag = (tagRange.location != NSNotFound) && (tagRange.length != 0);
+        NSString *tagName = hasTag ? [cssExpr substringWithRange:tagRange] : @"*";
+        NSString *className = [cssExpr substringWithRange:[match rangeAtIndex:2]];
+
+        return [NSString stringWithFormat:@"%@%@[contains(concat(' ', normalize-space(@class), ' '), ' %@ ')]", prefix, tagName, className];
+    }
+
+    // Unsupported selector form.
+    return nil;
+}
+
+
+/**
+ * Replaces typographic dashes, guillemets, no-break spaces and a few named
+ * HTML entities in `curTitle` with their plain ASCII counterparts.
+ *
+ * The non-ASCII keys are spelled as \u escapes so that they survive any
+ * re-encoding of this source file; as literal characters the dash and
+ * no-break-space keys had degraded to @"" and a plain space, which left the
+ * dictionary with a duplicate empty key and a no-op entry.
+ *
+ * @param curTitle The title text to normalize.
+ * @return The result of -stringByReplacingStringsFromDictionary: for the table below.
+ */
+NSString * normalizeEntities(NSString *curTitle) {
+    NSDictionary *entities = [NSDictionary dictionaryWithObjectsAndKeys:
+                              @"-",  @"\u2014", // EM DASH
+                              @"-",  @"\u2013", // EN DASH
+                              @"-",  @"&mdash;",
+                              @"-",  @"&ndash;",
+                              @" ",  @"\u00a0", // NO-BREAK SPACE
+                              @"\"", @"\u00ab", // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+                              @"\"", @"\u00bb", // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+                              @"\"", @"&quot;",
+                              nil];
+
+    return [curTitle stringByReplacingStringsFromDictionary:entities];
+}
+
+// Normalizes a title: whitespace/newline runs collapse to one space, the ends
+// are trimmed, and the result is passed through normalizeEntities().
+NSString * normTitle(NSString *title) {
+    NSCharacterSet *blanks = [NSCharacterSet whitespaceAndNewlineCharacterSet];
+    NSString *collapsed = [title jx_stringByCollapsingAndRemovingSurroundingCharactersInSet:blanks
+                                                                                  intoString:@" "];
+    return normalizeEntities(collapsed);
+}
+
+// Returns the normalized lxmlText of the first title node in `doc`, or the
+// placeholder @"[no-title]" when the document has no title node at all.
+NSString * getTitleInDocument(NSXMLDocument *doc) {
+    NSArray *titleNodes = [doc tagsWithNames:@"title", nil];
+
+    if ([titleNodes count] == 0) {
+        return @"[no-title]";
+    }
+
+    NSString *rawTitle = [[titleNodes objectAtIndex:0] lxmlText];
+    return normTitle(rawTitle);
+}
+
+// Adds the normalized form of `text` to `collection` when it looks like a
+// plausible title: at least 15 characters, at least two words, and contained
+// in `orig` once double quotes are removed from both strings.
+void addMatch(NSMutableSet *collection, NSString *text, NSString *orig) {
+    NSString *candidate = normTitle(text);
+
+    // Too short to be a meaningful title.
+    if (candidate.length < 15) return;
+    if (![candidate countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:2]) return;
+
+    NSString *quote = @"\"";
+    NSString *candidateBare = [candidate stringByReplacingOccurrencesOfString:quote
+                                                                   withString:@""
+                                                                      options:NSLiteralSearch
+                                                                        range:NSMakeRange(0, candidate.length)];
+    NSString *origBare = [orig stringByReplacingOccurrencesOfString:quote
+                                                         withString:@""
+                                                            options:NSLiteralSearch
+                                                              range:NSMakeRange(0, orig.length)];
+
+    NSRange found = [origBare rangeOfString:candidateBare
+                                    options:NSLiteralSearch
+                                      range:NSMakeRange(0, origBare.length)];
+    if (found.location != NSNotFound) {
+        [collection addObject:candidate];
+    }
+}
+
+// Feeds both text variants of every element in `elements` to addMatch():
+// first its lxmlText, then its full stringValue.
+static void addMatchesForElements(NSArray *elements, NSMutableSet *candidates, NSString *orig) {
+    for (NSXMLElement *element in elements) {
+        NSString *elementText;
+
+        elementText = [element lxmlText];
+        if (elementText) {
+            addMatch(candidates, elementText, orig);
+        }
+
+        elementText = [element stringValue];
+        if (elementText) {
+            addMatch(candidates, elementText, orig);
+        }
+    }
+}
+
+/**
+ * Derives a short title for `doc` from its title node.
+ *
+ * Strategy:
+ *  1. Collect heading-like texts (h1-h3 and elements with typical title
+ *     ids/classes) that addMatch() accepts as part of the full title, and
+ *     pick the longest one.
+ *  2. If there is no such candidate, split the title at " | ", " - ", " :: "
+ *     or " / " and take the first or last part when it has at least 4 words.
+ *  3. If that did not apply either, split at ": " and take the last part when
+ *     it has at least 4 words, otherwise everything after the first ": ".
+ *
+ * @param doc The document to inspect.
+ * @return The shortened title; the full normalized title when the shortened
+ *         one is not between 16 and 149 characters long; @"" when the
+ *         document has no title node or its title text is empty.
+ */
+NSString * shortenTitleInDocument(NSXMLDocument *doc) {
+    static NSArray *cssXPaths = nil;
+    static NSArray *delimiters = nil;
+    static dispatch_once_t setupOnce;
+
+    // One-time, thread-safe setup of the lookup tables.
+    dispatch_once(&setupOnce, ^{
+        NSArray *cssSelectors = [NSArray arrayWithObjects:@"#title", @"#head", @"#heading", @".pageTitle", @".newsTitle", @".title", @".head", @".heading", @".contentheading", @".smallHeaderRed", nil];
+
+        NSMutableArray *cssXPathsMutable = [[NSMutableArray alloc] initWithCapacity:cssSelectors.count];
+
+        for (NSString *selector in cssSelectors) {
+            // lxmlCSSToXPath() returns nil for selectors it cannot translate;
+            // -addObject: would raise on nil.
+            NSString *xpath = lxmlCSSToXPath(selector);
+            if (xpath != nil) {
+                [cssXPathsMutable addObject:xpath];
+            }
+        }
+
+        cssXPaths = [cssXPathsMutable copy];
+        [cssXPathsMutable release];
+
+        // Must be an owned object: an autoreleased array stored in a static
+        // would be deallocated when the current autorelease pool drains.
+        delimiters = [[NSArray alloc] initWithObjects:@" | ", @" - ", @" :: ", @" / ", nil];
+    });
+
+    NSArray *titleNodes = [doc tagsWithNames:@"title", nil];
+
+    if (titleNodes.count == 0) return @"";
+
+    NSString *orig = normTitle([[titleNodes objectAtIndex:0] lxmlText]);
+    // Treat a title node without text like a missing title.
+    if (orig.length == 0) return @"";
+
+    NSString *title = orig;
+
+#warning How does NSXML treat HTML entities?
+
+    NSMutableSet *candidates = [NSMutableSet set];
+
+    addMatchesForElements([doc tagsWithNames:@"h1", @"h2", @"h3", nil], candidates, orig);
+
+    for (NSString *item in cssXPaths) {
+        NSArray *foundNodes = [doc nodesForXPath:item
+                                           error:NULL];
+        addMatchesForElements(foundNodes, candidates, orig);
+    }
+
+    // Test for emptiness, not for the pointer: the set object itself is never
+    // nil, so a pointer test would make the delimiter fallback unreachable.
+    if (candidates.count > 0) {
+        NSSortDescriptor *candidatesAscendingDescriptor = [NSSortDescriptor sortDescriptorWithKey:@"length"
+                                                                                        ascending:YES];
+
+        NSArray *sortedCandidates = [[candidates allObjects] sortedArrayUsingDescriptors:
+                                     [NSArray arrayWithObject:candidatesAscendingDescriptor]];
+
+        // Longest candidate wins.
+        title = [sortedCandidates lastObject];
+    }
+    else {
+        NSArray *parts;
+        BOOL didBreak = NO;
+
+        for (NSString *delimiter in delimiters) {
+            if ([title rangeOfString:delimiter
+                             options:NSLiteralSearch].location == NSNotFound) {
+                continue;
+            }
+
+            parts = [orig componentsSeparatedByString:delimiter];
+
+            NSString *firstPart = [parts objectAtIndex:0];
+            NSString *lastPart = [parts lastObject];
+
+            if ([firstPart countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) {
+                title = firstPart;
+                didBreak = YES;
+                break;
+            }
+            else if ([lastPart countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) {
+                title = lastPart;
+                didBreak = YES;
+                break;
+            }
+        }
+
+        if (!didBreak) {
+            NSString *delimiter = @": ";
+            if ([title rangeOfString:delimiter
+                             options:NSLiteralSearch].location != NSNotFound) {
+                parts = [orig componentsSeparatedByString:delimiter];
+
+                NSString *lastPart = [parts lastObject];
+                if ([lastPart countOfSubstringsWithOptions:NSStringEnumerationByWords isAtLeast:4]) {
+                    title = lastPart;
+                }
+                else {
+                    // Everything after the first ": ". The delimiter was
+                    // found, so `parts` has at least two entries.
+                    title = [[parts subarrayWithRange:NSMakeRange(1, (parts.count - 1))] componentsJoinedByString:delimiter];
+                }
+            }
+        }
+    }
+
+    // Reject implausibly short or long results in favour of the full title.
+    NSUInteger titleLength = title.length;
+    if ( !((15 < titleLength) && (titleLength < 150)) ) return orig;
+
+    return title;
+}
View
13 readability/main.m
@@ -15,15 +15,16 @@
#import "JXReadabilityDocument.h"
-BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error);
+BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error);
-BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error) {
+BOOL dumpXMLDocumentToPath(NSXMLDocument *doc, NSString *output, NSUInteger xmlOutputOptions, NSString *tag, NSError **error) {
if (output == nil) return NO;
NSString *outputPath = nil;
if (tag == nil) {
- outputPath = output;
+ outputPath = [[output stringByDeletingPathExtension]
+ stringByAppendingPathExtension:@"html"];
} else {
outputPath = [[[output stringByDeletingPathExtension]
stringByAppendingString:tag]
@@ -174,6 +175,10 @@ int main(int argc, const char * argv[])
copyDocument:NO];
summaryDoc = [readabilityDoc summaryXMLDocument];
cleanedDoc = readabilityDoc.html;
+
+ //NSLog(@"\nTitle: %@", readabilityDoc.title);
+ //NSLog(@"\nShort Title: %@", readabilityDoc.shortTitle);
+
[readabilityDoc release];
}
@@ -195,7 +200,7 @@ int main(int argc, const char * argv[])
BOOL success;
// Create a new webarchive with the processed markup as main content and the resources from the source webarchive
- NSData *docData = [doc XMLDataWithOptions:xmlOutputOptions];
+ NSData *docData = [summaryDoc XMLDataWithOptions:xmlOutputOptions];
WebResource *mainResource = [[WebResource alloc] initWithData:docData
URL:[resource URL]
MIMEType:[resource MIMEType]

No commit comments for this range

Something went wrong with that request. Please try again.