From 02ede0b62f0d47be2762323c1469a142673f6a4f Mon Sep 17 00:00:00 2001
From: KoalaBear
Date: Fri, 27 Jan 2023 17:51:52 +0100
Subject: [PATCH] - Support `--force-js` for Copyparty

---
 .../DirectoryParser.cs                        |  50 +----
 .../Site/Copyparty/CopypartyParser.cs         | 182 ++++++++++++++++++
 .../Site/Copyparty/CopypartyResult.cs         |  90 +++++++++
 3 files changed, 279 insertions(+), 43 deletions(-)
 create mode 100644 src/OpenDirectoryDownloader/Site/Copyparty/CopypartyParser.cs
 create mode 100644 src/OpenDirectoryDownloader/Site/Copyparty/CopypartyResult.cs

diff --git a/src/OpenDirectoryDownloader/DirectoryParser.cs b/src/OpenDirectoryDownloader/DirectoryParser.cs
index 863194e6..abc0a341 100644
--- a/src/OpenDirectoryDownloader/DirectoryParser.cs
+++ b/src/OpenDirectoryDownloader/DirectoryParser.cs
@@ -9,6 +9,7 @@
 using OpenDirectoryDownloader.Shared;
 using OpenDirectoryDownloader.Shared.Models;
 using OpenDirectoryDownloader.Site.BlitzfilesTech;
+using OpenDirectoryDownloader.Site.Copyparty;
 using OpenDirectoryDownloader.Site.Dropbox;
 using OpenDirectoryDownloader.Site.GDIndex;
 using OpenDirectoryDownloader.Site.GDIndex.Bhadoo;
@@ -269,7 +270,7 @@ await foreach (string source in sources)
 			// copyparty
 			if (htmlDocument.QuerySelector("#op_bup #u2err") is not null)
 			{
-				return ParseCopypartyListing(baseUrl, parsedWebDirectory, htmlDocument);
+				return await ParseCopypartyListingAsync(baseUrl, httpClient, parsedWebDirectory, htmlDocument, html);
 			}
 
 			IHtmlCollection<IElement> pres = htmlDocument.QuerySelectorAll("pre");
@@ -443,49 +444,12 @@ await foreach (string source in sources)
 		return parsedWebDirectory;
 	}
 
-	private static WebDirectory ParseCopypartyListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument)
+	/// <summary>
+	/// Parses a copyparty listing by delegating to <see cref="Copyparty.ParseIndex"/>.
+	/// </summary>
+	private static async Task<WebDirectory> ParseCopypartyListingAsync(string baseUrl, HttpClient httpClient, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, string html)
 	{
-		IElement table = htmlDocument.QuerySelector("table#files");
-
-		parsedWebDirectory.ParsedSuccessfully = true;
-
-		IHtmlCollection<IElement> entries = table.QuerySelectorAll("tbody tr");
-
-		foreach (IElement entry in entries)
-		{
-			IHtmlAnchorElement link = entry.QuerySelector("td:nth-child(2) a") as IHtmlAnchorElement;
-			IHtmlTableCellElement fileSize = entry.QuerySelector("td:nth-child(3)") as IHtmlTableCellElement;
-
-			bool isDirectory = link.TextContent.EndsWith("/");
-
-			if (link is not null)
-			{
-				ProcessUrl(baseUrl, link, out _, out _, out string fullUrl);
-
-				if (isDirectory)
-				{
-					string directoryName = link.TextContent.TrimEnd('/');
-
-					parsedWebDirectory.Subdirectories.Add(new WebDirectory(parsedWebDirectory)
-					{
-						Parser = "ParseCopypartyListing",
-						Url = fullUrl,
-						Name = directoryName
-					});
-				}
-				else
-				{
-					parsedWebDirectory.Files.Add(new WebFile
-					{
-						Url = fullUrl,
-						FileName = Path.GetFileName(WebUtility.UrlDecode(fullUrl.Split('?')[0])),
-						FileSize = FileSizeHelper.ParseFileSize(fileSize.TextContent)
-					});
-				}
-			}
-		}
-
-		return parsedWebDirectory;
+		return await Copyparty.ParseIndex(baseUrl, httpClient, parsedWebDirectory, htmlDocument, html);
 	}
 
 	private static WebDirectory ParseDirLIST(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, IHtmlCollection<IElement> tables)
diff --git a/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyParser.cs b/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyParser.cs
new file mode 100644
index 00000000..d135d663
--- /dev/null
+++ b/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyParser.cs
@@ -0,0 +1,182 @@
+using AngleSharp.Dom;
+using AngleSharp.Html.Dom;
+using OpenDirectoryDownloader.Helpers;
+using OpenDirectoryDownloader.Shared.Models;
+using System.Net;
+using System.Text.RegularExpressions;
+
+namespace OpenDirectoryDownloader.Site.Copyparty;
+
+/// <summary>
+/// Parser for copyparty directory listings. Reads the rows of the HTML table
+/// (<c>table#files</c>) and, when that table has no rows, the JSON listing that
+/// the page assigns to <c>ls0</c> in its script.
+/// </summary>
+public static class Copyparty
+{
+	private const string Parser = "Copyparty";
+
+	// Captures the JSON assigned to "ls0" up to the ';' that ends its line.
+	// "\r?" is needed because '$' in Multiline mode only matches before '\n',
+	// so a page with CRLF line endings would otherwise never match.
+	private static readonly Regex JsListingRegex = new("ls0\\s?=\\s?(?<Listing>.*);\\r?$", RegexOptions.Multiline);
+
+	/// <summary>
+	/// Parses a copyparty listing page into <paramref name="webDirectory"/>.
+	/// </summary>
+	/// <param name="baseUrl">URL of the listing; used to resolve entry links.</param>
+	/// <param name="httpClient">Passed through to the scan; not used for requests here.</param>
+	/// <param name="webDirectory">Directory that receives the parsed subdirectories and files.</param>
+	/// <param name="htmlDocument">Parsed DOM of the listing page.</param>
+	/// <param name="html">Raw HTML of the page, searched for the embedded JS listing.</param>
+	/// <returns>The populated <paramref name="webDirectory"/>.</returns>
+	public static async Task<WebDirectory> ParseIndex(string baseUrl, HttpClient httpClient, WebDirectory webDirectory, IHtmlDocument htmlDocument, string html)
+	{
+		try
+		{
+			webDirectory = await ScanAsync(baseUrl, httpClient, webDirectory, htmlDocument, html);
+		}
+		catch (Exception ex)
+		{
+			Program.Logger.Error(ex, "Error parsing {parser} for '{url}'", Parser, webDirectory.Url);
+			webDirectory.Error = true;
+
+			OpenDirectoryIndexer.Session.Errors++;
+
+			if (!OpenDirectoryIndexer.Session.UrlsWithErrors.Contains(webDirectory.Url))
+			{
+				OpenDirectoryIndexer.Session.UrlsWithErrors.Add(webDirectory.Url);
+			}
+
+			throw;
+		}
+
+		return webDirectory;
+	}
+
+	/// <summary>
+	/// Reads the entries of <c>table#files</c>; falls back to the embedded JS
+	/// listing when the table is missing or has no rows. Failures are logged and
+	/// recorded on the session instead of being rethrown.
+	/// </summary>
+	private static async Task<WebDirectory> ScanAsync(string baseUrl, HttpClient httpClient, WebDirectory webDirectory, IHtmlDocument htmlDocument, string html)
+	{
+		Program.Logger.Debug("Processing listings for '{url}'", webDirectory.Uri);
+
+		webDirectory.Parser = Parser;
+
+		try
+		{
+			IElement table = htmlDocument.QuerySelector("table#files");
+
+			// A missing table is treated like an empty one, so the JS listing is still tried.
+			IHtmlCollection<IElement> entries = table?.QuerySelectorAll("tbody tr");
+
+			if (entries?.Any() == true)
+			{
+				foreach (IElement entry in entries)
+				{
+					IHtmlAnchorElement link = entry.QuerySelector("td:nth-child(2) a") as IHtmlAnchorElement;
+					IHtmlTableCellElement fileSize = entry.QuerySelector("td:nth-child(3)") as IHtmlTableCellElement;
+
+					if (link is not null)
+					{
+						// Read the link only after the null check; rows without an anchor are skipped.
+						bool isDirectory = link.TextContent.EndsWith("/");
+
+						Library.ProcessUrl(baseUrl, link, out _, out _, out string fullUrl);
+
+						if (isDirectory)
+						{
+							string directoryName = link.TextContent.TrimEnd('/');
+
+							webDirectory.Subdirectories.Add(new WebDirectory(webDirectory)
+							{
+								Parser = Parser,
+								Url = fullUrl,
+								Name = directoryName
+							});
+						}
+						else
+						{
+							webDirectory.Files.Add(new WebFile
+							{
+								Url = fullUrl,
+								FileName = Path.GetFileName(WebUtility.UrlDecode(fullUrl.Split('?')[0])),
+								FileSize = FileSizeHelper.ParseFileSize(fileSize.TextContent)
+							});
+						}
+					}
+				}
+
+				webDirectory.ParsedSuccessfully = true;
+			}
+			else
+			{
+				return ParseCopypartyJavaScriptListing(baseUrl, webDirectory, htmlDocument, html);
+			}
+
+			return webDirectory;
+		}
+		catch (Exception ex)
+		{
+			Program.Logger.Error(ex, "Error processing {parser} for '{url}'", Parser, webDirectory.Url);
+			webDirectory.Error = true;
+
+			OpenDirectoryIndexer.Session.Errors++;
+
+			if (!OpenDirectoryIndexer.Session.UrlsWithErrors.Contains(webDirectory.Url))
+			{
+				OpenDirectoryIndexer.Session.UrlsWithErrors.Add(webDirectory.Url);
+			}
+
+			//throw;
+		}
+
+		return webDirectory;
+	}
+
+	/// <summary>
+	/// Parses the JSON listing assigned to <c>ls0</c> in the page's script.
+	/// Returns <paramref name="parsedWebDirectory"/> untouched when no such
+	/// assignment is found.
+	/// </summary>
+	private static WebDirectory ParseCopypartyJavaScriptListing(string baseUrl, WebDirectory parsedWebDirectory, IHtmlDocument htmlDocument, string html)
+	{
+		Match jsListingRegexMatch = JsListingRegex.Match(html);
+
+		if (!jsListingRegexMatch.Success)
+		{
+			return parsedWebDirectory;
+		}
+
+		CopypartyListing copypartyListing = CopypartyListing.FromJson(jsListingRegexMatch.Groups["Listing"].Value);
+
+		Uri baseUri = new(baseUrl);
+
+		// "dirs"/"files" deserialize to null when absent from the JSON.
+		foreach (Dir dir in copypartyListing.Dirs ?? Array.Empty<Dir>())
+		{
+			parsedWebDirectory.Subdirectories.Add(new WebDirectory(parsedWebDirectory)
+			{
+				Parser = Parser,
+				Url = new Uri(baseUri, dir.Href).ToString(),
+				Name = dir.Name.TrimEnd('/')
+			});
+		}
+
+		foreach (Dir file in copypartyListing.Files ?? Array.Empty<Dir>())
+		{
+			parsedWebDirectory.Files.Add(new WebFile
+			{
+				Url = new Uri(baseUri, file.Href).ToString(),
+				FileName = file.Name,
+				FileSize = file.Sz
+			});
+		}
+
+		parsedWebDirectory.ParsedSuccessfully = true;
+
+		return parsedWebDirectory;
+	}
+}
diff --git a/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyResult.cs b/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyResult.cs
new file mode 100644
index 00000000..c905836a
--- /dev/null
+++ b/src/OpenDirectoryDownloader/Site/Copyparty/CopypartyResult.cs
@@ -0,0 +1,90 @@
+using Newtonsoft.Json;
+using Newtonsoft.Json.Converters;
+using System.Globalization;
+
+namespace OpenDirectoryDownloader.Site.Copyparty;
+
+/// <summary>
+/// Root of the JSON listing that the copyparty page assigns to <c>ls0</c>.
+/// </summary>
+public partial class CopypartyListing
+{
+	[JsonProperty("dirs")]
+	public Dir[] Dirs { get; set; }
+
+	[JsonProperty("files")]
+	public Dir[] Files { get; set; }
+
+	[JsonProperty("taglist")]
+	public object[] Taglist { get; set; }
+}
+
+/// <summary>
+/// A single listing entry; the same shape is used for directories and files.
+/// </summary>
+public partial class Dir
+{
+	[JsonProperty("dt")]
+	public DateTimeOffset Dt { get; set; }
+
+	[JsonProperty("ext")]
+	public string Ext { get; set; }
+
+	[JsonProperty("href")]
+	public string Href { get; set; }
+
+	[JsonProperty("lead")]
+	public string Lead { get; set; }
+
+	[JsonProperty("name")]
+	public string Name { get; set; }
+
+	// Used as the file size by the parser.
+	[JsonProperty("sz")]
+	public long Sz { get; set; }
+
+	[JsonProperty("tags")]
+	public Tags Tags { get; set; }
+
+	[JsonProperty("ts")]
+	public long Ts { get; set; }
+}
+
+/// <summary>
+/// Placeholder for the "tags" object; no members are mapped.
+/// </summary>
+public partial class Tags
+{
+}
+
+public partial class CopypartyListing
+{
+	/// <summary>
+	/// Deserializes a listing from <paramref name="json"/> using <see cref="Converter.Settings"/>.
+	/// </summary>
+	public static CopypartyListing FromJson(string json) => JsonConvert.DeserializeObject<CopypartyListing>(json, Converter.Settings);
+}
+
+public static class Serialize
+{
+	/// <summary>
+	/// Serializes the listing back to JSON using <see cref="Converter.Settings"/>.
+	/// </summary>
+	public static string ToJson(this CopypartyListing self) => JsonConvert.SerializeObject(self, Converter.Settings);
+}
+
+/// <summary>
+/// Shared Json.NET settings for the copyparty listing types.
+/// </summary>
+internal static class Converter
+{
+	public static readonly JsonSerializerSettings Settings = new JsonSerializerSettings
+	{
+		MetadataPropertyHandling = MetadataPropertyHandling.Ignore,
+		DateParseHandling = DateParseHandling.None,
+		Converters =
+		{
+			new IsoDateTimeConverter { DateTimeStyles = DateTimeStyles.AssumeUniversal }
+		},
+	};
+}