Skip to content

Commit

Permalink
feat: Prepare for central MovieScrapeJob class
Browse files Browse the repository at this point in the history
Split movie scrapers into scrape jobs.  That way we can run them in
parallel, have better preview windows, etc.

This commit does not implement them, but prepares the file structure.
I do this so that future changes to the result parsing are not
accidentally reverted once this feature is implemented (e.g. because my
feature branch is out of date or has merge conflicts).
  • Loading branch information
bugwelle committed Feb 7, 2023
1 parent 3d6d9d3 commit ebc6a53
Show file tree
Hide file tree
Showing 41 changed files with 1,582 additions and 949 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ third_party/packaging_win

# Deployment
create-dmg/
tmp/

# OS specific
.directory
Expand Down
22 changes: 22 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
# Changelog

## 2.10.2 - tbd

### Notes

-

### Fixed

-

### Changed

-

### Added

-

### Removed

-

## 2.10.0 - Benzar (2023-02-05)

### Notes
Expand Down
14 changes: 14 additions & 0 deletions MediaElch.pro
Original file line number Diff line number Diff line change
Expand Up @@ -258,25 +258,32 @@ SOURCES += src/main.cpp \
src/scrapers/imdb/ImdbReferencePage.cpp \
src/scrapers/movie/adultdvdempire/AdultDvdEmpire.cpp \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireApi.cpp \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireScrapeJob.cpp \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireSearchJob.cpp \
src/scrapers/movie/aebn/AEBN.cpp \
src/scrapers/movie/aebn/AebnApi.cpp \
src/scrapers/movie/aebn/AebnScrapeJob.cpp \
src/scrapers/movie/aebn/AebnSearchJob.cpp \
src/scrapers/movie/custom/CustomMovieScraper.cpp \
src/scrapers/movie/hotmovies/HotMovies.cpp \
src/scrapers/movie/hotmovies/HotMoviesApi.cpp \
src/scrapers/movie/hotmovies/HotMoviesScrapeJob.cpp \
src/scrapers/movie/hotmovies/HotMoviesSearchJob.cpp \
src/scrapers/movie/imdb/ImdbMovie.cpp \
src/scrapers/movie/imdb/ImdbMovieScraper.cpp \
src/scrapers/movie/imdb/ImdbMovieScrapeJob.cpp \
src/scrapers/movie/imdb/ImdbMovieSearchJob.cpp \
src/scrapers/movie/MovieIdentifier.cpp \
src/scrapers/movie/MovieScraper.cpp \
src/scrapers/movie/MovieSearchJob.cpp \
src/scrapers/movie/MovieScrapeJob.cpp \
src/scrapers/movie/tmdb/TmdbMovie.cpp \
src/scrapers/movie/tmdb/TmdbMovieSearchJob.cpp \
src/scrapers/movie/tmdb/TmdbMovieScrapeJob.cpp \
src/scrapers/movie/videobuster/VideoBuster.cpp \
src/scrapers/movie/videobuster/VideoBusterApi.cpp \
src/scrapers/movie/videobuster/VideoBusterSearchJob.cpp \
src/scrapers/movie/videobuster/VideoBusterScrapeJob.cpp \
src/scrapers/music/AllMusic.cpp \
src/scrapers/music/Discogs.cpp \
src/scrapers/music/MusicBrainz.cpp \
Expand Down Expand Up @@ -609,25 +616,32 @@ HEADERS += Version.h \
src/scrapers/imdb/ImdbReferencePage.h \
src/scrapers/movie/adultdvdempire/AdultDvdEmpire.h \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireApi.h \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireScrapeJob.h \
src/scrapers/movie/adultdvdempire/AdultDvdEmpireSearchJob.h \
src/scrapers/movie/aebn/AEBN.h \
src/scrapers/movie/aebn/AebnApi.h \
src/scrapers/movie/aebn/AebnSearchJob.h \
src/scrapers/movie/aebn/AebnScrapeJob.h \
src/scrapers/movie/custom/CustomMovieScraper.h \
src/scrapers/movie/hotmovies/HotMovies.h \
src/scrapers/movie/hotmovies/HotMoviesApi.h \
src/scrapers/movie/hotmovies/HotMoviesSearchJob.h \
src/scrapers/movie/hotmovies/HotMoviesScrapeJob.h \
src/scrapers/movie/imdb/ImdbMovie.h \
src/scrapers/movie/imdb/ImdbMovieScraper.h \
src/scrapers/movie/imdb/ImdbMovieSearchJob.h \
src/scrapers/movie/imdb/ImdbMovieScrapeJob.h \
src/scrapers/movie/MovieIdentifier.h \
src/scrapers/movie/MovieScraper.h \
src/scrapers/movie/MovieSearchJob.h \
src/scrapers/movie/MovieScrapeJob.h \
src/scrapers/movie/tmdb/TmdbMovie.h \
src/scrapers/movie/tmdb/TmdbMovieSearchJob.h \
src/scrapers/movie/tmdb/TmdbMovieScrapeJob.h \
src/scrapers/movie/videobuster/VideoBuster.h \
src/scrapers/movie/videobuster/VideoBusterApi.h \
src/scrapers/movie/videobuster/VideoBusterSearchJob.h \
src/scrapers/movie/videobuster/VideoBusterScrapeJob.h \
src/scrapers/music/AllMusic.h \
src/scrapers/music/Discogs.h \
src/scrapers/music/MusicBrainz.h \
Expand Down
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void loadStylesheet(QApplication& app, const QString& theme, const QStrin
{
const QStringList availableStyles = QStyleFactory::keys();
QString filename;
qDebug() << "Using theme:" << theme;
qCDebug(generic) << "Using theme:" << theme;

#ifdef Q_OS_MAC
QString mainWindowTheme;
Expand Down
1 change: 0 additions & 1 deletion src/renamer/MovieRenamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,6 @@ MovieRenamer::RenameError MovieRenamer::renameMovie(Movie& movie)
QStringList newSubFiles;
bool hasCurrentNewName = false;
for (const QString& subFile : subtitle->files()) {
qDebug() << subFile;
QFileInfo subFi(fi.canonicalPath() + "/" + subFile);
QString newSubFileName = subFileName + "." + subFi.suffix();
if (subFile != newSubFileName) {
Expand Down
1 change: 1 addition & 0 deletions src/scrapers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ add_library(
concert/tmdb/TmdbConcertSearchJob.cpp
movie/MovieIdentifier.cpp
movie/MovieScraper.cpp
movie/MovieScrapeJob.cpp
movie/MovieSearchJob.cpp
music/AllMusic.cpp
music/Discogs.cpp
Expand Down
32 changes: 32 additions & 0 deletions src/scrapers/movie/MovieScrapeJob.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "scrapers/movie/MovieScrapeJob.h"

#include "data/movie/Movie.h"

namespace mediaelch {
namespace scraper {

/// \brief Creates a movie scrape job with the given configuration.
/// \details A new Movie is allocated for the results; this job is passed as
///          its second constructor argument (presumably the QObject parent).
/// \param config Scrape configuration; moved into the job.
/// \param parent Forwarded to the worker::Job base class.
MovieScrapeJob::MovieScrapeJob(MovieScrapeJob::Config config, QObject* parent) :
    worker::Job(parent), m_movie{new Movie({}, this)}, m_config{std::move(config)}
{
    // Re-emit the generic finished() signal as the typed loadFinished() signal,
    // so receivers get a MovieScrapeJob* and need no static_cast from Job*.
    const auto forwardFinished = [this]() { emit loadFinished(this, QPrivateSignal{}); };
    connect(this, &Job::finished, this, forwardFinished);

    // TODO: Change to true / remove once all usages of MovieScrapeJob are updated.
    setAutoDelete(false);
}

/// \brief Returns the scraper error stored in this job (see setScraperError()).
const ScraperError& MovieScrapeJob::scraperError() const
{
    const ScraperError& storedError = m_scraperError;
    return storedError;
}

/// \brief Stores the scraper error and mirrors its fields into the job's
///        error code, error string and error text.
/// \param error Error to store; moved into the job.
void MovieScrapeJob::setScraperError(ScraperError error)
{
    m_scraperError = std::move(error);

    // Read from the stored member from here on: `error` has been moved from.
    const ScraperError& stored = m_scraperError;
    const int errorCode = static_cast<int>(stored.error);
    setError(errorCode);
    setErrorString(stored.message);
    setErrorText(stored.technical);
}

} // namespace scraper
} // namespace mediaelch
66 changes: 66 additions & 0 deletions src/scrapers/movie/MovieScrapeJob.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include "data/Locale.h"
#include "scrapers/ScraperError.h"
#include "scrapers/ScraperInfos.h"
#include "scrapers/movie/MovieIdentifier.h"
#include "utils/Meta.h"
#include "workers/Job.h"

#include <QObject>

class Movie;

namespace mediaelch {
namespace scraper {

/// \brief Base class for jobs that scrape the details of a movie.
/// \details Holds the scrape configuration, the Movie object that results
///          are written to, and the scraper error of the job.
/// \todo Currently not used properly; only used as a base for future changes.
class MovieScrapeJob : public worker::Job
{
    Q_OBJECT

public:
    /// \brief Configuration object for a movie scrape job.
    struct Config
    {
        /// \brief A string that can be consumed by the movie scraper.
        /// \details It is used to uniquely identify the movie. May be an IMDb ID in
        ///          string representation or a URL.
        MovieIdentifier identifier;
        /// \brief Language key for the scraper, e.g. "en-US", "de-DE", ...
        Locale locale = Locale::English;
        /// \brief Movie details to be loaded using the scraper.
        QSet<MovieScraperInfo> details;
    };

public:
    /// \brief Creates a scrape job for the given configuration.
    /// \details Marked explicit because the constructor is callable with a
    ///          single argument; a Config must never silently convert to a job.
    /// \param config Scrape configuration; stored in the job, see config().
    /// \param parent Optional QObject parent.
    explicit MovieScrapeJob(Config config, QObject* parent = nullptr);
    ~MovieScrapeJob() override = default;

public:
    /// \brief The movie object of this job. Never a null reference, as the
    ///        movie is allocated in the constructor.
    ELCH_NODISCARD Movie& movie() { return *m_movie; }
    ELCH_NODISCARD const Movie& movie() const { return *m_movie; }

    /// \brief The configuration this job was created with.
    ELCH_NODISCARD const Config& config() const { return m_config; }
    /// \brief The scraper error stored via setScraperError().
    ELCH_NODISCARD const ScraperError& scraperError() const;

signals:
    /// \brief Signal emitted when the scrape job has finished.
    /// \details A simple wrapper around finished() to avoid static_casts
    ///          from Job* to MovieScrapeJob*.
    ///          Use hasError() and movie() to know whether the request was successful.
    void loadFinished(mediaelch::scraper::MovieScrapeJob* scrapeJob, QPrivateSignal);

protected:
    /// \brief Stores the given scraper error in this job.
    void setScraperError(ScraperError error);

protected:
    /// \brief Movie that results are written to; accessible to subclasses.
    Movie* m_movie = nullptr;

private:
    const Config m_config;
    ScraperError m_scraperError;
};

} // namespace scraper
} // namespace mediaelch
156 changes: 2 additions & 154 deletions src/scrapers/movie/adultdvdempire/AdultDvdEmpire.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace mediaelch {
namespace scraper {

AdultDvdEmpire::AdultDvdEmpire(QObject* parent) : MovieScraper(parent)
AdultDvdEmpire::AdultDvdEmpire(QObject* parent) : MovieScraper(parent), m_scrapeJob(m_api, {}, nullptr)
{
m_meta.identifier = ID;
m_meta.name = "Adult DVD Empire";
Expand Down Expand Up @@ -92,159 +92,7 @@ void AdultDvdEmpire::loadData(QHash<MovieScraper*, mediaelch::scraper::MovieIden

/// \brief Extracts movie details from the given HTML and stores them in the movie.
/// \param html  HTML to run the regular expressions below against.
/// \param movie Movie object that receives the parsed values.
/// \param infos Details to load; a detail is only written if it is contained
///              in this set and its pattern matches.
void AdultDvdEmpire::parseAndAssignInfos(QString html, Movie* movie, QSet<MovieScraperInfo> infos)
{
using namespace std::chrono;

// "doc" is used to convert HTML fragments into plain text.
QTextDocument doc;
// One regex object is reused for all patterns below. Greediness is inverted,
// i.e. quantifiers such as ".*" match as little as possible.
QRegularExpression rx;
rx.setPatternOptions(QRegularExpression::DotMatchesEverythingOption | QRegularExpression::InvertedGreedinessOption);
QRegularExpressionMatch match;

// Title: content of the first <h1> element.
rx.setPattern("<h1>(.*)</h1>");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Title) && match.hasMatch()) {
doc.setHtml(match.captured(1).trimmed());
movie->setName(doc.toPlainText());
}

// Runtime: given as "<h> hrs. <m> mins."; both parts are summed up in minutes.
rx.setPattern("<small>Length: </small> ([0-9]*) hrs. ([0-9]*) mins.[\\s\\n]*</li>");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Runtime) && match.hasMatch()) {
minutes runtime = hours(match.captured(1).toInt()) + minutes(match.captured(2).toInt());
movie->setRuntime(runtime);
}

// Release date: prefer the four-digit "Production Year"; otherwise fall back
// to the "Released:" entry, which consists of a month name, day and year.
if (infos.contains(MovieScraperInfo::Released)) {
rx.setPattern("<li><small>Production Year:</small> ([0-9]{4})[\\s\\n]*</li>");
match = rx.match(html);
if (match.hasMatch()) {
movie->setReleased(QDate::fromString(match.captured(1), "yyyy"));
} else {
rx.setPattern(R"re(<li><small>Released:</small>\s+([A-Za-z]+) (\d{2} \d{4})[\s\n\r]*</li>)re");
match = rx.match(html);
if (match.hasMatch()) {
const QString dateStr = match.captured(2);
// Note: We can't use MMM because Qt < 6 is locale aware.
// Day and year are parsed first; the month is set afterwards from the
// month name via helper::monthNameToInt().
QDate date = QDate::fromString(dateStr, "dd yyyy");
const int month = helper::monthNameToInt(match.captured(1));
date.setDate(date.year(), month, date.day());
movie->setReleased(date);
}
}
}

// Studio: text of the link labelled "Studio - Details".
rx.setPattern("<li><small>Studio: </small><a href=\"[^\"]*\"[\\s\\n]*Category=\"Item Page\"[\\s\\n]*Label=\"Studio "
"- Details\">(.*)[\\s\\n]*</a>");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Studios) && match.hasMatch()) {
doc.setHtml(match.captured(1));
movie->addStudio(doc.toPlainText().trimmed());
}

// Actors: existing actors are replaced by the ones found in the HTML.
if (infos.contains(MovieScraperInfo::Actors)) {
// clear actors
movie->setActors({});

QTextDocument text;

// The Regex is "a bit" more complex because ADE has two HTML styles:
// One with images and one without. The second Regex line has an OR for this.
rx.setPattern(
R"re(<a href="(?:\/[a-zA-Z-]+)?\/\d+\/[^"]+"\r?\n\s+style="[^"]+"\r?\n\s+Category="Item Page" Label="Performer">)re"
R"re((?:(?:<div class="[^"]+"><u>([^<]+)</u>(?:<div[^>]+>)*<img src="([^"]+)")|(?:(?:\r?\n\t+)+(.+)</a>)))re");
rx.optimize();
QRegularExpressionMatchIterator matches = rx.globalMatch(html);
while (matches.hasNext()) {
QRegularExpressionMatch actorMatch = matches.next();
Actor a;
// Capture groups 1 and 2 (name, image URL) belong to the style with
// images; capture group 3 (name only) to the style without.
if (actorMatch.captured(1).isEmpty()) {
text.setHtml(actorMatch.captured(3).trimmed());
a.name = replaceEntities(text.toPlainText());
} else {
text.setHtml(actorMatch.captured(1).trimmed());
a.name = replaceEntities(text.toPlainText());
a.thumb = actorMatch.captured(2);
}
// Actors without a name are not added.
if (!a.name.isEmpty()) {
movie->addActor(a);
}
}
}

// Director: text of the first link labelled "Director".
rx.setPattern(R"(<a href="/\d+/[^"]+"\r\n\s+Category="Item Page" Label="Director">([^<]+)</a>)");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Director) && match.hasMatch()) {
movie->setDirector(match.captured(1).trimmed());
}

// get the list of categories first (to avoid parsing categories of other movies)
rx.setPattern(R"(<strong>Categories:</strong>&nbsp;(.*)</div>)");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Genres) && match.hasMatch()) {
QString categoryHtml = match.captured(1);
// Each link labelled "Category" inside that list becomes one genre.
rx.setPattern(R"(<a href="[^"]*"[\r\s\n]*Category="Item Page" Label="Category">([^<]*)</a>)");

QRegularExpressionMatchIterator matches = rx.globalMatch(categoryHtml);
while (matches.hasNext()) {
movie->addGenre(matches.next().captured(1).trimmed());
}
}

// Overview: paragraphs inside the "synopsis" heading.
rx.setPattern(R"(<h4 class="m-b-0 text-dark synopsis">(<p( class="markdown-h[12]")?>.*)</p></h4>)");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Overview) && match.hasMatch()) {
// add some newlines to simulate the paragraphs (scene descriptions)
QString content{match.captured(1).trimmed()};
content.remove("<p class=\"markdown-h1\">");
content.remove("<p>");
content.replace("<p class=\"markdown-h2\">", "<br>");
content.replace("</p>", "<br>");
doc.setHtml(content);
movie->setOverview(doc.toPlainText());
// The outline is only filled from the plot if the user setting requests it.
if (Settings::instance()->usePlotForOutline()) {
movie->setOutline(doc.toPlainText());
}
}

// Poster: link target of the element with id "front-cover"; the same URL is
// used as thumbnail and as original image.
rx.setPattern("href=\"([^\"]*)\"[\\s\\n]*id=\"front-cover\"");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Poster) && match.hasMatch()) {
Poster p;
p.thumbUrl = match.captured(1);
p.originalUrl = match.captured(1);
movie->images().addPoster(p);
}

// Movie set: text of the link labelled "Series", cleaned up below.
rx.setPattern(R"(<a href="[^"]*"[\s\r\n]*Category="Item Page" Label="Series">[\s\r\n]*([^<]*)<span)");
match = rx.match(html);
if (infos.contains(MovieScraperInfo::Set) && match.hasMatch()) {
doc.setHtml(match.captured(1));
QString setName = doc.toPlainText().trimmed();
// Strip a trailing "Series" (6 characters, case-insensitive) ...
if (setName.endsWith("Series", Qt::CaseInsensitive)) {
setName.chop(6);
}
setName = setName.trimmed();
// ... and one leading / one trailing double quote.
if (setName.startsWith("\"")) {
setName.remove(0, 1);
}
if (setName.endsWith("\"")) {
setName.chop(1);
}
MovieSet set;
set.name = setName.trimmed();
movie->setSet(set);
}

// Backdrops: every link with rel="screenshots" or rel="scenescreenshots";
// the link target (capture group 2) is used as thumbnail and original URL.
if (infos.contains(MovieScraperInfo::Backdrop)) {
rx.setPattern(R"re(<a rel="(scene)?screenshots"[\s\n]*href="([^"]*)")re");
QRegularExpressionMatchIterator matches = rx.globalMatch(html);
while (matches.hasNext()) {
QRegularExpressionMatch backDropMatch = matches.next();
Poster p;
p.thumbUrl = backDropMatch.captured(2);
p.originalUrl = backDropMatch.captured(2);
movie->images().addBackdrop(p);
}
}
// Pass the same HTML, movie and detail set on to the scrape job's parser.
m_scrapeJob.parseAndAssignInfos(html, movie, infos);
}

QString AdultDvdEmpire::replaceEntities(QString str) const
Expand Down
Loading

0 comments on commit ebc6a53

Please sign in to comment.