In [72]:
import requests
import logging
import json
import os
from bs4 import BeautifulSoup
from dataclasses import dataclass
from configuration import (
    WIKIPEDIA_HOST_URL,
    WIKIPEDIA_SEARCH_API
)

# Base name shared by the loggers in this notebook; LoggerMixin appends the
# concrete class name to it to build per-class logger names.
LOGGER_BASENAME = 'wikisearch'
LOGGER = logging.getLogger(LOGGER_BASENAME)
# Attach a NullHandler so the logger always has a handler, even when no
# logging configuration has been applied yet.
LOGGER.addHandler(logging.NullHandler())

@dataclass
class SearchResult:
    """One search hit: the title of a page and the URL of that page."""
    # Title string as returned by the search endpoint.
    title: str
    # URL paired with that title in the search response.
    url: str

class LoggerMixin:
    """Mixin that equips an instance with a ``_logger`` attribute.

    The logger is named ``<LOGGER_BASENAME>.<ClassName>``, where ``ClassName``
    is the name of the concrete class being instantiated.
    """

    def __init__(self) -> None:
        class_name = self.__class__.__name__
        logger_name = f'{LOGGER_BASENAME}.{class_name}'
        self._logger = logging.getLogger(logger_name)

class WikipediaSeries(LoggerMixin):
    """Search for a TV series and scrape its seasons and episodes from HTML.

    Attributes:
        search_url: Endpoint queried by ``_search`` (``WIKIPEDIA_SEARCH_API``).
        seasons: List of ``Season`` objects, filled by
            ``parse_seasons_and_episodes_from_soup``.
        title: The searched name; set by ``search_by_name`` only when a query
            returned exactly one result, otherwise ``None``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.search_url = WIKIPEDIA_SEARCH_API
        self.seasons = []
        self.title = None

    def __str__(self):
        return f'series seasons: {self.seasons}'

    def _get_query_map(self, name):
        """Return the search phrases to try for ``name``, keyed by query type."""
        return {
            'episode_list': f'list of {name} episodes',
            'miniseries': f'{name} miniseries',
            'name': f'{name}'
        }

    def search_by_name(self, name):
        """Run the queries from ``_get_query_map`` in order until one has hits.

        Args:
            name: Series name to search for.

        Returns:
            The list of ``SearchResult`` objects from the first query that
            returned any, or ``None`` when every query came back empty or
            failed.
        """
        # `query_type` rather than `type`, so the builtin is not shadowed.
        for query_type, query in self._get_query_map(name).items():
            self._logger.debug(f'Searching for {name} with type:{query_type}')
            result = self._search(query)
            if result:
                # Only an unambiguous (single) hit is taken as the series title.
                if len(result) == 1:
                    self.title = name
                return result
        return None

    def _search(self, query):
        """Send one opensearch request and convert the reply to results.

        Args:
            query: Search phrase sent as the ``search`` parameter.

        Returns:
            A list of ``SearchResult`` built by pairing element 1 of the JSON
            reply with element 3, or ``None`` when the request failed.
        """
        parameters = {'action': 'opensearch',
                      'format': 'json',
                      'formatversion': '2',
                      'search': query}

        response = requests.get(self.search_url, params=parameters)
        if not response.ok:
            # requests exposes the HTTP status as `status_code`; `code` does
            # not exist and raised AttributeError on this error path.
            self._logger.error(
                f'Request failed with code {response.status_code} and message {response.text}')
            return None
        # Decode the body once instead of once per indexed element.
        payload = response.json()
        return [SearchResult(title, url) for title, url in zip(payload[1], payload[3])]

    def get_soup_by_url(self, url):
        """Fetch ``url`` and return its body parsed with ``html.parser``."""
        html_response = requests.get(url)
        return BeautifulSoup(html_response.text, 'html.parser')

    def parse_seasons_from_soup(self, soup):
        """Collect the link text of the header cells of the overview table.

        Looks at the first ``table`` whose class is
        ``wikitable plainrowheaders`` and, for every ``th`` containing an
        ``a`` element, takes the first child of that link.

        Returns:
            The list of collected values; empty when no such table exists.
        """
        table = soup.find("table", {"class": "wikitable plainrowheaders"})
        if table is None:
            self._logger.warning('No "wikitable plainrowheaders" table found')
            return []
        season_list = []
        for header in table.find_all("th"):
            season = header.find("a")
            # Skip header cells without a link, and links without content.
            if season is not None and season.contents:
                season_list.append(season.contents[0])
        return season_list

    def parse_seasons_and_episodes_from_soup(self, soup):
        """Build ``self.seasons`` from every episode table in ``soup``.

        For each ``wikiepisodetable`` the season label is read from the
        ``mw-headline`` span of the preceding sibling ``h3``. Each ``vevent``
        row becomes an ``Episode`` whose number is the first child of the
        row's first ``td`` and whose title is the first child of the link in
        the ``summary`` cell (or of the cell itself when it has no link).
        Tables or rows missing any of these pieces are skipped with a warning
        instead of raising.
        """
        season_list = []
        tables = soup.find_all("table", {"class": "wikitable plainrowheaders wikiepisodetable"})
        for table in tables:
            season_header = table.find_previous_sibling('h3')
            headline = None
            if season_header is not None:
                headline = season_header.find("span", {"class": "mw-headline"})
            if headline is None or not headline.contents:
                self._logger.warning('Skipping an episode table with no season headline before it')
                continue
            season = Season(headline.contents[0])

            episode_list = []
            for episode in table.find_all("tr", {"class": "vevent"}):
                number_cell = episode.find("td")
                summary_cell = episode.find("td", {"class": "summary"})
                if number_cell is None or summary_cell is None:
                    self._logger.warning('Skipping an episode row without number or summary cell')
                    continue
                # Prefer the linked title; fall back to the cell's own content.
                link = summary_cell.find("a")
                title_source = summary_cell if link is None else link
                if not number_cell.contents or not title_source.contents:
                    self._logger.warning('Skipping an episode row with an empty number or title')
                    continue
                episode_list.append(Episode(title_source.contents[0], number_cell.contents[0]))

            season.episodes = episode_list
            season_list.append(season)
        self.seasons = season_list

    def write_to_file_system(self):
        """Ensure the directory ``./<title>`` exists.

        Does nothing (beyond a warning) when no title has been set. Only the
        directory is created; no files are written here.
        """
        if not self.title:
            self._logger.warning('No series title set; not creating a directory')
            return
        # os.path.dirname('./<title>') evaluates to '.', which always exists,
        # so the target directory was never created; build the path directly.
        directory = os.path.join('.', self.title)
        os.makedirs(directory, exist_ok=True)

class Season:
    """A season label together with the episodes that belong to it."""

    def __init__(self, number) -> None:
        super().__init__()
        self.episodes = []
        self.number = number

    def get_episodes_json(self):
        """Return a JSON array string holding ``str()`` of each episode."""
        rendered = [str(episode) for episode in self.episodes]
        return json.dumps(rendered)


class Episode:
    """A single episode, identified by its number and title."""

    def __init__(self, title, number) -> None:
        super().__init__()
        self.number = number
        self.title = title

    def __str__(self):
        """Render as ``episode:<number>,  title:<title>``."""
        number, title = self.number, self.title
        return f'episode:{number},  title:{title}'



In [50]:
# Search for the series, print the hits, then scrape the page of the first hit
# so that `test.seasons` is populated for the cells that read it.
test = WikipediaSeries()
result = test.search_by_name("Dexter")
print(result)
# NOTE(review): `result[0]` raises if the search returned None or an empty list.
soup = test.get_soup_by_url(result[0].url)
#seasons = test.parse_seasons_from_soup(soup)
test.parse_seasons_and_episodes_from_soup(soup)

[SearchResult(title='List of Dexter episodes', url='https://en.wikipedia.org/wiki/List_of_Dexter_episodes')]


In [73]:
# Write each season's episode list to
# ./results/<series title>/<season label>/episodes.json.
for season in test.seasons:
    print(season.number)
    directory = os.path.dirname(f'./results/{test.title}/{season.number}/')
    print(directory)
    # exist_ok=True keeps the cell re-runnable. The file is written on every
    # run; previously it was written only when the directory had just been
    # created, so an existing directory silently skipped the write.
    os.makedirs(directory, exist_ok=True)
    with open(f'{directory}/episodes.json', 'w') as episodes_file:
        episodes_file.write(season.get_episodes_json())
        


Season 1 (2006)
./results/Dexter/Season 1 (2006)
Season 2 (2007)
./results/Dexter/Season 2 (2007)
Season 3 (2008)
./results/Dexter/Season 3 (2008)
Season 4 (2009)
./results/Dexter/Season 4 (2009)
Season 5 (2010)
./results/Dexter/Season 5 (2010)
Season 6 (2011)
./results/Dexter/Season 6 (2011)
Season 7 (2012)
./results/Dexter/Season 7 (2012)
Season 8 (2013)
./results/Dexter/Season 8 (2013)
Season 1 (2009–10)
./results/Dexter/Season 1 (2009–10)
Season 2: 
./results/Dexter/Season 2: 
Season 3: 
./results/Dexter/Season 3: 


In [48]:
# Print every season's episode list as JSON, separated by a dashed rule.
# The f-string below was missing its closing brace, which is a SyntaxError.
# NOTE(review): os.path.dirname('./<title>') evaluates to '.', and `directory`
# is not used in this cell - confirm whether it is still needed.
directory = os.path.dirname(f'./{test.title}')
for season in test.seasons:
    print(season.get_episodes_json())
    print("---" * 10)

["episode:1,  title:Dexter", "episode:2,  title:Crocodile", "episode:3,  title:Popping Cherry", "episode:4,  title:Let's Give the Boy a Hand", "episode:5,  title:Love American Style", "episode:6,  title:Return to Sender", "episode:7,  title:Circle of Friends", "episode:8,  title:\"Shrink Wrap\"", "episode:9,  title:Father Knows Best", "episode:10,  title:Seeing Red", "episode:11,  title:Truth Be Told", "episode:12,  title:Born Free"]
------------------------------
["episode:1,  title:It's Alive!", "episode:2,  title:Waiting to Exhale", "episode:3,  title:An Inconvenient Lie", "episode:4,  title:See-Through", "episode:5,  title:The Dark Defender", "episode:6,  title:Dex, Lies, and Videotape", "episode:7,  title:That Night, A Forest Grew", "episode:8,  title:Morning Comes", "episode:9,  title:Resistance Is Futile", "episode:10,  title:There's Something About Harry", "episode:11,  title:Left Turn Ahead", "episode:12,  title:The British Invasion"]
------------------------------
["episode:1

In [4]:
import coloredlogs
def setup_logging(level, config_file=None):
    """
    Sets up the logging.

    When ``config_file`` is given, it is read as JSON and handed to
    ``logging.config.dictConfig``. Otherwise ``coloredlogs`` is installed at
    the requested level.

    Args:
        level: At which level do we log (name such as "debug"; it is
            upper-cased before being passed to coloredlogs)
        config_file: Path of a JSON logging configuration to use

    Raises:
        SystemExit: With code 1 when a ValueError is raised while loading or
            applying the configuration (for example invalid JSON).
        OSError: When ``config_file`` cannot be opened; file errors are not
            handled here.

    """
    # `import logging` alone does not load the `logging.config` submodule, so
    # import it explicitly before calling dictConfig.
    import logging.config

    if config_file:
        try:
            with open(config_file) as conf_file:
                configuration = json.load(conf_file)
                # Configure the logger
                logging.config.dictConfig(configuration)
        except ValueError:
            print(f'File "{config_file}" is not valid json, cannot continue.')
            raise SystemExit(1)
    else:
        # No config file: default to coloredlogs output at the given level.
        coloredlogs.install(level=level.upper())

In [5]:
# No config file is passed, so setup_logging takes its coloredlogs branch.
setup_logging("debug")

In [6]:
# Hard-coded sample in the shape that WikipediaSeries._search indexes:
# element 1 holds the titles and element 3 holds the matching URLs.
# NOTE(review): this rebinds `result` to a list of raw lists, so `result[0].url`
# in other cells does not work against this value.
result = ['Ted Lasso', ['Ted Lasso', 'Ted Lawson', 'Tel Assor'], ['', '', ''], ['https://en.wikipedia.org/wiki/Ted_Lasso', 'https://en.wikipedia.org/wiki/Ted_Lawson', 'https://en.wikipedia.org/wiki/Tel_Assor']]

# Pair each title with its URL, the same zip used in WikipediaSeries._search.
for item in zip(result[1], result[3]):
    print(SearchResult(*item))

SearchResult(title='Ted Lasso', url='https://en.wikipedia.org/wiki/Ted_Lasso')
SearchResult(title='Ted Lawson', url='https://en.wikipedia.org/wiki/Ted_Lawson')
SearchResult(title='Tel Assor', url='https://en.wikipedia.org/wiki/Tel_Assor')


In [42]:
# Fetch the first search hit, find the "Series overview" headline and print
# the header cells (and any links in them) of the overview table after it.
print(result[0].url)
html_response = requests.get(result[0].url)
soup = BeautifulSoup(html_response.text, 'html.parser')
# Bound to `headlines` rather than `test`: other cells read `test.seasons`
# from the WikipediaSeries instance named `test`, which this cell used to
# overwrite with the list of headline spans.
headlines = soup.find_all("span", {"class": "mw-headline"})
for headline in headlines:
    if headline.string == "Series overview":
        print(headline.contents)
        table = headline.find_next("table", {"class": "wikitable plainrowheaders"})
        seasons = table.find_all("th")
        for season in seasons:
            print(season.contents)
            theseason = season.find("a")
            if theseason:
                print(theseason.contents)

2021-02-26 15:55:05 sbpltg1b3hv2j urllib3.connectionpool[54960] DEBUG Starting new HTTPS connection (1): en.wikipedia.org:443
https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_episodes
2021-02-26 15:55:05 sbpltg1b3hv2j urllib3.connectionpool[54960] DEBUG https://en.wikipedia.org:443 "GET /wiki/List_of_Game_of_Thrones_episodes HTTP/1.1" 200 51883
['Series overview']
['Season']
['Episodes']
['Originally aired']
[<abbr title="Average">Avg.</abbr>, ' U.S. viewers', <br/>, '(millions)']
['First aired']
['Last aired']
[<a href="#Season_1_(2011)">1</a>]
['1']
[<a href="#Season_2_(2012)">2</a>]
['2']
[<a href="#Season_3_(2013)">3</a>]
['3']
[<a href="#Season_4_(2014)">4</a>]
['4']
[<a href="#Season_5_(2015)">5</a>]
['5']
[<a href="#Season_6_(2016)">6</a>]
['6']
[<a href="#Season_7_(2017)">7</a>]
['7']
[<a href="#Season_8_(2019)">8</a>]
['8']


In [30]:
# `seasons` is not defined in this cell; it must already exist in the kernel
# from a previously executed cell.
print(seasons)

[<a href="#Season_1_(2006)">1</a>, <a href="#Season_2_(2007)">2</a>, <a href="#Season_3_(2008)">3</a>, <a href="#Season_4_(2009)">4</a>, <a href="#Season_5_(2010)">5</a>, <a href="#Season_6_(2011)">6</a>, <a href="#Season_7_(2012)">7</a>, <a href="#Season_8_(2013)">8</a>]


In [47]:
# Inline version of the lookup done in WikipediaSeries.parse_seasons_from_soup:
# take the first "wikitable plainrowheaders" table and print every header cell,
# plus the link content for the cells that contain a link.
html_response = requests.get(result[0].url)
soup = BeautifulSoup(html_response.text, 'html.parser')
table = soup.find("table", {"class": "wikitable plainrowheaders"})
# NOTE(review): `table` is None when the page has no such table, and the next
# line would then raise.
seasons = table.find_all("th")
for season in seasons:
    print(season.contents)
    theseason = season.find("a")
    if theseason:
        print(theseason.contents)

2021-02-26 15:59:25 sbpltg1b3hv2j urllib3.connectionpool[54960] DEBUG Starting new HTTPS connection (1): en.wikipedia.org:443
2021-02-26 15:59:25 sbpltg1b3hv2j urllib3.connectionpool[54960] DEBUG https://en.wikipedia.org:443 "GET /wiki/List_of_Dexter_episodes HTTP/1.1" 200 43265
['Season']
['Episodes']
['Originally aired']
['First aired']
['Last aired']
[<a href="#Season_1_(2006)">1</a>]
['1']
[<a href="#Season_2_(2007)">2</a>]
['2']
[<a href="#Season_3_(2008)">3</a>]
['3']
[<a href="#Season_4_(2009)">4</a>]
['4']
[<a href="#Season_5_(2010)">5</a>]
['5']
[<a href="#Season_6_(2011)">6</a>]
['6']
[<a href="#Season_7_(2012)">7</a>]
['7']
[<a href="#Season_8_(2013)">8</a>]
['8']


In [44]:
# `table` is not defined in this cell; it must already exist in the kernel
# from a previously executed cell.
print(table)

<table class="wikitable plainrowheaders" style="text-align:center"><tbody><tr style="text-align:center"><th colspan="2" rowspan="2" scope="col" style="min-width:50px;padding:0 8px">Season</th><th colspan="2" rowspan="2" scope="col" style="padding:0 8px">Episodes</th><th colspan="2" scope="col">Originally aired</th><th rowspan="2" scope="col" style="padding:0 8px"><abbr title="Average">Avg.</abbr> U.S. viewers<br/>(millions)</th></tr><tr><th scope="col">First aired</th><th scope="col">Last aired</th></tr><tr><td style="background:#295354;width:10px"></td><th colspan="1" scope="row" style="text-align:center"><a href="#Season_1_(2011)">1</a></th><td colspan="2">10</td><td colspan="1" style="padding:0.2em 0.4em">April 17, 2011<span style="display:none"> (<span class="bday dtstart published updated">2011-04-17</span>)</span></td><td style="padding:0 8px">June 19, 2011<span style="display:none"> (<span class="dtend">2011-06-19</span>)</span></td><td>2.52<sup class="reference" id="cite_ref-av