In [2]:
import os
import re
from datetime import datetime
from time import sleep
from typing import List, Tuple, Union

import mysql.connector
import requests
from bs4 import BeautifulSoup
from mysql.connector.errors import IntegrityError
from tqdm.notebook import tqdm

# API tests

In [3]:
class MAL_API:
    """Minimal client for the MyAnimeList (MAL) API v2.

    Every request carries the `X-MAL-CLIENT-ID` header built from the client id
    passed to the constructor.
    """

    USERS_ANIMELIST_URL = 'https://api.myanimelist.net/v2/users/{}/animelist?fields=list_status&limit=100'
    ANIME_DETAILS_URL = (
        'https://api.myanimelist.net/v2/anime/{}?fields=id,title,start_date,end_date,mean,rank,popularity,'
        'num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,'
        'source,average_episode_duration,rating,related_anime,recommendations,studios,statistics'
    )
    # Seconds to wait for the API before giving up, so a stalled connection cannot hang the notebook.
    REQUEST_TIMEOUT = 30
    # Error codes that quietly end the retrieval of an animelist
    # (presumably a private list or an unknown user - confirm against the API docs).
    LIST_UNAVAILABLE_ERRORS = ('not_permitted', 'not_found')
    # Lists with keys from animelist dictionaries of variables to be extracted.
    list_entry_node_keys = ['id', 'title']
    list_entry_list_status_keys = ['status', 'score', 'num_episodes_watched', 'updated_at']

    def __init__(self, client_id: str) -> None:
        """Constructor that returns a MAL_API object with CLIENT_ID defined in headers
           and convenience functions for making calls to the API.

        Args:
            client_id (str): Authorization token.
        """
        self.CLIENT_ID = client_id
        self.api_headers = {
            'X-MAL-CLIENT-ID': self.CLIENT_ID
        }

    # TODO remove this - not used
    def get_anime_ranking(self, request_parameters: str = '?ranking_type=all') -> dict:
        """Retrieves the anime ranking endpoint of the API.

        Args:
            request_parameters (str, optional): Query string (including the leading `?`) appended
                                                to the ranking URL. Defaults to '?ranking_type=all'.

        Returns:
            dict: Decoded JSON body of the response.
        """
        response = requests.get(
            'https://api.myanimelist.net/v2/anime/ranking' + request_parameters,
            headers=self.api_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def __parse_list_entry(self, entry: dict) -> dict:
        """Parse an entry of animelist. Select important variables and merge dictionaries.

        Args:
            entry (dict): Anime entry in a animelist from the API, with `node` and `list_status` keys.

        Returns:
            dict: Parsed entry of animelist as a single dictionary.
        """
        node = entry['node']
        list_status = entry['list_status']
        parsed_entry = {key: node[key] for key in self.list_entry_node_keys}
        parsed_entry.update({key: list_status[key] for key in self.list_entry_list_status_keys})
        # Not all entries have start and/or finished dates. Users have not started or finished
        # some of the anime on their lists. Missing dates are stored as `None`.
        parsed_entry['start_date'] = list_status.get('start_date')
        parsed_entry['finish_date'] = list_status.get('finish_date')
        return parsed_entry

    def get_users_animelist(self, username: str) -> list:
        """Gets an animelist of a user by their username from the MAL API.

        Pages are followed through `paging.next` until a page has no `next` key.

        Args:
            username (str): Username of the user, whose animelist will be retrieved.

        Returns:
            list: Each element of the list is a single anime entry in the animelist.
                  A single anime entry is a dictionary with selected values of the entry.
                  Entries collected so far are returned when the API answers with one of
                  `LIST_UNAVAILABLE_ERRORS`.

        Raises:
            RuntimeError: If the API answers with any other error. Previously such a response
                          was ignored and the same URL was requested again in an endless loop.
        """
        animelist = []
        animelist_url = self.USERS_ANIMELIST_URL.format(username)
        while animelist_url is not None:
            animelist_page = requests.get(
                animelist_url, headers=self.api_headers, timeout=self.REQUEST_TIMEOUT
            ).json()
            if 'error' in animelist_page:
                error = animelist_page['error']
                if error in self.LIST_UNAVAILABLE_ERRORS:
                    break
                raise RuntimeError(f'MAL API returned error {error!r} for animelist of user {username!r}.')
            # Add page to animelist list.
            animelist.extend(self.__parse_list_entry(anime_entry) for anime_entry in animelist_page['data'])
            # Change url to next page. No `next` key means this was the last page.
            animelist_url = animelist_page['paging'].get('next')
        return animelist

    def get_anime_details(self, anime_id: int) -> dict:
        """Makes a call to the API and retrieves details about an anime by its ID.

        Args:
            anime_id (int): ID of the anime in MAL DB to be retrieved.

        Returns:
            dict: Details of an anime (decoded JSON body of the response).
        """
        response = requests.get(
            self.ANIME_DETAILS_URL.format(anime_id),
            headers=self.api_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()
        

# The MAL client id is a secret: read it from the environment instead of committing it
# to the notebook. Set MAL_CLIENT_ID before starting the kernel.
CLIENT_ID = os.environ['MAL_CLIENT_ID']
mal_api = MAL_API(client_id=CLIENT_ID)

In [7]:
class AnimeDatabase:
    """Populates a MySQL database with MAL users, their animelists and anime details.

    First populate the User table. Then populate the List_Entry table.
    If an anime is not present in the db, when adding an entry to List_Entry table first add anime to db.
    """

    USERS_PAGE_URL = 'https://myanimelist.net/users.php'
    # Seconds to wait for the users page before giving up.
    REQUEST_TIMEOUT = 30
    ANIME_TABLE_FIELDS = [
        'id', 'title', 'start_date', 'end_date', 'mean_score', 'anime_rank',
        'popularity', 'num_list_users', 'num_scoring_users', 'nsfw',
        'media_type', 'status', 'num_episodes', 'season_year', 'season',
        'broadcast_day_of_week', 'broadcast_start_time', 'source',
        'average_episode_duration', 'rating', 'status_watching',
        'status_completed', 'status_on_hold', 'status_dropped',
        'status_plan_to_watch'
    ]

    def __init__(self, host: str, user: str, password: str, database: str, mal_api: MAL_API) -> None:
        """Construct an object with a database connection object and cursor for querying the db.

        Args:
            host (str): Database host.
            user (str): Database username.
            password (str): Database user password.
            database (str): Database name.
            mal_api (MAL_API): An object of MAL_API class. Used for retrieving anime details when inserting
                                an entry of users animelist and when an anime is not yet present in db.
        """
        self.mal_api = mal_api
        self.db_conn = mysql.connector.connect(host=host,
                                               user=user,
                                               password=password,
                                               database=database)
        self.cursor = self.db_conn.cursor(buffered=True)

    def execute_queries(self, queries: List[str], queries_params: List[dict]) -> None:
        """Convenience function for running one or more queries as a single transaction.

        The queries are committed once all have passed. If any of them fails, everything executed
        so far is rolled back, so that a later successful call cannot commit a half-written batch.

        Args:
            queries (List[str]): List of queries to execute.
            queries_params (List[dict]): List of queries params to execute together with queries.
                                         Must be the same length as queries. A falsy element
                                         (e.g. `None`) runs the query without parameters.

        Raises:
            ValueError: Error is raised if the lengths of two passed lists do not match.
        """
        if len(queries) != len(queries_params):
            raise ValueError('The lengths of `queries` and `queries_params` do not match.')
        try:
            for query, query_params in zip(queries, queries_params):
                if query_params:
                    self.cursor.execute(query, query_params)
                else:
                    self.cursor.execute(query)
            self.db_conn.commit()
        except Exception:
            # Discard the statements that already ran, then let the caller see the original error.
            self.db_conn.rollback()
            raise

    def insert_user(self, username: str) -> None:
        """Insert user to User table in DB.

        Args:
            username (str): Username of the user in MAL DB.
        """
        query = 'INSERT INTO User (username) VALUES (%(username)s);'
        self.execute_queries(queries=[query], queries_params=[{'username': username}])

    def insert_users_animelist_entry(self, user_id: int, list_entry: dict) -> None:
        """Inserts an entry of users animelist to `List_Entry` table.

        If the anime of the entry is not yet in the `Anime` table, its details are fetched from
        the API and inserted first. `list_entry` itself is not modified.

        Args:
            user_id (int): ID of user in the database.
            list_entry (dict): List entry data in the format returned by MAL API: https://myanimelist.net/apiconfig/references/api/v2#operation/users_user_id_animelist_get.
        """
        # Copy instead of aliasing: the caller keeps an intact entry and can retry it.
        # title not needed for this table.
        query_values = {key: val for key, val in list_entry.items() if key != 'title'}
        # Change anime_id key to comply with table field name.
        anime_id = query_values.pop('id')
        query_values['anime_id'] = anime_id
        query_values['user_id'] = user_id
        # Check if anime already in db, if not, first add it.
        self.cursor.execute('SELECT id FROM Anime WHERE id = %(anime_id)s;', {'anime_id': anime_id})
        if self.cursor.fetchone() is None:
            self.insert_anime(anime_details=self.mal_api.get_anime_details(anime_id))

        query = (
            'INSERT INTO List_Entry (anime_id, user_id, status, score,'
            'num_episodes_watched, updated_at, start_date, finish_date) '
            'VALUES (%(anime_id)s, %(user_id)s, %(status)s, %(score)s,'
            '%(num_episodes_watched)s, %(updated_at)s, %(start_date)s, %(finish_date)s);'
        )
        self.execute_queries(queries=[query], queries_params=[query_values])

    def _get_queries_insert_missing(self, table: str, rows: List[dict]) -> Tuple[List[str], List[dict]]:
        """Builds `(id, name)` insert queries for the rows whose `id` is not yet stored in `table`.

        Args:
            table (str): Table name. Must be a trusted constant, it is placed into the SQL text.
            rows (List[dict]): Dictionaries with keys `id` and `name`.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        queries = []
        queries_params = []
        for row in rows:
            self.cursor.execute(f'SELECT id FROM {table} WHERE id = %(id)s;', {'id': row['id']})
            if self.cursor.fetchone() is None:
                queries.append(f'INSERT INTO {table} (id, name) VALUES (%(id)s, %(name)s);')
                queries_params.append(row)
        return queries, queries_params

    def get_queries_insert_studios(self, studios: List[dict]) -> Tuple[List[str], List[dict]]:
        """Checks if studios are already in the db. If not returns an insert query for each studio not present in the db.

        Args:
            studios (List[dict]): List of dictionaries with studios, with keys `id` and `name`.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        return self._get_queries_insert_missing(table='Studio', rows=studios)

    def get_queries_insert_anime_studios(self, anime_id: int, studios: List[dict]) -> Tuple[List[str], List[dict]]:
        """Assuming that anime and all studios from arguments are already present in the db.
           For each studio add query together with parameters that will insert the relationships between Anime and Studio
           to Anime_Studio table.

        Args:
            anime_id (int): id of the Anime.
            studios (List[dict]): List of studios that are related to the Anime. Each dictionary element in the format of MAL API.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        queries = []
        queries_params = []
        for studio in studios:
            queries.append('INSERT INTO Anime_Studio (anime_id, studio_id) VALUES (%(anime_id)s, %(studio_id)s);')
            # Create a dictionary with keys compliant with db table.
            queries_params.append({
                'anime_id': anime_id,
                'studio_id': studio['id']
            })
        return queries, queries_params

    def get_queries_insert_genres(self, genres: List[dict]) -> Tuple[List[str], List[dict]]:
        """Checks if genres are already in the db. If not returns an insert query for each genre not present in the db.

        Args:
            genres (List[dict]): List of dictionaries with genres, with keys `id` and `name`.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        return self._get_queries_insert_missing(table='Genre', rows=genres)

    def get_queries_insert_anime_genres(self, anime_id: int, genres: List[dict]) -> Tuple[List[str], List[dict]]:
        """Assuming that anime and all genres from arguments are already present in the db.
           For each genre add query together with parameters that will insert the relationships between Anime and Genre
           to Anime_Genre table.

        Args:
            anime_id (int): id of the Anime.
            genres (List[dict]): List of genres of the Anime. Each dictionary element in the format of MAL API.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        queries = []
        queries_params = []
        for genre in genres:
            queries.append('INSERT INTO Anime_Genre (anime_id, genre_id) VALUES (%(anime_id)s, %(genre_id)s);')
            # Create a dictionary with keys compliant with db table.
            queries_params.append({
                'anime_id': anime_id,
                'genre_id': genre['id']
            })
        return queries, queries_params

    def get_queries_insert_related_anime(self, anime_id: int, related_anime: List[dict]) -> Tuple[List[str], List[dict]]:
        """Generates and returns queries for inserting anime related to anime supplied in `anime_id`.

        Args:
            anime_id (int): id of the anime other anime are related to.
            related_anime (List[dict]): Properties of the related anime. Format as in MAL API.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        queries = []
        queries_params = []
        for related in related_anime:
            queries.append(
                'INSERT INTO Related_Anime (anime_id, related_anime_id, relation_prequel) '
                'VALUES (%(anime_id)s, %(related_anime_id)s, %(relation_prequel)s);'
            )
            queries_params.append({
                'anime_id': anime_id,
                'related_anime_id': related['node']['id'],
                # Stored as a 0/1 flag: only the prequel relation is distinguished.
                'relation_prequel': 1 if related['relation_type'] == 'prequel' else 0
            })
        return queries, queries_params

    def get_queries_insert_mal_recommendations(self, anime_id: int, recommendations: List[dict]) -> Tuple[List[str], List[dict]]:
        """Generates insert queries for inserting anime recommended to anime specified by `anime_id` to MAL_Anime_Recommendation table.

        Args:
            anime_id (int): id of anime other anime are recommended to.
            recommendations (List[dict]): Properties of recommended anime. Format as in MAL API.

        Returns:
            Tuple[List[str], List[dict]]: Insert queries and query parameters for each query.
        """
        queries = []
        queries_params = []
        for recommendation in recommendations:
            queries.append(
                'INSERT INTO MAL_Anime_Recommendation (anime_id, recommended_anime_id, num_recommendations) '
                'VALUES (%(anime_id)s, %(recommended_anime_id)s, %(num_recommendations)s);'
            )
            queries_params.append({
                'anime_id': anime_id,
                'recommended_anime_id': recommendation['node']['id'],
                'num_recommendations': recommendation['num_recommendations']
            })
        return queries, queries_params

    def validate_and_parse_date(self, date: str) -> Union[None, str]:
        """Validates if date in string format matches expected format and returns the `date` argument if it does or `None` otherwise.

        Args:
            date (str): Date to be validated and parsed. Expected format is `%Y-%m-%d`.

        Returns:
            Union[None, str]: `date` argument if it matches the expected format and `None` otherwise or when `None` is passed.
        """
        if date is None:
            return None
        try:
            datetime.strptime(date, '%Y-%m-%d')
        except ValueError:
            # If time format is not as specified (e.g. only a year) return None to be inserted into db.
            return None
        return date

    def insert_anime(self, anime_details: dict) -> None:
        """First generate queries as strings (together with queries params) in an appropriate order to ensure
           no foreign key constraints issues. Second pass those to the `execute_queries` method.

        Every column of `ANIME_TABLE_FIELDS` gets a value; fields missing from `anime_details`
        are inserted as `None`. `anime_details` itself is not modified.

        Args:
            anime_details (dict): Details of an anime as in MAL API.

        Raises:
            KeyError: If `anime_details` has no `id`.
        """
        # Shallow copy: only top-level keys are added below, so the caller's dict stays intact.
        details = dict(anime_details)
        anime_id = details['id']

        # season is unknown for some anime.
        season = details.get('start_season') or {}
        details['season_year'] = season.get('year')
        details['season'] = season.get('season')

        # broadcast (or just its start time) is unknown for some anime.
        broadcast = details.get('broadcast') or {}
        details['broadcast_day_of_week'] = broadcast.get('day_of_the_week')
        details['broadcast_start_time'] = broadcast.get('start_time')

        # Flatten statistics.status into status_<name> columns.
        lists_statuses = (details.get('statistics') or {}).get('status') or {}
        details.update({'status_' + key: val for key, val in lists_statuses.items()})

        # Some anime do not have one or more of these fields.
        details['end_date'] = self.validate_and_parse_date(details.get('end_date'))
        details['start_date'] = self.validate_and_parse_date(details.get('start_date'))
        details['anime_rank'] = details.get('rank')
        details['mean_score'] = details.get('mean')

        # The query names every table field, so every field needs a parameter - None when absent.
        anime_fields = {field: details.get(field) for field in self.ANIME_TABLE_FIELDS}
        columns = ', '.join(self.ANIME_TABLE_FIELDS)
        placeholders = ', '.join(f'%({field})s' for field in self.ANIME_TABLE_FIELDS)
        anime_query = f'INSERT INTO Anime ({columns}) VALUES ({placeholders});'

        studios = details.get('studios') or []
        genres = details.get('genres') or []
        related_anime = details.get('related_anime') or []
        recommendations = details.get('recommendations') or []

        # Batches in the order of execution: the anime itself, then studios and genres
        # (only those not in db yet) followed by their relationships, then related
        # and recommended anime.
        batches = [
            ([anime_query], [anime_fields]),
            self.get_queries_insert_studios(studios=studios),
            self.get_queries_insert_anime_studios(anime_id=anime_id, studios=studios),
            self.get_queries_insert_genres(genres=genres),
            self.get_queries_insert_anime_genres(anime_id=anime_id, genres=genres),
            self.get_queries_insert_related_anime(anime_id=anime_id, related_anime=related_anime),
            self.get_queries_insert_mal_recommendations(anime_id=anime_id, recommendations=recommendations),
        ]
        queries = []
        queries_params = []
        for batch_queries, batch_queries_params in batches:
            queries.extend(batch_queries)
            queries_params.extend(batch_queries_params)
        # Finally execute the queries.
        self.execute_queries(queries=queries, queries_params=queries_params)

    def populate_users_animelist(self, username: str) -> None:
        """Populate animelist of `username` by adding entries to List_Entry table.

        Args:
            username (str): Name of the user whose animelist is to be populated in the db.
                            The user must already be present in the User table.

        Raises:
            ValueError: If the animelist is not empty and `username` is not in the User table.
        """
        users_anime_list = self.mal_api.get_users_animelist(username=username)
        if not users_anime_list:
            return
        # The user id does not change between entries, so it is looked up once.
        self.cursor.execute('SELECT id FROM User WHERE username = %(username)s;', {'username': username})
        user_row = self.cursor.fetchone()
        if user_row is None:
            raise ValueError(f'User {username!r} is not present in the User table.')
        user_id = user_row[0]
        for entry in tqdm(users_anime_list, desc='Populating users animelist', leave=False):
            try:
                self.insert_users_animelist_entry(user_id=user_id, list_entry=entry)
            except IntegrityError:
                # If entry is already in the db just pass.
                # Makes it easier to rerun the same user in the event of a failure.
                pass

    def populate_users(self, n_pages: int) -> None:
        """Retrieves username from MAL users.php page. Each page contains 20 users and the number
           of pages to be retrieved is given as the argument `n_pages`.

        Args:
            n_pages (int): Number of get request to users.php page to retrieve.
        """
        for _ in tqdm(range(n_pages), desc='Populating Users table'):
            # Wait between requests so the site is not hammered.
            sleep(1)
            users_page = requests.get(self.USERS_PAGE_URL, timeout=self.REQUEST_TIMEOUT)
            users_soup = BeautifulSoup(users_page.text, 'html.parser')
            user_names = [user_soup.find('a').get_text() for user_soup in users_soup.find('td').find_all('td')]
            for username in user_names:
                try:
                    self.insert_user(username)
                except IntegrityError:
                    # Same policy as for animelist entries: a user that is already stored is skipped.
                    # NOTE(review): presumably User.username is unique - confirm against the schema.
                    pass

In [8]:
# Close the connection left over from a previous run of this cell, if there is one.
# On a fresh kernel `ani_db` does not exist yet, which must not stop the cell.
try:
    ani_db.db_conn.close()
except NameError:
    pass
# The database password is a secret: read it from the environment instead of committing it
# to the notebook. Set ANIME_DB_PASSWORD before starting the kernel.
ani_db = AnimeDatabase(host='localhost', user='root', password=os.environ['ANIME_DB_PASSWORD'], database='Anime', mal_api=mal_api)

In [11]:
# Fetch the animelist of one user from the API and store it in the List_Entry table.
# The user id is read from the User table, so the user has to be inserted there first.
ani_db.populate_users_animelist(username='tajbinshawon')

Populating users animelist:   0%|          | 0/103 [00:00<?, ?it/s]

In [10]:
# Request the users.php page 3 times and insert the usernames found on it into the User table.
ani_db.populate_users(n_pages=3)

Populating Users table:   0%|          | 0/3 [00:00<?, ?it/s]

In [55]:
# Inspect a single parsed animelist entry (index 25) to check the shape returned by the wrapper.
mal_api.get_users_animelist(username='Wellingtonv7')[25]

{'id': 54714,
 'title': 'Kimi no Koto ga Daidaidaidaidaisuki na 100-nin no Kanojo',
 'status': 'plan_to_watch',
 'score': 0,
 'num_episodes_watched': 0,
 'updated_at': '2023-04-03T01:07:34+00:00',
 'start_date': None,
 'finish_date': None}

In [60]:
# Inspect the details payload of one anime. In the output below the `rank`, `source`
# and `broadcast` keys are absent, so not every requested field is always returned.
mal_api.get_anime_details(3912)

{'id': 3912,
 'title': 'Rejuvenation',
 'main_picture': {'medium': 'https://cdn.myanimelist.net/images/anime/1737/112618.jpg',
  'large': 'https://cdn.myanimelist.net/images/anime/1737/112618l.jpg'},
 'start_date': '2000-01-28',
 'end_date': '2000-02-25',
 'mean': 5.22,
 'popularity': 11108,
 'num_list_users': 1686,
 'num_scoring_users': 533,
 'nsfw': 'gray',
 'media_type': 'ova',
 'status': 'finished_airing',
 'genres': [{'id': 12, 'name': 'Hentai'}],
 'num_episodes': 2,
 'start_season': {'year': 2000, 'season': 'winter'},
 'average_episode_duration': 1620,
 'rating': 'rx',
 'related_anime': [],
 'recommendations': [],
 'studios': [{'id': 177, 'name': 'Studio Kyuuma'}],
 'statistics': {'status': {'watching': '136',
   'completed': '817',
   'on_hold': '73',
   'dropped': '111',
   'plan_to_watch': '548'},
  'num_list_users': 1685}}

In [22]:
# TODO here
# Check how strptime reacts to a missing date: passing None raises TypeError (see output).

datetime.strptime(None, '%Y-%m-%d')

TypeError: strptime() argument 1 must be str, not None

In [21]:
# A partial date (year only) does not match the expected format and raises ValueError (see output).
datetime.strptime('2018', '%Y-%m-%d')

ValueError: time data '2018' does not match format '%Y-%m-%d'

In [6]:
# A complete year-month-day date matches the format and parses successfully.
datetime.strptime('2018-10-10', '%Y-%m-%d')

datetime.datetime(2018, 10, 10, 0, 0)

In [15]:
# Inspect the details payload of another anime; in the output below the `source` key is present.
mal_api.get_anime_details(5315)

{'id': 5315,
 'title': 'Ero Manga Mitai na Koi Shiyo',
 'main_picture': {'medium': 'https://cdn.myanimelist.net/images/anime/11/85126.jpg',
  'large': 'https://cdn.myanimelist.net/images/anime/11/85126l.jpg'},
 'start_date': '2008-06-27',
 'end_date': '2008-09-26',
 'mean': 6.85,
 'popularity': 6854,
 'num_list_users': 7267,
 'num_scoring_users': 2615,
 'nsfw': 'gray',
 'media_type': 'ova',
 'status': 'finished_airing',
 'genres': [{'id': 12, 'name': 'Hentai'}],
 'num_episodes': 2,
 'start_season': {'year': 2008, 'season': 'spring'},
 'source': 'manga',
 'average_episode_duration': 1652,
 'rating': 'rx',
 'related_anime': [],
 'recommendations': [],
 'studios': [{'id': 1531, 'name': 'Flavors Soft'}],
 'statistics': {'status': {'watching': '601',
   'completed': '3815',
   'on_hold': '355',
   'dropped': '279',
   'plan_to_watch': '2216'},
  'num_list_users': 7266}}

# Scraping

I considered collecting usernames from the member lists of the top 100 anime. However, the member list of an anime only contains the 7500 users who have most recently updated that anime on their list. Therefore, I will only use the random sample of recently active users from the `users.php` page.

## Scraping usernames by users page with recently active users

In [20]:

user_names_filename = 'user_names.txt'

# TODO add loop with number of usernames to scrape
# Download and parse the page before opening the file, so a failed request leaves the file untouched.
# The URL comes from AnimeDatabase, so the cell does not depend on a variable from another cell.
users_page = requests.get(AnimeDatabase.USERS_PAGE_URL, timeout=30)
users_soup = BeautifulSoup(users_page.text, 'html.parser')
user_names = [user_soup.find('a').get_text() for user_soup in users_soup.find('td').find_all('td')]

# Append, one username per line, so repeated runs accumulate usernames.
with open(user_names_filename, 'a', encoding='utf-8') as usernames_file:
    for username in user_names:
        print(username, file=usernames_file)

## Collecting users anime lists