In [12]:
import requests
from bs4 import BeautifulSoup
import re
import mysql.connector
from tqdm import tqdm_notebook as tqdm

# API tests

In [15]:
class MAL_API:

    USERS_ANIMELIST_URL = 'https://api.myanimelist.net/v2/users/{}/animelist?fields=list_status&limit=100'
    ANIME_DETAILS_URL = (
        'https://api.myanimelist.net/v2/anime/{}?fields=id,title,start_date,end_date,mean,rank,popularity,'
        'num_list_users,num_scoring_users,nsfw,media_type,status,genres,num_episodes,start_season,broadcast,'
        'source,average_episode_duration,rating,related_anime,recommendations,studios,statistics'
    )
    # Lists with keys from animelist dictionaries of variables to be extracted.
    list_entry_node_keys = ['id', 'title']
    list_entry_list_status_keys = ['status', 'score', 'num_episodes_watched', 'updated_at']
    def __init__(self, client_id: str) -> None:
        """Constructor that returns a MAL_API object with CLIENT_ID defined in headers
           and convenience functions for making calls to the API.

        Args:
            client_id (str): Authorization token.
        """
        self.CLIENT_ID = client_id
        self.api_headers = {
            'X-MAL-CLIENT-ID' : self.CLIENT_ID
        }

    # TODO thrash this - not used
    def get_anime_ranking(self, request_parameters='?ranking_type=all') -> dict:
        """_summary_

        Args:
            request_parameters (str, optional): _description_. Defaults to '?ranking_type=all'.

        Returns:
            dict: _description_
        """
        response = requests.get('https://api.myanimelist.net/v2/anime/ranking' + request_parameters, headers=self.api_headers)
        return response.json()
    
    def __parse_list_entry(self, entry: dict) -> dict:
        """Parse an entry of animelist. Select important variables and merge dictionaries.

        Args:
            entry (dict): Anime entry in a animelist from the API.

        Returns:
            dict: Parsed entry of animelist as a single dictionary.
        """
        node = entry['node']
        node_parsed = {key: node[key] for key in self.list_entry_node_keys}
        list_status = entry['list_status']
        list_status_parsed = {key: list_status[key] for key in self.list_entry_list_status_keys}
        # Not all entries have start and/or finished dates. Users have not started or finished some of the anime on their lists.
        # If they don't exist add them with `None` values.
        if 'start_date' in list_status.keys():
            list_status_parsed['start_date'] = list_status['start_date']
        else:
            list_status_parsed['start_date'] = None
        if 'finish_date' in list_status.keys():
            list_status_parsed['finish_date'] = list_status['finish_date']
        else:
            list_status_parsed['finish_date'] = None
        parsed_animelist = {**node_parsed, **list_status_parsed}
        return parsed_animelist

    def get_users_animelist(self, username: str) -> list:
        """Gets an animelist of a user by theirs username from the MAL API.

        Args:
            username (str): Username of the user, whose animelist will be retrieved.

        Returns:
            list: Each element of the list is a single anime entry in the animelist.
                  A single anime entry is a dictionary with selected values of the entry.
        """
        # End condition response['paging'] - no 'next' key in dictionary.
        animelist_url = self.USERS_ANIMELIST_URL.format(username)
        animelist = []
        end_of_list = False
        while not end_of_list:
            animelist_page = requests.get(animelist_url, headers=self.api_headers).json()
            # Add page to animelist list.
            animelist.extend([self.__parse_list_entry(anime_entry) for anime_entry in animelist_page['data']])
            # Change url to next page. Checking for end condition.
            paging = animelist_page['paging']
            if 'next' in paging.keys():
                animelist_url = paging['next']
            else:
                end_of_list = True
        return animelist
    
    def get_anime_details(self, anime_id: int) -> dict:
        """Makes a call to the API and retrieves details about an anime by its ID.

        Args:
            anime_id (int): ID of the anime in MAL DB to be retrieved.

        Returns:
            dict: Details of an anime.
        """
        return requests.get(self.ANIME_DETAILS_URL.format(anime_id), headers=self.api_headers).json()
        

CLIENT_ID = '32df078edb7cabcd2eb77f026384e66b'
mal_api = MAL_API(client_id=CLIENT_ID)

In [37]:
class AnimeDatabase:

    USERS_PAGE_URL = 'https://myanimelist.net/users.php'

    def __init__(self, host: str, user: str, password: str, database: str) -> None:
        """Construct an object with a database connection object and cursor for querying the db.

        Args:
            host (str): Database host.
            user (str): Database username.
            password (str): Database user password.
            database (str): Database name.
        """
        self.db_conn = mysql.connector.connect(host=host,
                                                  user=user,
                                                  password=password,
                                                  database=database)
        self.cursor = self.db_conn.cursor(buffered=True)

    def prepare_str_for_query(self, arg: str) -> str:
        """Prepares 

        Args:
            arg (str): _description_

        Returns:
            str: _description_
        """
        if arg is None:
            return 'NULL'
        else:
            return '\'' + arg + '\''
    
    def insert_user(self, username: str) -> None:
        """Insert user to User table in DB.

        Args:
            username (str): Username of the user in MAL DB.
        """
        self.cursor.execute(f'INSERT INTO User (username) VALUES ({username});')
        self.db_conn.commit()
    
    def populate_users(self, n_pages: int) -> None:
        """Retrieves username from MAL users.php page. Each page contains 20 users and the number
           of pages to be retrieved is given as the argument `n_pages`.

        Args:
            n_pages (int): Number of get request to users.php page to retrieve.
        """
        for i in tqdm(range(n_pages)):
            users_soup = BeautifulSoup(requests.get(self.USERS_PAGE_URL).text, 'html.parser')
            user_names = [user_soup.find('a').get_text() for user_soup in users_soup.find('td').find_all('td')]
            for username in user_names:
                self.insert_user(username)
    
    def insert_users_animelist_entry(self, user_id: int, list_entry: dict) -> None:
        # TODO docstring
        query_values = list_entry
        query_values.pop('title')
        # Change anime_id key to comply with table filed name.
        # query_values['anime_id'] = query_values.pop('id')
        query_values['user_id'] = user_id
        # anime_id = list_entry['id']
        # status = list_entry['status']
        # score = list_entry['score']
        # num_episodes_watched = list_entry['num_episodes_watched']
        # updated_at = list_entry['updated_at']
        # , start_date, finish_date
        self.cursor.execute('INSERT INTO List_Entry (anime_id, user_id, status, score,'
                            'num_episodes_watched, updated_at, start_date, finish_date) '
                            'VALUES (%(anime_id)s, %(user_id)s, %(status)s, %(score)s'
                            '%(num_episodes_watched)s, %(updated_at)s, %(start_date)s, %(finish_date)s);',
                            params=query_values)
        self.db_conn.commit()
    # TODO insert anime and anime details. commit once all passed

In [18]:
TET = mal_api.get_users_animelist('mateuszvaper')
TET

[{'id': 52034,
  'title': '"Oshi no Ko"',
  'status': 'plan_to_watch',
  'score': 0,
  'num_episodes_watched': 0,
  'updated_at': '2023-05-03T19:07:18+00:00',
  'start_date': None,
  'finish_date': None},
 {'id': 38101,
  'title': '5-toubun no Hanayome',
  'status': 'completed',
  'score': 7,
  'num_episodes_watched': 12,
  'updated_at': '2021-09-18T17:13:52+00:00',
  'start_date': '2021-09-12',
  'finish_date': '2021-09-18'},
 {'id': 39783,
  'title': '5-toubun no Hanayome ∬',
  'status': 'completed',
  'score': 6,
  'num_episodes_watched': 12,
  'updated_at': '2021-09-26T19:12:28+00:00',
  'start_date': '2021-09-18',
  'finish_date': '2021-09-26'},
 {'id': 48548,
  'title': '5-toubun no Hanayome Movie',
  'status': 'completed',
  'score': 3,
  'num_episodes_watched': 1,
  'updated_at': '2023-04-16T18:36:04+00:00',
  'start_date': None,
  'finish_date': '2023-04-16'},
 {'id': 41457,
  'title': '86',
  'status': 'completed',
  'score': 7,
  'num_episodes_watched': 11,
  'updated_at': '

In [38]:
TET[0]
anime_db = AnimeDatabase(host='localhost', user='root', password='Poloi!@#888', database='Anime')
anime_db.insert_users_animelist_entry(user_id=69, list_entry=TET[0])

DataError: 1136 (21S01): Column count doesn't match value count at row 1

# Scraping

I was considering supplying usernames with all members from top 100 anime. However a list of members of an anime only contains 7500 users, that have recently updated a this anime on their list. Therefore I will only use the random sample from the `users.php` page of recently active users.

## Scraping usernames by users page with recently active users

In [20]:

user_names_filename = 'user_names.txt'

with open(user_names_filename, 'a') as usernames_file:
    # TODO add loop with number of usernames to scrape
    users_soup = BeautifulSoup(requests.get(users_page_url).text, 'html.parser')
    user_names = [user_soup.find('a').get_text() for user_soup in users_soup.find('td').find_all('td')]
    for username in user_names:
        print(username, file=usernames_file)

## Collecting users anime lists

In [12]:
mal_api.get_users_animelist('mateuszvaper')

[{'id': 52034,
  'title': '"Oshi no Ko"',
  'status': 'plan_to_watch',
  'score': 0,
  'num_episodes_watched': 0,
  'updated_at': '2023-05-03T19:07:18+00:00',
  'start_date': None,
  'finish_date': None},
 {'id': 38101,
  'title': '5-toubun no Hanayome',
  'status': 'completed',
  'score': 7,
  'num_episodes_watched': 12,
  'updated_at': '2021-09-18T17:13:52+00:00',
  'start_date': '2021-09-12',
  'finish_date': '2021-09-18'},
 {'id': 39783,
  'title': '5-toubun no Hanayome ∬',
  'status': 'completed',
  'score': 6,
  'num_episodes_watched': 12,
  'updated_at': '2021-09-26T19:12:28+00:00',
  'start_date': '2021-09-18',
  'finish_date': '2021-09-26'},
 {'id': 48548,
  'title': '5-toubun no Hanayome Movie',
  'status': 'completed',
  'score': 3,
  'num_episodes_watched': 1,
  'updated_at': '2023-04-16T18:36:04+00:00',
  'start_date': None,
  'finish_date': '2023-04-16'},
 {'id': 41457,
  'title': '86',
  'status': 'completed',
  'score': 7,
  'num_episodes_watched': 11,
  'updated_at': '