# Capstone Project Data Science (mdavap)
# Import Library

In [1]:
import sqlalchemy as db 
import pandas as pd
import os
import json
from dotenv import load_dotenv
from datetime import datetime

load_dotenv()

True

# Connect to database

In [2]:
# Connection string for the target database, read from the environment
# (load_dotenv() is called in the import cell before this runs).
DB_URL = os.getenv('DB_URL')
if not DB_URL:
    # os.getenv returns None for a missing variable; stop here with an
    # actionable message instead of handing None to create_engine.
    raise RuntimeError(
        "DB_URL is not set; define it in the environment or in a .env file."
    )

engine = db.create_engine(DB_URL)

# Import our dataset

In [3]:
# Load the raw dataset into memory. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of relying on the
# locale's default encoding (the text fields may contain non-ASCII characters).
with open('../dataset/raw/anime_jikan.json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Create tables
- anime
    - id (primary key) (integer)
    - mal_id (integer)
    - title (string)
    - title_english (string)
    - type (string)
    - source (string)
    - episodes (integer)
    - status (string)
    - airing (boolean)
    - rating (string)
    - score (float)
    - rank (integer)
    - scored_by (integer)
    - popularity (integer)
    - favorites (integer)
    - members (integer)
    - seasons (string)
    - year (integer)
    - start (datetime, parsed from an ISO 8601 string)
    - ending (datetime, parsed from an ISO 8601 string)
    - synopsis (string)

In [4]:
# Registry that collects every table definition in this notebook.
metadata_obj = db.MetaData()

# Non-key columns of the `anime` table as (name, type), in creation order.
_anime_fields = (
    ('mal_id', db.Integer),
    ('title', db.Text),
    ('title_english', db.Text),
    ('type', db.Text),
    ('source', db.Text),
    ('episodes', db.Integer),
    ('status', db.Text),
    ('airing', db.Boolean),
    ('rating', db.Text),
    ('score', db.Float),
    ('rank', db.Integer),
    ('scored_by', db.Integer),
    ('popularity', db.Integer),
    ('favorites', db.Integer),
    ('members', db.Integer),
    ('seasons', db.Text),
    ('year', db.Integer),
    ('start', db.DateTime),
    ('ending', db.DateTime),
    ('synopsis', db.Text),
)

# `id` is the primary key; the remaining columns are plain, unconstrained columns.
anime_table = db.Table(
    'anime',
    metadata_obj,
    db.Column('id', db.Integer, primary_key=True),
    *(db.Column(field_name, field_type) for field_name, field_type in _anime_fields),
)

- genre_name
    - id (primary key) (integer)
    - name (string)

- studio_name
    - id (primary key) (integer)
    - name (string)

In [5]:
def _name_lookup_table(table_name):
    """Define a lookup table on metadata_obj with an integer primary key `id` and a text `name`."""
    return db.Table(
        table_name,
        metadata_obj,
        db.Column('id', db.Integer, primary_key=True),
        db.Column('name', db.Text),
    )


# Both lookup tables share the same two-column layout.
genre_name_table = _name_lookup_table('genre_name')
studio_name_table = _name_lookup_table('studio_name')

- genres
    - id (primary key) (integer)
    - anime_id (foreign key -> anime.id) (integer)
    - genre_id (foreign key -> genre_name.id) (integer)

- studios
    - id (primary key) (integer)
    - anime_id (foreign key -> anime.id) (integer)
    - studio_id (foreign key -> studio_name.id) (integer)

In [6]:
def _anime_link_table(table_name, target_column, target_key):
    """Define a link table on metadata_obj joining `anime.id` to another table's key.

    table_name    -- name of the link table to create
    target_column -- name of the column holding the foreign key to the other table
    target_key    -- 'table.column' reference that `target_column` points at
    """
    return db.Table(
        table_name,
        metadata_obj,
        db.Column('id', db.Integer, primary_key=True),
        db.Column('anime_id', db.Integer, db.ForeignKey('anime.id')),
        db.Column(target_column, db.Integer, db.ForeignKey(target_key)),
    )


# anime <-> genre_name and anime <-> studio_name associations.
genres_table = _anime_link_table('genres', 'genre_id', 'genre_name.id')
studios_table = _anime_link_table('studios', 'studio_id', 'studio_name.id')

In [7]:
# Create, in the database behind `engine`, the tables registered on metadata_obj.
metadata_obj.create_all(engine)

# Exporting dataset to db

In [8]:
def _iso_or_null(value):
    """Parse an ISO 8601 string into a datetime; a missing value becomes SQL NULL."""
    return datetime.fromisoformat(value) if value else db.null()


# Distinct genre / studio names seen across the whole dataset.
genre_list = set()
studio_list = set()

# Inserted anime id -> list of genre / studio names attached to that anime.
anime_genres = {}
anime_studios = {}

total_anime = len(raw_data)
# Anime without a rank receive synthetic ranks counting up from the dataset size.
last_ranking = total_anime

print(f'Total anime: {total_anime}')

# A single connection and a single transaction for the whole load: this avoids
# opening and committing a connection per row, and a failure midway rolls the
# insert back instead of leaving a partially loaded table behind.
with engine.begin() as conn:
    for anime in raw_data:
        aired = anime['aired']

        # Year is missing for some entries; fall back to the airing start year.
        year = anime['year'] or aired['prop']['from']['year']

        rank = anime['rank']
        if not rank:
            rank = last_ranking
            last_ranking += 1

        anime_info = {
            'mal_id': anime['mal_id'],
            'title': anime['title'],
            'title_english': anime['title_english'] or db.null(),
            'type': anime['type'] or db.null(),
            'source': anime['source'],
            'episodes': anime['episodes'] or 0,
            'status': anime['status'],
            'airing': anime['airing'],
            'rating': anime['rating'] or 'no_rating',
            'score': anime['score'] or 0,
            'rank': rank,
            'scored_by': anime['scored_by'] or 0,
            'popularity': anime['popularity'] or 0,
            'favorites': anime['favorites'] or 0,
            'members': anime['members'] or 0,
            'seasons': anime['season'] or '',
            'year': year or db.null(),
            'start': _iso_or_null(aired['from']),
            'ending': _iso_or_null(aired['to']),
            'synopsis': anime['synopsis'] or '',
        }

        studio_names = [studio['name'] for studio in anime['studios']]
        genre_names = [genre['name'] for genre in anime['genres']]
        studio_list.update(studio_names)
        genre_list.update(genre_names)

        insert_statement = (
            anime_table.insert().values(**anime_info).returning(anime_table.c.id)
        )
        # Exactly one row is inserted, so exactly one id comes back.
        anime_id = conn.execute(insert_statement).scalar_one()

        anime_genres[anime_id] = genre_names
        anime_studios[anime_id] = studio_names

# Report the number of rows actually inserted rather than the input size.
print(f'Added anime: {len(anime_genres)}')

Total anime: 27970
Added anime: 27970


In [9]:
# Sanity check: anime_studios is keyed by inserted anime id, so its size is the number of distinct ids recorded.
len(anime_studios)

27970

In [10]:
# Distinct studio names collected from the `studios` entries of every anime.
studio_list

{'TYMOTE',
 'Success Corp.',
 'Pmats9 studio',
 'Khara',
 'Ekura Animal',
 'Larx Entertainment',
 'Sunny Side Up',
 'Group TAC',
 'Higashinaka Studio',
 'Vega Entertainment',
 'Kachigarasu',
 'Taikong Works',
 'Bibury Animation Studios',
 'LAN Studio',
 'Arms',
 'IKK Room',
 'Studio OX',
 'Studio Coa',
 'C&S Production',
 'Robot Communications',
 'J.C.F.',
 'Creatures Inc.',
 'Lide',
 'Frontier One',
 'studio NAGURI',
 '2:10 Animation',
 'Chaos Project',
 'Tokyo Kids',
 '10Gauge',
 'Strawberry Meets Pictures',
 'Iyasakadou Film',
 'Manglobe',
 'Studio Dolphin Night',
 'Movic',
 'Joker Films',
 'Schoolzone',
 'Aurum Production',
 'Shindeban Film',
 'KWANED',
 'Shuka',
 'Annapuru',
 'D & D Pictures',
 'Infinity Vision',
 'Valkyria',
 'Studio DURIAN',
 'Magia Doraglier',
 'Ice Butter',
 "Rock'n Roll Mountain",
 'TCJ',
 '1IN',
 'Barnum Studio',
 'Ginga Teikoku',
 'Animation 501',
 'Ijigen Tokyo',
 'BeSTACK',
 'Wako Productions',
 'Cutie Bee',
 'Image House',
 'Ishikawa Pro',
 'Studio Harut

In [11]:
# Distinct genre names collected from the `genres` entries of every anime.
genre_list

{'Action',
 'Adventure',
 'Avant Garde',
 'Award Winning',
 'Boys Love',
 'Comedy',
 'Drama',
 'Ecchi',
 'Erotica',
 'Fantasy',
 'Girls Love',
 'Gourmet',
 'Hentai',
 'Horror',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Slice of Life',
 'Sports',
 'Supernatural',
 'Suspense'}

# Adding genres and studios to db

In [12]:
# One parameter set per distinct genre name.
genre_to_be_added = [{'name': genre} for genre in genre_list]

# Return the name together with the generated id. Later cells match ids to
# names by position, and the row order of a multi-row INSERT ... RETURNING is
# not guaranteed to follow the parameter order, so the pairing is rebuilt
# explicitly from the returned (id, name) rows.
insert_genre_statement = genre_name_table.insert().returning(
    genre_name_table.c.id, genre_name_table.c.name
)

with engine.connect() as conn:
    result = conn.execute(insert_genre_statement, genre_to_be_added)
    returned_genre_ids = {name: genre_id for genre_id, name in result.all()}
    conn.commit()

# id_genre[i] is the database id of genre_to_be_added[i], i.e. the i-th name
# yielded when iterating genre_list.
id_genre = [returned_genre_ids[params['name']] for params in genre_to_be_added]
print('Genre list has been added to db!')

Genre list has been added to db!


In [13]:
# One parameter set per distinct studio name.
studio_to_be_added = [{'name': studio} for studio in studio_list]

# Return the name together with the generated id. Later cells match ids to
# names by position, and the row order of a multi-row INSERT ... RETURNING is
# not guaranteed to follow the parameter order, so the pairing is rebuilt
# explicitly from the returned (id, name) rows.
insert_studio_statement = studio_name_table.insert().returning(
    studio_name_table.c.id, studio_name_table.c.name
)

with engine.connect() as conn:
    result = conn.execute(insert_studio_statement, studio_to_be_added)
    returned_studio_ids = {name: studio_id for studio_id, name in result.all()}
    conn.commit()

# id_studio[i] is the database id of studio_to_be_added[i], i.e. the i-th name
# yielded when iterating studio_list.
id_studio = [returned_studio_ids[params['name']] for params in studio_to_be_added]
print('Studio list has been added to db!')

Studio list has been added to db!


# Adding genre and studio relation to anime

In [14]:
# Turn the name collections into lists so names can be paired by position with
# id_genre / id_studio. Unpacking does not depend on the builtin name `list`,
# so this cell still works if that name has been rebound in the kernel.
genre_list = [*genre_list]
studio_list = [*studio_list]

In [15]:
# Genre name -> database id, pairing genre_list with id_genre by position.
# A dict lookup replaces a linear genre_list.index() scan per relation.
genre_id_by_name = dict(zip(genre_list, id_genre))

# One link row per (anime, genre) pair. The loop variables are deliberately not
# called `id` / `list`, which would shadow the builtins for the rest of the kernel.
genre_relations = [
    {'anime_id': anime_id, 'genre_id': genre_id_by_name[genre_name]}
    for anime_id, genre_names in anime_genres.items()
    for genre_name in genre_names
]

with engine.connect() as conn:
    insert_relation_statement = genres_table.insert()
    # Skip the statement entirely when there is nothing to insert.
    if genre_relations:
        conn.execute(insert_relation_statement, genre_relations)
        conn.commit()
    print('All genre relation has been added to db!')

All genre relation has been added to db!


In [16]:
# Studio name -> database id, pairing studio_list with id_studio by position.
# A dict lookup replaces a linear studio_list.index() scan per relation.
studio_id_by_name = dict(zip(studio_list, id_studio))

# One link row per (anime, studio) pair. The loop variables are deliberately not
# called `id` / `list`, which would shadow the builtins for the rest of the kernel.
studio_relations = [
    {'anime_id': anime_id, 'studio_id': studio_id_by_name[studio_name]}
    for anime_id, studio_names in anime_studios.items()
    for studio_name in studio_names
]

with engine.connect() as conn:
    insert_relation_statement = studios_table.insert()
    # Skip the statement entirely when there is nothing to insert.
    if studio_relations:
        conn.execute(insert_relation_statement, studio_relations)
        conn.commit()
    print('All studio relation has been added to db!')

All studio relation has been added to db!
