# Anime Scrapping ETL

## Imports

In [1]:
import requests

from bs4 import BeautifulSoup as bs

import pandas as pd
pd.set_option('display.max_columns', None)

import multiprocessing as mp

from tqdm.notebook import tqdm

import time

## CSV and Columns

In [2]:
anime = pd.read_csv('../data/anime.csv')

In [3]:
old_colnames = anime.columns
new_colnames = [name.lower().replace(' ', '_').replace('-','_') for name in old_colnames]

anime.rename(columns=dict(zip(old_colnames, new_colnames)), inplace=True)

del_cols = ['score_1','score_2','score_3','score_4','score_5','score_6','score_7','score_8','score_9','score_10','dropped','plan_to_watch','on_hold','completed','watching','favorites','popularity']

anime.drop(columns=del_cols, inplace=True)

## Wikipedia Function

In [4]:
anime.head(3)

Unnamed: 0,mal_id,name,score,genres,english_name,japanese_name,type,episodes,aired,premiered,producers,licensors,studios,source,duration,rating,ranked,members
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,1251960
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,273145
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,558913


In [5]:
def wiki_dir(title): # Entra una lista de titulos
    
    res = 'unknown'
    
    url_wiki = 'https://en.wikipedia.org/wiki/'
    
    url= url_wiki + title.replace(' ','_')
   
    # Intenta encontrar el link y parsearlo

    try:
        
        html = requests.get(url).content
        
        soup = bs(html, "html.parser")
        
        # Prueba class_='infobox'
        
        if str(soup.find('table', class_='infobox')) != 'None':
                                                                       
            sauce = soup.find('table', class_='infobox')

            noodle = sauce.find_all('tr')
            
        # Prueba class_='infobox infobox_v2'
            
        elif str(soup.find('table', class_='infobox infobox_v2')) != 'None': 
            
            sauce = soup.find('table', class_='infobox infobox_v2')

            noodle = sauce.find_all('tr')
        
        # No encuentra la tabla que buscamos
        
        else: 
            
            pass
        
        # Loop que busca el th 'directed by' en todas las filas de la tabla y devuelve el nombre del director.
        
        for row in noodle:

            if row.find('th') == None:
                
                continue
                
            elif row.find('th').text.lower().replace('\xa0','') == 'directedby':
                
                res = row.find('td').text
            
            else:
                
                continue
        
        # Si se completa el loop sin haber conseguido un director devuelve esto.
                
        return res
        
    except:
        
        return 'fallo'

In [6]:
from joblib import Parallel, delayed

In [7]:


paralelo = Parallel(n_jobs=4,  verbose=True)


anime['director'] = paralelo(delayed(wiki_dir)(title) for title in anime.english_name)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   23.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   47.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:  4.7min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 4992 tasks      | elapsed:  7.3min
[Parallel(n_jobs=4)]: Done 6042 tasks      | elapsed:  8.6min
[Parallel(n_jobs=4)]: Done 7192 tasks      | elapsed: 10.1min
[Parallel(n_jobs=4)]: Done 8442 tasks      | elapsed: 13.6min
[Parallel(n_jobs=4)]: Done 9792 tasks      | elapsed: 15.3min
[Parallel(n_jobs=4)]: Done 11242 tasks      | elapsed: 17.1mi

In [12]:
anime.director.unique()

array(['Shinichirō Watanabe', 'fallo', 'Satoshi Nishimura',
       'Shūkō Murase', 'Tatsuya Nagamine', 'Li Yun Chan', 'unknown',
       'Hiroshi Kōjina', 'Shinji Takamatsu', 'Takayuki Inagaki',
       'Hiroshi Hamasaki', 'Tomohiro Hirata', 'Yasunao Aoki',
       '\nHideaki Anno\nMasayuki (assistant)\nKazuya Tsurumaki (assistant)\n',
       'Kōichi Mashimo', 'Hiroaki Gohda', 'Kiyoko Sayama',
       'Hiroshi Nishikiori', 'Masami Shimoda', 'Koichi Ohata',
       'Morio Asaka', 'Yū Kō', 'Koji YoshikawaNobuyoshi Habara',
       'Iku Suzuki', 'Shinichi Omata[a]', 'Katsuichi Nakayama',
       'Takahiro Omori', 'Takayuki Hamana', 'Keizou Kusakawa',
       'Naoto Hosoda', 'Yoshikazu Yasuhiko', 'Yoshiyuki Tomino',
       'Masashi Ikeda', 'Mitsuo Fukuda', 'Yasuhiro Imagawa',
       'Yukio Takahashi', 'Masakazu Obara', 'Hatsuki Tsuji',
       'Hajime Kamegaki', 'Shigeyasu Yamauchi', 'Yuji Mutoh',
       'Takeo Takahashi', 'Tatsuo Satō', 'Hiroaki Sakurai',
       'Yoshihide Ibata', 'Nanako Shimazak

In [11]:
anime

Unnamed: 0,mal_id,name,score,genres,english_name,japanese_name,type,episodes,aired,premiered,producers,licensors,studios,source,duration,rating,ranked,members,director
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,Bandai Visual,"Funimation, Bandai Entertainment",Sunrise,Original,24 min. per ep.,R - 17+ (violence & profanity),28.0,1251960,Shinichirō Watanabe
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,"Sunrise, Bandai Visual",Sony Pictures Entertainment,Bones,Original,1 hr. 55 min.,R - 17+ (violence & profanity),159.0,273145,fallo
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,Victor Entertainment,"Funimation, Geneon Entertainment USA",Madhouse,Manga,24 min. per ep.,PG-13 - Teens 13 or older,266.0,558913,Satoshi Nishimura
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,"TV Tokyo, Bandai Visual, Dentsu, Victor Entert...","Funimation, Bandai Entertainment",Sunrise,Original,25 min. per ep.,PG-13 - Teens 13 or older,2481.0,94683,Shūkō Murase
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,"TV Tokyo, Dentsu",Unknown,Toei Animation,Manga,23 min. per ep.,PG - Children,3710.0,13224,Tatsuya Nagamine
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,Unknown,Unknown,Unknown,Novel,Unknown,Unknown,Unknown,354,fallo
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,Kadokawa,Unknown,Passione,Manga,Unknown,Unknown,Unknown,7010,fallo
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,Unknown,Unknown,Unknown,Visual novel,Unknown,R - 17+ (violence & profanity),Unknown,11309,fallo
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,Kadokawa,Unknown,8bit,Manga,Unknown,PG-13 - Teens 13 or older,Unknown,1386,fallo


In [10]:
anime.to_csv('anime_2.csv', index=False)