# Manga Data Science Project

In this project I scrape manga data from MyAnimeList for use in a future project.


#### Import Modules

In [1]:
from requests import get 
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
from time import time
from time import sleep
from random import randint
import requests
from warnings import warn

#### Planning
The functions that extract and clean the data from the webpage will first be defined and tested one step at a time.

Eventually a class will be created with these methods.




#### Get request

In [2]:
# GET request to MyAnimeList for the raw page of manga code 1.
# NOTE: despite its name, `html` is the Response object; the markup is `html.text`.
# A timeout is set so that a stalled connection cannot hang the notebook forever.
html = get('https://myanimelist.net/manga/1/', timeout=30)

####  Data Frame Structure
The data frame will use the column headings listed below.

These headings were obtained by looking at the webpage for MAL (MyAnimeList).

In [3]:
# Column headings of the final data frame, in the order they are stored.
column_head = [
    # title and format
    'English', 'Type', 'Volumes', 'Chapters',
    # publication
    'Status', 'Published',
    # classification
    'Genres', 'Themes', 'Demographic',
    # origin
    'Serialization', 'Authors',
    # community statistics
    'Score', 'Ranked', 'Popularity', 'Members', 'Favorites',
]

#### Create Beautiful Soup Object

In [4]:
# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(html.text, features='html.parser')

In [5]:
# Preview only the beginning of the document: printing the whole page floods
# the notebook with output and hides the narrative.
print(soup.prettify()[:2000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <link crossorigin="anonymous" href="//fonts.gstatic.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//fonts.googleapis.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//tags-cdn.deployads.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//www.googletagservices.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//www.googletagmanager.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//apis.google.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//pixel-sync.sitescout.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//pixel.tapad.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//c.deployads.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="//

#### Isolate the 'information' section of the page 
The relevant statistics of the information section are held in the elements with class = 'spaceit_pad'.

In [6]:
# Select every element carrying the "spaceit_pad" class and keep only its text.
info = [tag.text for tag in soup.select('.spaceit_pad  ')]
info

['Japanese: MONSTER',
 'English: Monster',
 'Type: Manga',
 'Volumes: 18\n',
 'Chapters: 162\n',
 'Status: Finished',
 'Published: Dec  5, 1994 to Dec  20, 2001',
 '\nGenres:\nAward Winning\n\n                Award Winning,                    Drama\n\n                Drama,                    Mystery\n\n                Mystery ',
 '\nThemes:\nAdult Cast\n\n                Adult Cast,                    Psychological\n\n                Psychological ',
 '\nDemographic:\nSeinen\n\n                Seinen ',
 'Serialization:\nBig Comic Original',
 'Authors:\nUrasawa, Naoki (Story & Art)',
 'Score: 9.131 (scored by 7734777,347 users)1 indicates a weighted score.',
 " Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367',
 '\nMain\n',
 '\nMain\n',
 '\nMain\n',
 '\nSupporting\n',
 '\nSupporting\n',
 '\nSupporting\n',
 '\nSupporting\n',
 '\nSupporting\n',
 '\nSupporting\n',
 '\nSupporting\n']

### Cleaning the data


#### Remove double whitespace and '\n'

In [7]:
# For each entry: halve double spaces, drop newlines, then trim both ends.
info = [
    raw.replace('  ', ' ').replace('\n', '').strip()
    for raw in info
]
info

['Japanese: MONSTER',
 'English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres:Award Winning        Award Winning,          Drama        Drama,          Mystery        Mystery',
 'Themes:Adult Cast        Adult Cast,          Psychological        Psychological',
 'Demographic:Seinen        Seinen',
 'Serialization:Big Comic Original',
 'Authors:Urasawa, Naoki (Story & Art)',
 'Score: 9.131 (scored by 7734777,347 users)1 indicates a weighted score.',
 "Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367',
 'Main',
 'Main',
 'Main',
 'Supporting',
 'Supporting',
 'Supporting',
 'Supporting',
 'Supporting',
 'Supporting',
 'Supporting']

#### Remove columns that are not relevant and add columns that are missing

Any columns that are missing from the data will be added and have a value of 0

In [8]:
relevant_columns = tuple(column_head)  # tuple of columns needed

# Remember the first line found for each wanted column. Looking lines up by
# column (instead of inserting placeholders by position) cannot run past the
# end of `info` when trailing statistics are missing, and cannot be thrown
# out of alignment by a repeated line.
first_match = {}
for line in info:
    for col in relevant_columns:
        if line.startswith(col + ':'):
            first_match.setdefault(col, line)
            break

# Exactly one entry per column, in column order; missing statistics get the value 0.
info = [first_match.get(col, col + ': 0') for col in relevant_columns]
info

['English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres:Award Winning        Award Winning,          Drama        Drama,          Mystery        Mystery',
 'Themes:Adult Cast        Adult Cast,          Psychological        Psychological',
 'Demographic:Seinen        Seinen',
 'Serialization:Big Comic Original',
 'Authors:Urasawa, Naoki (Story & Art)',
 'Score: 9.131 (scored by 7734777,347 users)1 indicates a weighted score.',
 "Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367']

#### Get rid of multiple whitespaces

In [9]:
# Raw strings are used so that "\s" reaches the regex engine untouched instead
# of being treated as an (invalid) string escape, which triggers a warning.
info = [re.sub(r'\s+', ' ', line) for line in info]  # collapse runs of whitespace
info = [re.sub(r'^\s', '', line) for line in info]   # remove a leading space
info

['English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres:Award Winning Award Winning, Drama Drama, Mystery Mystery',
 'Themes:Adult Cast Adult Cast, Psychological Psychological',
 'Demographic:Seinen Seinen',
 'Serialization:Big Comic Original',
 'Authors:Urasawa, Naoki (Story & Art)',
 'Score: 9.131 (scored by 7734777,347 users)1 indicates a weighted score.',
 "Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367']

#### Remove duplicate words from Genres, Themes, and Demographic

In [10]:
# Leave exactly one space after the label's colon to make string splitting
# easier later. Only the first colon is normalised, so placeholder rows such
# as 'Themes: 0' do not end up with a double space and colons inside a value
# are left alone.
for i in [6,7,8,9,10]:
    info[i] = re.sub(r':\s*', ': ', info[i], count=1)

# Then get rid of duplicate Genres, Themes and Demographic names.
# Every comma-separated entry holds one name written twice, so the words are
# de-duplicated per entry (dict keys keep first-seen order). De-duplicating
# across the whole line would strip a word shared by two different names.
for i in [6,7,8]:
    label, sep, names = info[i].partition(': ')
    if not sep:  # no "label: value" shape, nothing to clean
        continue
    entries = [' '.join(dict.fromkeys(entry.split())) for entry in names.split(',')]
    info[i] = label + ': ' + ' '.join(entry for entry in entries if entry)
info

['English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres: Award Winning Drama Mystery',
 'Themes: Adult Cast Psychological',
 'Demographic: Seinen',
 'Serialization: Big Comic Original',
 'Authors: Urasawa, Naoki (Story & Art)',
 'Score: 9.131 (scored by 7734777,347 users)1 indicates a weighted score.',
 "Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367']

To get rid of duplicate words, a dictionary is created whose keys are the words of the string (split on whitespace); because dictionary keys are unique and keep their insertion order, joining the keys back together with a space yields the text without repeats.

#### Clean up the score column
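The score seen in html.text is not the correct score: it carries one extra trailing digit (it appears to be the footnote marker from "1 indicates a weighted score" fused onto the number), so the last digit of the float must be removed.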

In [11]:
# Keep "Score: <number>" and drop the final digit. An explicit match check
# replaces the bare `except`, so only the "no score on the page" case falls
# back to N/A and unrelated errors are no longer hidden.
score_match = re.search(r'.+\d+\.\d+', info[11])
if score_match:
    info[11] = score_match.group(0)[:-1]  # remove the words and the trailing digit
else:  # If there is no score
    info[11] = 'Score: N/A'
info

['English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres: Award Winning Drama Mystery',
 'Themes: Adult Cast Psychological',
 'Demographic: Seinen',
 'Serialization: Big Comic Original',
 'Authors: Urasawa, Naoki (Story & Art)',
 'Score: 9.13',
 "Ranked: #52 2 based on the top manga page. Please note that 'R18+' titles are excluded.",
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367']

#### Clean up the Ranked column

In [12]:
# Remove the words from the ranked column and the last digit.
# An explicit match check replaces the bare `except`, so only the "no rank on
# the page" case falls back to N/A and unrelated errors are no longer hidden.
rank_match = re.search(r'^Ranked:\s\D\d+', info[12])
if rank_match:
    info[12] = rank_match.group(0)[:-1]
else:  # if there is no rank
    info[12] = 'Ranked: N/A'
info




['English: Monster',
 'Type: Manga',
 'Volumes: 18',
 'Chapters: 162',
 'Status: Finished',
 'Published: Dec 5, 1994 to Dec 20, 2001',
 'Genres: Award Winning Drama Mystery',
 'Themes: Adult Cast Psychological',
 'Demographic: Seinen',
 'Serialization: Big Comic Original',
 'Authors: Urasawa, Naoki (Story & Art)',
 'Score: 9.13',
 'Ranked: #5',
 'Popularity: #30',
 'Members: 198,723',
 'Favorites: 17,367']

#### The data has now been cleaned
The data must now be converted into a dictionary so it can be turned into a pandas DataFrame.

In [13]:
add_row = {}  # dictionary of the row that will be added to DF

# Split each line at its FIRST colon only: the text before it is the column
# name, everything after it is the value. Splitting on every ': ' would cut
# off values that themselves contain ': ' and would fail on a line that has
# no space after the colon.
for line in info:
    key, _, value = line.partition(':')
    add_row[key.strip()] = value.strip()

add_row


{'English': 'Monster',
 'Type': 'Manga',
 'Volumes': '18',
 'Chapters': '162',
 'Status': 'Finished',
 'Published': 'Dec 5, 1994 to Dec 20, 2001',
 'Genres': 'Award Winning Drama Mystery',
 'Themes': 'Adult Cast Psychological',
 'Demographic': 'Seinen',
 'Serialization': 'Big Comic Original',
 'Authors': 'Urasawa, Naoki (Story & Art)',
 'Score': '9.13',
 'Ranked': '#5',
 'Popularity': '#30',
 'Members': '198,723',
 'Favorites': '17,367'}

#### Show in Pandas Data Frame

In [14]:
# One-row frame built from the cleaned dictionary.
df_new_row = pd.DataFrame([add_row])

# Concatenate it onto an empty, string-typed frame whose columns are column_head.
df = pd.concat([pd.DataFrame(columns=column_head, dtype=str), df_new_row])

df

Unnamed: 0,English,Type,Volumes,Chapters,Status,Published,Genres,Themes,Demographic,Serialization,Authors,Score,Ranked,Popularity,Members,Favorites
0,Monster,Manga,18,162,Finished,"Dec 5, 1994 to Dec 20, 2001",Award Winning Drama Mystery,Adult Cast Psychological,Seinen,Big Comic Original,"Urasawa, Naoki (Story & Art)",9.13,#5,#30,198723,17367


### Automation
I have now successfully scraped the data of one webpage into the data frame; the next step is to automate the process for multiple pages.

#### Monitoring Request Time
If the request rate is not monitored, an IP ban might take place due to an inhuman number of requests.

In [15]:
# Dry run of the pacing logic: five simulated requests separated by a random
# 1-3 second pause, reporting the running request frequency after each one.
start_time = time()
request = 0

while request < 5:
    sleep(randint(1, 3))
    request += 1
    elapsed_time = time() - start_time
    frequency = request / elapsed_time
    print('Request: {}; Frequency: {} requests/s'.format(request, frequency))

Request: 1; Frequency: 0.4963345615756971 requests/s
Request: 2; Frequency: 0.3976621360740262 requests/s
Request: 3; Frequency: 0.42604245102720695 requests/s
Request: 4; Frequency: 0.39796982731576874 requests/s
Request: 5; Frequency: 0.38298721340931524 requests/s


#### Empty Data Frame
The final data frame where the manga information will be stored.

In [16]:
# Empty, string-typed data frame with one column per heading.
# NOTE(review): the scraping loop further down accumulates into FinalDataFrame, not df.
df = pd.DataFrame(columns=column_head, dtype=str)

#### URL Codes List
The URL for a manga is 'https://myanimelist.net/manga/' followed by a numeric code. I will scrape the first 10,000 codes.

In [17]:
# Numeric manga codes to request. As written this yields only the codes 0 and 1
# (start 0, stop 2 exclusive, step 1); raise the stop value to scrape more pages.
URL_NO = np.arange(0,2,1)

### Class Creation

Now I will take the previous data cleaning functions and create a class called AnimeData, which takes the URL as a parameter and has methods to clean the data.

In [18]:
class AnimeData:
    '''Download one MyAnimeList manga page and clean its statistics.

    The cleaning methods rewrite ``self.info`` in place and are meant to be
    called in this order:
    RelevantInfo -> RemoveNewLine -> RelevantColumns -> RemoveWhitespace ->
    RemoveDuplicateWords -> CleanScore -> CleanRank -> AddtoDF.
    '''

    def __init__(self,link):
        '''Request ``link`` and parse the response body.

        link -- full URL of the manga page.
        '''
        # A timeout keeps one stalled connection from blocking a long scrape forever.
        self.html = get(link, timeout=30)  # Response object (status in .status_code)
        self.info = BeautifulSoup((self.html).text,'html.parser')
        self.column_head = [
                      'English',
                     'Type',
                     'Volumes',
                     'Chapters',
                     'Status',
                     'Published',
                     'Genres',
                     'Themes',
                     'Demographic',
                     'Serialization',
                     'Authors',
                     'Score',
                     'Ranked',
                     'Popularity',
                     'Members',
                     'Favorites']

        self.relevant_columns = tuple(self.column_head)
        self.EmptyDataFrame = pd.DataFrame( columns = self.column_head, dtype = str) #Create Empty Data frame with columns


    def _index_of(self, column):
        '''Position of ``column`` in ``self.info`` once RelevantColumns has run.'''
        return self.column_head.index(column)


    def RelevantInfo(self):
        '''Extract the information statistics from the webpage.

        Replaces the parsed page held in ``self.info`` with the list of text
        of every element carrying the "spaceit_pad" class.
        '''
        self.info = [tag.text for tag in self.info.select('.spaceit_pad  ')]


    def RemoveNewLine(self):
        '''Halve double spaces, drop newlines and trim both ends of every line.'''
        self.info = [line.replace('  ',' ').replace('\n','').strip() for line in self.info]


    def RelevantColumns(self):
        '''Keep one line per wanted statistic, in ``column_head`` order.

        A statistic that is not on the page is added with a value of 0, so
        afterwards ``self.info`` always has exactly one entry per column.
        '''
        # Looking lines up by column (instead of inserting placeholders by
        # position) cannot index past the end of the list when trailing
        # statistics are missing, and a repeated line cannot shift the rest.
        first_match = {}
        for line in self.info:
            for col in self.column_head:
                if line.startswith(col + ':'):
                    first_match.setdefault(col, line)
                    break

        self.info = [first_match.get(col, col + ': 0') for col in self.column_head]


    def RemoveWhitespace(self):
        '''Collapse runs of whitespace to one space and drop a leading space.'''
        # Raw strings: "\s" must reach the regex engine, not the string parser.
        self.info = [re.sub(r'\s+',' ',line) for line in self.info] #spaces removed
        self.info = [re.sub(r'^\s','',line) for line in self.info] #start space removed


    def RemoveDuplicateWords(self):
        '''Normalise the "label: value" separator and remove doubled names.

        Genres, Themes and Demographic list every name twice; the repeats are
        removed and the names are joined with single spaces.
        '''
        spaced = ['Genres', 'Themes', 'Demographic', 'Serialization', 'Authors']
        doubled = ['Genres', 'Themes', 'Demographic']

        # Exactly one space after the label's colon. Only the first colon is
        # touched, so placeholder rows ('Themes: 0') keep a single space and
        # colons inside a value are left alone.
        for i in map(self._index_of, spaced):
            self.info[i] = re.sub(r':\s*', ': ', self.info[i], count=1)

        # Each comma-separated entry holds one name written twice, so words
        # are de-duplicated per entry (dict keys keep first-seen order).
        # De-duplicating across the whole line would strip a word shared by
        # two different names.
        for i in map(self._index_of, doubled):
            label, sep, names = self.info[i].partition(': ')
            if not sep:  # no "label: value" shape, nothing to clean
                continue
            entries = [' '.join(dict.fromkeys(entry.split())) for entry in names.split(',')]
            self.info[i] = label + ': ' + ' '.join(entry for entry in entries if entry)


    def CleanScore(self):
        '''Remove last digit from score, if there is no score , Score = N/A'''
        i = self._index_of('Score')
        # Explicit match check instead of a bare except: only "no score found"
        # falls back to N/A, unrelated errors are not hidden.
        match = re.search(r'.+\d+\.\d+', self.info[i])
        self.info[i] = match.group(0)[:-1] if match else 'Score: N/A'


    def CleanRank(self):
        '''Extract only the Ranked value from the string; Ranked = N/A if absent.'''
        i = self._index_of('Ranked')
        # The match ends on one digit too many, hence the [:-1].
        match = re.search(r'^Ranked:\s\D\d+', self.info[i])
        self.info[i] = match.group(0)[:-1] if match else 'Ranked: N/A'


    def AddtoDF(self,DF):
        '''Append the cleaned statistics to ``DF`` as one new row.

        DF -- data frame collecting the rows scraped so far.
        Returns a new data frame (``DF`` itself is not modified) whose rows
        are renumbered 0..n-1.
        '''
        add_row = {}  # dictionary of the row that will be added to DF

        # Split at the FIRST colon only, so values that contain ': ' are not
        # cut short and a line without a space after the colon cannot fail.
        for line in self.info:
            key, _, value = line.partition(':')
            add_row[key.strip()] = value.strip()

        df_new_row = pd.DataFrame([add_row]) # row that will be added
        # ignore_index avoids every appended row carrying the index label 0.
        FinalDataFrame = pd.concat([DF, df_new_row], ignore_index=True)
        return FinalDataFrame
          
        

The Final Data Frame of all the data that is being collected must be defined:

In [19]:
# Empty, string-typed accumulator with one column per heading.
FinalDataFrame = pd.DataFrame(columns=column_head, dtype=str)

In [None]:
start_time = time()
request = 0


for code in URL_NO:
    link = 'https://myanimelist.net/manga/'+str(code)
    manga = AnimeData(link)  # the constructor performs the HTTP request for this page

    #limit time in between requests
    sleep(randint(5,10))

    #monitor the request
    request += 1
    sleep(randint(1,3))
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s code: {}'.format(request,request/elapsed_time,code))

    # Reuse the response already downloaded by AnimeData instead of calling
    # get(link) again: the page used to be fetched up to three times, which
    # tripled the traffic that the pauses above are meant to keep low.
    status_code = manga.html.status_code

    if status_code == 200:  #OK Success Status
        manga.RelevantInfo()
        manga.RemoveNewLine()
        manga.RelevantColumns()
        manga.RemoveWhitespace()
        manga.RemoveDuplicateWords()
        manga.CleanScore()
        manga.CleanRank()
        FinalDataFrame = manga.AddtoDF(FinalDataFrame)


    else: #Webpage not found
        print(f' html status : {status_code} ; Code : {code} ; does not exist')
             

#### Save Data Frame as CSV


In [None]:
# Display the collected rows.
FinalDataFrame
# Saving to CSV is currently disabled; uncomment the next line to write the file.
#FinalDataFrame.to_csv('FourthAnime_Data.csv',index = False)