# Scraping Wowhead for data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

### Example response is messy

In [2]:
# Fetch a single quest page as a worked example.
# timeout: without one, requests waits indefinitely on a stalled connection.
response = requests.get('https://www.wowhead.com/quest=2', timeout=30)
# Fail here with a clear HTTP error rather than later on a missing element.
response.raise_for_status()

In [3]:
# Show the raw HTML body of the response as a single string.
response.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="initial-scale=1">\n    <script>var _sf_startpt=(new Date()).getTime();</script>\n    <title>Sharptalon\'s Claw - Quest - World of Warcraft</title>\n\n    <meta name="description" content="Bring Sharptalon&apos;s Claw to Senani Thunderheart at Silverwind Refuge. A level 7 Ashenvale Quest. +250 reputation with Orgrimmar. Added in Classic World of Warcraft.">\n\n    <meta name="google-site-verification" content="SScjPPnjqJ0lGTlZeVs9x0D3_jmctKZ-6nE4cvfGL00">\n    <meta property="twitter:title" content="Sharptalon&apos;s Claw">\n    <meta property="twitter:card" content="summary_large_image">\n    <meta property="twitter:image" content="https://wow.zamimg.com/uploads/screenshots/normal/912483-sharptalons-claw.jpg">\n    <meta property="twitter:imageUrl" content="https://wow.zamimg.com/uploads/screenshots/normal/912483-sharptalons-claw.jpg">\n    <meta property="twitter:site:id" content

##### Use BeautifulSoup for a nicer output

In [4]:
# Parse the HTML. The parser is named explicitly so bs4 does not guess one
# from whatever happens to be installed (which also triggers a warning).
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Echo the parsed BeautifulSoup object so the notebook renders its markup.
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="initial-scale=1" name="viewport"/>
<script>var _sf_startpt=(new Date()).getTime();</script>
<title>Sharptalon's Claw - Quest - World of Warcraft</title>
<meta content="Bring Sharptalon's Claw to Senani Thunderheart at Silverwind Refuge. A level 7 Ashenvale Quest. +250 reputation with Orgrimmar. Added in Classic World of Warcraft." name="description"/>
<meta content="SScjPPnjqJ0lGTlZeVs9x0D3_jmctKZ-6nE4cvfGL00" name="google-site-verification"/>
<meta content="Sharptalon's Claw" property="twitter:title"/>
<meta content="summary_large_image" property="twitter:card"/>
<meta content="https://wow.zamimg.com/uploads/screenshots/normal/912483-sharptalons-claw.jpg" property="twitter:image"/>
<meta content="https://wow.zamimg.com/uploads/screenshots/normal/912483-sharptalons-claw.jpg" property="twitter:imageUrl"/>
<meta content="17258481" property="twitter:site:id"/>
<meta content="@Wowhead" property="twitter:site"/>


##### Use the following HTML element IDs to extract the progress and completion text

In [6]:
# Text of the element whose HTML id marks the quest's "progress" dialogue.
soup.find(id='lknlksndgg-progress').get_text()

'What have you there, <class>? Could it be....?'

In [7]:
# Text of the element whose HTML id marks the quest's "completion" dialogue.
soup.find(id='lknlksndgg-completion').get_text()

'You have slain the beast?I owe you a great debt, friend. That beast terrorized many over the years, but its death holds special meaning to me.You see, recently my brother fell victim to these very claws... I shall sleep well now, knowing that he has been avenged.'

##### Add some structure

In [9]:
# Fetch one quest page and pull out both text snippets in a single cell.
# timeout: without one, requests waits indefinitely on a stalled connection.
response = requests.get('https://www.wowhead.com/quest=2', timeout=30)
# Fail here with a clear HTTP error rather than on a missing element below.
response.raise_for_status()
# Explicit parser so bs4 does not guess one from what is installed.
soup = BeautifulSoup(response.text, 'html.parser')
prog = soup.find(id='lknlksndgg-progress').text
comp = soup.find(id='lknlksndgg-completion').text
print('Progress: '+prog, '\nCompletion: '+comp)


Progress: What have you there, <class>? Could it be....? 
Completion: You have slain the beast?I owe you a great debt, friend. That beast terrorized many over the years, but its death holds special meaning to me.You see, recently my brother fell victim to these very claws... I shall sleep well now, knowing that he has been avenged.


### Bring it all together: scrape quest IDs 1–78,000 in 78 batches of 1,000, writing two CSV files per batch

In [6]:
# Scrape configuration -- tune these instead of editing the loop below.
N_BATCHES = 78            # number of batches; each batch writes its own pair of CSV files
BATCH_SIZE = 1000         # quest IDs per batch, so IDs 1..N_BATCHES*BATCH_SIZE are visited
REQUEST_DELAY_S = 0.2     # pause before every request to avoid being rate limited
REQUEST_TIMEOUT_S = 30    # seconds before a stalled request is abandoned
QUEST_URL = 'https://www.wowhead.com/quest={}'
PROGRESS_ID = 'lknlksndgg-progress'      # HTML id of the progress-text element
COMPLETION_ID = 'lknlksndgg-completion'  # HTML id of the completion-text element


def _element_text(soup, element_id):
    """Return the text of the element with HTML id `element_id`, or '' if it is absent."""
    element = soup.find(id=element_id)
    return '' if element is None else element.text


failed_ids = []  # quest IDs whose request raised a network error, kept for a later retry

# A Session reuses the connection to the host across requests instead of
# opening a new one for every quest page.
with requests.Session() as session:
    for j in range(N_BATCHES):
        quest_ids = []   # IDs in this batch that answered with HTTP 200
        quest_dict = {}  # quest ID -> {'Progress': ..., 'Completion': ...}
        first_id = BATCH_SIZE * j + 1
        # Batches cover (1..1000), (1001..2000), (2001..3000), ...
        for i in range(first_id, first_id + BATCH_SIZE):
            time.sleep(REQUEST_DELAY_S)
            try:
                response = session.get(QUEST_URL.format(i), timeout=REQUEST_TIMEOUT_S)
            except requests.exceptions.RequestException as err:
                # One bad request must not abort the run and discard the
                # batch collected so far; log it and move on.
                print(f'Request failed with j = {j} and i = {i}: {err!r}')
                failed_ids.append(i)
                continue
            print(f'New response! Time: {datetime.now()}')  # keep track of every response
            print(f'With j = {j} and i = {i}')
            if response.status_code != 200:
                continue
            # Explicit parser so bs4 does not guess one from what is installed.
            soup = BeautifulSoup(response.text, 'html.parser')
            # A page may lack either element; store '' for the missing one.
            quest_dict[i] = {
                'Progress': _element_text(soup, PROGRESS_ID),
                'Completion': _element_text(soup, COMPLETION_ID),
            }
            quest_ids.append(i)
        # Write one pair of files per batch. The dict-of-dicts frame has one
        # column per quest ID and the rows 'Progress' / 'Completion'.
        pd.DataFrame(quest_ids).to_csv(f'quest_id_list_{j+1}.csv')
        pd.DataFrame(quest_dict).to_csv(f'quest_dict_list_{j+1}.csv')
        print('New CSV with Id '+str(j+1)+f' made at {datetime.now()}!')

if failed_ids:
    print(f'{len(failed_ids)} request(s) failed and were skipped: {failed_ids}')

New response! Time: 2023-08-19 17:04:31.563114
With j = 77 and i = 77001
New response! Time: 2023-08-19 17:04:32.812768
With j = 77 and i = 77002
New response! Time: 2023-08-19 17:04:34.334952
With j = 77 and i = 77003
New response! Time: 2023-08-19 17:04:35.556932
With j = 77 and i = 77004
New response! Time: 2023-08-19 17:04:36.910858
With j = 77 and i = 77005
New response! Time: 2023-08-19 17:04:38.334087
With j = 77 and i = 77006
New response! Time: 2023-08-19 17:04:39.967019
With j = 77 and i = 77007
New response! Time: 2023-08-19 17:04:41.612359
With j = 77 and i = 77008
New response! Time: 2023-08-19 17:04:42.999629
With j = 77 and i = 77009
New response! Time: 2023-08-19 17:04:44.466834
With j = 77 and i = 77010
New response! Time: 2023-08-19 17:04:45.854168
With j = 77 and i = 77011
New response! Time: 2023-08-19 17:04:48.565374
With j = 77 and i = 77012
New response! Time: 2023-08-19 17:04:50.672223
With j = 77 and i = 77013
New response! Time: 2023-08-19 17:04:52.119817
With