# Three investigators - part 1

A project for scraping and analysing data from a fan site on the audio book called '[The three investigators](https://en.wikipedia.org/wiki/Three_Investigators#Germany)'

**Part 1: Webscraping**

- seting up list of pages to scrape data from
- scraping tables on meta data, actors, content and ratings for each episode
- saving data outputs in csv files

Data is scraped from [Rocky-Beach.com](https://www.rocky-beach.com/)


# Setup

In [1]:
#python version used for this project
from platform import python_version
print(python_version())

3.7.5


In [2]:
# import modules [as specified in requirements.txt]
import pandas as pd
import numpy as np
from  urllib.request import urlopen #to read in html
from bs4 import BeautifulSoup #for web scraping
import re
import requests

# for file directories
import os

%matplotlib inline

In [3]:
#change directory to root folder
os.chdir("..")

# Scrape data of Rocky-beach.com

## Setup list of web pages to scrape

In [4]:
#format: https://www.rocky-beach.com/hoerspiel/folgen/001.html
html_list = []
for i in range(1, 300):
    html = "https://www.rocky-beach.com/hoerspiel/folgen/%s.html" % ("%03d" % i)
    if requests.get(html).status_code == 200:  #check if site exists
        html_list.append(html)
    else:
        break   #stop loop once latest episode detected
html_list[:10]

['https://www.rocky-beach.com/hoerspiel/folgen/001.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/002.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/003.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/004.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/005.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/006.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/007.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/008.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/009.html',
 'https://www.rocky-beach.com/hoerspiel/folgen/010.html']

## Meta data

### Get tables to scrape

In [5]:
# get the table that contains the data we want for each site
meta_tables = [] #list to hold all indices for the correct table for each site

for sites in range(len(html_list)):
    df_list=pd.read_html(html_list[sites]) #read in all dataframes on each site
    table_index = [] #empty list to store correct table index
    for position, df in enumerate(df_list):  #loop through all dataframes on site
        df.dropna(how='all').dropna(axis=1,how='any')
        if "Studio-Infos" in str(df.iloc[:,:]):   #find table we want using keyword
            table_index.append(position)    #store index of that table in the list 
    if not table_index:
        table_index =[np.nan] #if table doesn't exist, store missing value
    table_index=table_index[-1] + 1 #get last element and add one since it's the table we want
    meta_tables.append(table_index)

meta_tables[:4]

[7, 7, 7, 7]

### Get the data

In [7]:
#Get meta data for all pages
meta = []

for i in range(len(html_list)):
    meta_site = pd.read_html(html_list[i], encoding="utf-8")[meta_tables[i]].dropna()#read in the table with the meta data for each site
    if len(meta_site.columns) == 1:
        meta_site[1]= np.nan
    meta_site.columns = [0, 1] #define column names
    title = BeautifulSoup(urlopen(html_list[i]), 'lxml').title.string.replace('(H\xc3\xb6rspiel)', '') #get title
    meta_site = meta_site.append({0: 'Titel:', 1: title}, ignore_index=True) #append title column
    meta_site = meta_site.append({0: 'html', 1: html_list[i]}, ignore_index=True) #add url
    meta_site[0] = meta_site[0].str.replace(":", "").str.strip() #format data
    meta_site["ID"] = "%03d" % (i+1) #add ID column
    meta.append(meta_site)

meta = pd.concat(meta, axis=0, sort=True) #join all individual tables together
meta = meta.groupby(["ID", 0]).agg({1:"first"}).unstack().reset_index()
meta.columns = [col[1] for col in meta.columns]

#remove episode 29
meta = meta.loc[meta["ID"]!="029"]

meta.head()

Unnamed: 0,ID,"""Anudhara"" aufgenommen bei Bantree Records, Worms / Produzent",Buch,Buch und Effekte,Buch und Redaktion,Cover-Illustration,Coverillustration,Design,Effekte und Redaktion,Erscheinungsdatum,...,Regie,Regie und Produktion,Song,Teil A erzählt von,Teil B erzählt von,Teil C erzählt von,Titel,Titelmusik,html,zur vorherigen Folge | zur nächsten Folge
0,1,,H. G. Francis,,,,,,,12.10.1979,...,Heikedine Körting,,,,,,Der Super-Papagei (Hörspiel),,https://www.rocky-beach.com/hoerspiel/folgen/0...,
1,2,,H. G. Francis,,,,,,,12.10.1979,...,Heikedine Körting,,,,,,Der Phantomsee (Hörspiel),,https://www.rocky-beach.com/hoerspiel/folgen/0...,
2,3,,H. G. Francis,,,,,,,12.10.1979,...,Heikedine Körting,,,,,,Der Karpatenhund (Hörspiel),,https://www.rocky-beach.com/hoerspiel/folgen/0...,
3,4,,H. G. Francis,,,,,,,12.10.1979,...,Heikedine Körting,,,,,,Die schwarze Katze (Hörspiel),,https://www.rocky-beach.com/hoerspiel/folgen/0...,
4,5,,H. G. Francis,,,,,,,12.10.1979,...,Heikedine Körting,,,,,,Der Fluch des Rubins (Hörspiel),,https://www.rocky-beach.com/hoerspiel/folgen/0...,


### Save output

In [8]:
meta.columns = meta.columns.str.lower()
meta.to_csv(".\\data\\scraped\\meta.csv", encoding='utf8', index=False)

## Actors

In [9]:
actor_all = []

for i in range(len(html_list)):
    actor = pd.read_html(html_list[i])[5].dropna()
    actor.columns = [0,1]
    actor["ID"] = "%03d" % (i+1) #add ID column
    actor_all.append(actor)
     
actor_all = pd.concat(actor_all, axis=0, sort=True)  #join all individual tables together

#rename columns
actor_all = actor_all.rename(columns={0:"Role", 1:"Actor"})

print(actor_all.shape)
print("")
print(actor_all.head())
print("")

(2873, 3)

                                 Role             Actor   ID
0                 Hitchcock, Erzähler     Peter Pasetti  001
1       Justus Jonas, Erster Detektiv   Oliver Rohrbeck  001
2        Peter Shaw, Zweiter Detektiv    Jens Wawrczeck  001
3  Bob Andrews, Recherchen und Archiv  Andreas Fröhlich  001
4        Mr. Fentriss, Schriftsteller   Richard Lauffen  001



In [15]:
#Save output
actor_all.columns = actor_all.columns.str.lower()
actor_all.to_csv(".\\data\\scraped\\actors.csv", encoding='utf8', index=False)

## Content

In [11]:
content_all = []

for i in range(len(html_list)):
    content = pd.DataFrame(pd.read_html(html_list[i])[3].dropna(how='all').dropna(1).iloc[1])
    content["ID"] = "%03d" % (i+1)
    meta["Source"] = "Rocky-Beach.com"
    content_all.append(content)
    
content_all = pd.concat(content_all, axis=0).rename(columns={1:"content"}) #join all individual objects together
content_all.head()   

Unnamed: 0,content,ID
0,Der neueste Auftrag an die drei Detektive hört...,1
0,Welches Geheimnis verbirgt sich in einem vergi...,2
0,"""Bei mir spukt es!"" Mit diesem verzweifelten A...",3
0,In einem kleinen Wanderzirkus wittern die drei...,4
0,Alfred Hitchcock und die drei Detektive (Firme...,5


In [16]:
#save output
content_all.columns = content_all.columns.str.lower()
content_all.to_csv(".\\data\\scraped\\content.csv", encoding='utf8', index=False)

## Ratings

In [13]:
#https://www.rocky-beach.com/php/project/f_ausgabe.html   
rating1 = pd.read_html("https://www.rocky-beach.com/php/project/f_ausgabe.html", header=[0])[3] 
rating1 = rating1.dropna(how='all', axis=1).rename(columns={"Folge": "Episode","Unnamed: 4": "Rating", "Stimmen":"Ranking", "Unnamed: 7":"Votes"})
rating1 = rating1.dropna(how='all')
rating1["ID"] = ["%03d" % number for number in 
                 (rating1["Episode"].apply(lambda x: re.search(" \((.*?)\)", x).group(1))).astype(float)]
rating1["Episode"] = rating1["Episode"].apply(lambda x: re.search("(.*?)\(", x).group(1))
rating1["Source"] = "Rocky-Beach.com"
rating1

Unnamed: 0,Episode,Bewertung.1,Rang,Ranking,ID,Source
0,Der Super-Papagei,1.6118,4.0,760.0,001,Rocky-Beach.com
1,Der Phantomsee,1.8155,20.0,683.0,002,Rocky-Beach.com
2,Der Karpatenhund,1.6604,8.0,695.0,003,Rocky-Beach.com
3,Die schwarze Katze,2.4022,103.0,649.0,004,Rocky-Beach.com
4,Der Fluch des Rubins,1.8141,19.0,651.0,005,Rocky-Beach.com
...,...,...,...,...,...,...
234,Unter Hochspannung,2.8947,167.0,95.0,992,Rocky-Beach.com
235,Gefangene Gedanken,2.7551,154.0,98.0,993,Rocky-Beach.com
236,Haus der Angst,3.9355,230.0,93.0,994,Rocky-Beach.com
237,Die Geisterlampe,3.1053,193.0,95.0,995,Rocky-Beach.com


In [17]:
#save output
rating1.columns = rating1.columns.str.lower()
rating1.to_csv(".\\data\\scraped\\ratings.csv", encoding='utf8', index=False)