# Title: 20201110_Scrapping_West_End_Winghers_v1

# I) Import Packages

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from typing import List, Any
from datetime import datetime

# II) Function definition

In [218]:
def get_title_stageplay(s):
    t = s.findAll('h2')
    t = [x for x in t if 'Review' in x.text]
    if t== []: 
        return None
    else: 
        t = t[0].text
        if len(t.split('Review – ')) >1:
            split = t.split('Review – ')[1]
            res = split.split(",")[0]
            return res
        else:
            return None



def get_theatre_stageplay(s):
    t = s.findAll('h2')
    t = [x for x in t if 'Review' in x.text]
    if t== []: 
        return None
    else: 
        t = t[0].text
        if len(t.split('Review – ')) > 1:
            split = t.split('Review – ')[1]
            if len(split.split(", ")) > 1:
                res = split.split(", ")[1]
                res = res.replace("\xa0", " ")
                return res
            else:
                return None
        else:
            return None

#def get_author_review(s):
#    list_span = []
#    if s.find_all('span',style="font-family: verdana;") == []:
#        return None
#    else: 
#        for span in s.find_all('span',style="font-family: verdana;"):
#            if span.text == None:
#                list_span.append('')
#            else:
#                list_span.append(span.text)
#        review_by = str([i for i in list_span if 'Review by ' in i][0])
#        res = review_by.split('Review by ')[1]
#        return res

def get_author_review(s):
    a = s.find('a',{'class':"url fn n"})
    if a != None:
        res = a.text
        return res
    else:
        return None


def get_date_review(s):
    x = s.find('small')
    if x != None:
        date_string = x.text
        date_string = " ".join(date_string.split(" ")[1:4])
        d = datetime.strptime(date_string, '%d %B  %Y')
        date_0 = d.strftime('%d/%m/%Y')
        if date_0==None: 
            return None
        else: 
            return date_0
    else:
        return None

def get_rating_review(s):
    t = s.findAll('img',{"loading":'lazy'})
    if t == None:
        return None
    else:
        if len(t) >1:
            t = t[1]
            if len(str(t).split("rating-score-")) >1:
                t = str(t).split("rating-score-")[1]
                t = int(t.split("-")[0])
                res = ["★"]* t
                res = "".join(res)
                return res
            else:
                return None
        else:
            return None

def get_review(s):
    t = s.find('div',{"class":'entry'})
    if t == None:
        return None
    else:
        u = t.findAll('p')
        if u == None:
            return None
        else:
            list_text = []
            for i in range(0,len(u)):
                list_text.append(u[i].text)
            text = "\n".join(list_text).replace("\xa0N"," ").split("Rating")[0]
            return text


# III) Scrapping

In [225]:
master_list = []
for j in range(1,73):
    url = "https://westendwhingers.wordpress.com/"
    section = f"/page/{j}/"
    req = requests.get(url+section)
    soup = BeautifulSoup(req.content, 'html.parser')

    # Capture all the links of reviews and ratings 
    a_content = soup.findAll("a",href=True)
    a_list= list(a_content)
    list_link_reviews = []
    list_ratings =[]
    for i in range(0,len(a_list)):
        if "review-" in a_list[i]['href']:
            list_link_reviews.append(a_list[i]['href'])

    list_link_reviews = [x for x in list_link_reviews if 'https://westendwhingers.wordpress.com/' in x]
    list_link_reviews = [x for x in list_link_reviews if '#' not in x]  
    list_link_reviews = list(set(list_link_reviews))
    master_list += list_link_reviews
master_list = list(set(master_list))
print(f'We are scrapping {len(master_list)} reviews')

We are scrapping 830 reviews


# IV) Dataset Creation

In [222]:
# Initialization of lists:
title = []
reviewer = []
review = []
date = []
theatre = []
newspaper = []
rating = []

In [224]:
len(master_list)

830

In [223]:
master_list = list(set(master_list))
#Store in lists title, date , reviewer , theatre and review of comedy
for k in range(0,len(master_list)): 
    print(master_list[k])
    print(k)
    r = requests.get(master_list[k])
    s = BeautifulSoup(r.text,'html.parser')
    title.append(get_title_stageplay(s))
    date.append(get_date_review(s))
    theatre.append(get_theatre_stageplay(s))
    review.append(get_review(s))
    rating.append(get_rating_review(s))

https://westendwhingers.wordpress.com/2012/04/30/review-south-downs-the-browning-version-harold-pinter-theatre/
0
https://westendwhingers.wordpress.com/2012/11/09/review-steel-pier-union-theatre/
1
https://westendwhingers.wordpress.com/2008/05/18/review-that-face-at-the-duke-of-yorks-theatre/
2
https://westendwhingers.wordpress.com/2007/01/24/review-there-came-a-gypsy-riding/
3
https://westendwhingers.wordpress.com/2012/08/28/review-its-all-true-white-bear-theatre/
4
https://westendwhingers.wordpress.com/2007/09/05/review-chatroomcitizenship-national-theatre/
5
https://westendwhingers.wordpress.com/2014/03/21/review-i-cant-sing-the-x-factor-musical-in-3-acts-london-palladium/
6
https://westendwhingers.wordpress.com/2011/01/27/review-twisted-tales-lyric-hammersmith/
7
https://westendwhingers.wordpress.com/2009/10/21/review-terror-2009-southwark-playhouse/
8
https://westendwhingers.wordpress.com/2014/04/04/review-let-the-right-one-in-apollo-theatre/
9
https://westendwhingers.wordpress.co

In [226]:
newspaper = ['West End Winghers'] * (k+1)
reviewer = ['Phil'] * (k+1)
# Check length of arrays 
display(len(date))
display(len(reviewer))
display(len(title))
display(len(theatre))
display(len(newspaper))
display(len(review))
display(len(rating))

830

830

830

830

830

830

830

In [228]:
# Dataset creation 
df_theatre = pd.DataFrame({'Newspaper':newspaper,
                      'Date':date,
                      'Reviewer':reviewer,
                      'Title':title,
                      'Theatre':theatre,
                      'Rating':rating,
                      'Review':review
                      })

In [229]:
df_theatre

Unnamed: 0,Newspaper,Date,Reviewer,Title,Theatre,Rating,Review
0,West End Winghers,30/04/2012,Phil,South Downs / The Browning Version,Harold Pinter Theatre,★★★★★,\nOne Man’s Two Guvnors may be another man’s p...
1,West End Winghers,09/11/2012,Phil,Steel Pier,Union Theatre,★★★,"\nSPOILERS. Imagine They Shoot Horses, Don’t T..."
2,West End Winghers,18/05/2008,Phil,That Face at the Duke of York’s Theatre,,,The Whingers are feeling a tad existential tod...
3,West End Winghers,24/01/2007,Phil,There Came a Gypsy Riding,,,To the Commercial Environmental Health Service...
4,West End Winghers,28/08/2012,Phil,It’s All True,White Bear Theatre,★★★,"Yes, it’s been a while.\nWe’ve been busy. Wall..."
...,...,...,...,...,...,...,...
825,West End Winghers,27/08/2011,Phil,High Jinks with the Hamiltons! Udderbelly’s Pa...,Edinburgh Fringe,,\nForgive us this one and indulge Phil for a m...
826,West End Winghers,18/06/2013,Phil,The Amen Corner,National Theatre,★★★★,No sign of Simon Cowell around but anyway it w...
827,West End Winghers,02/12/2013,Phil,Emil and the Detectives,National Theatre,★★★,If you feel like watching the detectives you’d...
828,West End Winghers,18/10/2016,Phil,The Boys In The Band,Park Theatre,★★★★,Second in a row of our series of plays featuri...


## V) Output Dataset in csv and excel format

In [230]:
df_theatre.to_excel("20201112_Scrapping__West_End_Winfhers_v1.xlsx")
df_theatre.to_csv("20201112_Scrapping__West_End_Winghers_v1.csv")