
# Title: 20201107_P_S_Theatre_v1.py



# I) Import Packages

In [9]:
import os
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from typing import List, Any
from datetime import datetime

# II) Function definition

In [10]:
def get_title_stageplay(s):
    t = s.find('meta',{'property':"og:title"})
    if t==None: 
        return None
    else: 
        t = t.get('content')
        res = t.split(sep=' at ')[0]
        res = res.replace('REVIEW: ','')
        return res

def get_theatre_stageplay(s):
    t = s.find('meta',{'property':"og:title"})
    if t==None: 
        return None
    else: 
        t = t.get('content')
        res = t.split(sep='|')[0].split(sep='(')[-1]
        res = res.replace(')','')
        if ' at ' in res:
            res = res.split('at ')[1]
            return res
        else:
            return None

#def get_author_review(s):
#    list_span = []
#    if s.find_all('span',style="font-family: verdana;") == []:
#        return None
#    else: 
#        for span in s.find_all('span',style="font-family: verdana;"):
#            if span.text == None:
#                list_span.append('')
#            else:
#                list_span.append(span.text)
#        review_by = str([i for i in list_span if 'Review by ' in i][0])
#        res = review_by.split('Review by ')[1]
#        return res

def get_author_review(s):
    if len(str(s).split("Review by ")) > 1:
        split = str(s).split("Review by ")[1].split()
        res = split[0]+" "+ re.sub('<[^>]+>','',split[1])
        return res
    else:
        return None

def get_date_review(s):
    x = s.find('h2',{'class':"date-header"})
    date_string = x.text.split(',')[1][1:]
    d = datetime.strptime(date_string, '%d %B %Y')
    date_0 = d.strftime('%d/%m/%Y')
    if date_0==None: 
        return None
    else: 
        return date_0

def get_rating_review(s):
    list_span_review = []
    if s.find_all('span',style="font-family: verdana;") == []:
        return None
    else:
        for span in s.find_all('span',style="font-family: verdana;"):
            if span.text == None:
                list_span_review.append('')
            else:
                list_span_review.append(span.text)
        rating = str([i for i in list_span_review if 'Rating:' in i][0])
        res = rating.split('Rating:')[1]
        res = re.findall(r"★",res)
        res = "".join(res)
        return res

def get_review(s): 
    x = s.find('div',{'itemprop':'description articleBody'})
    if x==None: 
        return None
    else: 
        res = x.text
        res = res.replace('\n','')
        res = res.replace('\xa0','')
        res = res.split("Review by ")[0]
        return res


# III) Scrapping

In [11]:
url = "http://www.pocketsizetheatre.com"
section = "/p/play-reviews.html"
req = requests.get(url+section)
soup = BeautifulSoup(req.content, 'html.parser')

# Capture all the links of reviews and ratings 
a_content = soup.findAll("a",href=True)
a_list= list(a_content)
list_link_reviews = []
list_ratings =[]
for i in range(0,len(a_list)):
    list_ratings.append("".join(re.findall(r"★",str(a_list[i]))))
    if "review-" in a_list[i]['href']:
        list_link_reviews.append(a_list[i]['href'])
        
list_ratings = [x for x in list_ratings if x]
print(f'We are scrapping {len(list_link_reviews)} reviews')

We are scrapping 330 reviews


In [43]:
len(list_ratings)

330

# IV) Dataset Creation

In [12]:
# Initialization of lists:
title = []
reviewer = []
review = []
date = []
theatre = []
rating = []
newspaper = []

In [13]:
#Store in lists title, date , reviewer , theatre and review of comedy
for k in range(0,len(list_link_reviews)): 
#for k in range(0,20): 
    print(list_link_reviews[k])
    r = requests.get(list_link_reviews[k])
    s = BeautifulSoup(r.text,'html.parser')
    title.append(get_title_stageplay(s))
    date.append(get_date_review(s))
    reviewer.append(get_author_review(s))
    theatre.append(get_theatre_stageplay(s))
    review.append(get_review(s))
newspaper = ['Pocket Size Theatre'] * (k+1)
rating = list_ratings[:k+1]

http://www.pocketsizetheatre.com/2020/10/review-lone-flyer-at-watermill-newbury.html
http://www.pocketsizetheatre.com/2020/10/review-buyer-and-cellar-at-above-stag.html
http://www.pocketsizetheatre.com/2020/09/review-bloodshot-at-watermill-theatre.html
http://www.pocketsizetheatre.com/2020/09/review-woyzeck-at-theatro-technis.html
http://www.pocketsizetheatre.com/2020/08/review-educating-rita-at-minack-theatre.html
http://www.pocketsizetheatre.com/2020/08/review-hound-of-baskervilles-at.html
http://www.pocketsizetheatre.com/2020/03/review-bin-juice-at-vaults.html
http://www.pocketsizetheatre.com/2020/03/review-love-love-love-at-lyric.html
http://www.pocketsizetheatre.com/2020/03/review-kite-runner-at-richmond-theatre.html
http://www.pocketsizetheatre.com/2020/03/review-waiter-theres-murder-in-my-soup.html
http://www.pocketsizetheatre.com/2020/03/review-relatively-speaking-at-mill-at.html
http://www.pocketsizetheatre.com/2020/03/review-some-mothers-do-ave-em-at.html
http://www.pocketsiz

In [14]:
# Check length of arrays 
display(len(rating))
display(len(date))
display(len(reviewer))
display(len(title))
display(len(theatre))
display(len(review))

330

330

330

330

330

330

In [15]:
# Dataset creation 
df_theatre = pd.DataFrame({'Newspaper':newspaper,
                      'Date':date,
                      'Reviewer':reviewer,
                      'Title':title,
                      'Theatre':theatre,
                      'Rating':rating,
                      'Review':review
                      })

In [20]:
df_theatre.head(3)

Unnamed: 0,Newspaper,Date,Reviewer,Title,Theatre,Rating,Review
0,Pocket Size Theatre,26/10/2020,Nick Wayne<span,Lone Flyer,"the Watermill, Newbury",★★★★,The Watermill continues its indoor reopening s...
1,Pocket Size Theatre,17/10/2020,Mark Swale,Buyer and Cellar,Above the Stag,★★★★,"Originally performed by Michael Urie, Buyer an..."
2,Pocket Size Theatre,26/09/2020,Nick Wayne,Bloodshot,the Watermill Theatre,★★★★★,After a very successful outdoor summer season ...


In [195]:
#### Cleaning final dataset step
df_theatre_v2 = df_theatre.copy()
df_theatre_v2['Reviewer'] = df_theatre_v2['Reviewer'].apply(lambda x: x.replace('<span','') if x != None else x)
df_theatre_v2['Review'] = df_theatre_v2['Review'].apply(lambda x: '' if '<!-- /*' in x  else x)
df_theatre_v2 = df_theatre_v2.drop(283).reset_index(drop=True)
df_theatre_v2 = df_theatre_v2.drop(284).reset_index(drop=True)
df_theatre_v2 = df_theatre_v2.drop(285).reset_index(drop=True)
list_ratings_1 = list(df_theatre_v2['Rating'])
list_ratings_2 = list_ratings_1
list_ratings_2.pop(209)
list_ratings_2[282] = '★★★★'
list_ratings_2.insert(283,'★★★★')
list_ratings_2.insert(284,'★★★★')
list_ratings_2 = list_ratings_2[:-1]
df_theatre_v2['Rating'] = list_ratings_2
df_theatre_v2 = df_theatre_v2.append({'Newspaper':'	Pocket Size Theatre',
                      'Date':'10/07/2014',
                      'Reviewer':'Harriet Langdown',
                      'Title':'The Curious Incident of the Dog in the Night-Time',
                      'Theatre':'the Gielgud Theatre',
                      'Rating':'★★★★★',
                      'Review':'Mark Haddon’s “The Curious Incident of the Dog in the Night-Time” was published in 2003. It was the winner of more than seventeen literary awards and has now been adapted by Simon Stephens for the production that opened at the Apollo theatre in March 2013 after a successful stint at the Cottesloe with the National Theatre. It received much critical acclaim and won seven Olivier Awards including Best New Play. It has now transferred to the Gielgud Theatre where it re-opened on Monday 7th July 2014.The Curious Incident of the Dog in the Night-time follows the story of 15 year old Christopher Boone who suffers from Aspergers Syndrome, and Autism. After his next door neighbour’s dog, Wellington, is killed with a garden fork, Christopher sets it upon himself to be a detective who must be “very very brave” and find out who the murderer is. As his journey takes off, we see his life at his “special school” where the kids are “stupid, although I’m not supposed to say that” (says Christopher). Despite his illnesses, Christopher is a superb mathematician and has set it upon himself to be the first 15 year old at his school to sit a Maths A-Level exam.  We also learn about his family life, and his career aspirations for the future as we follow his journey from 12:07am on the night of the dog’s murder…Both my parents have read Mark Haddon’s novel, so I had heard much about it, but made a conscious decision not to read it before I saw the play because I knew I would spend more time worrying about it as an adaptation rather than for the play it actually was. I went in completely blind, but from the moment I took my seat, I was hooked. Created by Academy Award nominated designer Bunny Christie, the set initially appeared to be deep set stage set out like a giant cubed grid – all in black and white. Once the play started, it was clear everything was not as simplistic as it originally seemed. I felt like the cube was a representation of the inside of Christopher’s mind and I was trapped in this giant, confused and chaotic state with him for the duration.The floor is a giant chalkboard, and at each cross point of the stage; on the floor, the walls and the ceiling, there was a hidden LED light which provided so much visual spectacle for the show, I was blown away. Lighting designer, Paule Constable has previously won 4 Olivier Awards and a Tony, amongst others, and she represented herself with stunning visuals which carried the entire production. It was so intelligent and original, I just loved it.With such a remarkable set and stunning lighting, the actors could easily have been lost amidst such a spectacle, but they weren’t. Simply put, Graham Butler is nothing short of phenomenal. He was totally convincing as “Christopher Boone”, performing his role with sensitivity and true expertise. It was a master-class in playing a “handicapped” role without being distasteful. Mr Butler was astounding and is the heart, body and soul of this production. His physicality was exceptional and his voice was used in such a remarkable way, much to the acclaim of voice coach Barbara Houseman and dialect coach Jeannette Nelson. His performance was heart-warming, engrossing and sympathetic all at the same time. He was just totally fabulous.As Christopher’s father, “Ed”, Nicolas Tennant was also superb. His shining moments were as the “endearing and sensitive” father as opposed to the “angry and vengeful” man he can sometimes be. I personally felt his emotional journey was as great as Christopher’s and despite all his flaws, I was rooting for him to be happy and to be the father that Christopher needed.From the ensemble, Sam Bond was excellent across multiple roles. He provided some much needed comedic relief towards the end of Act One, and won some of the biggest laughs of the night. Similarly, Vivienne Acheampong shone in her roles, and won the crowd over as a Caribbean “Information Guide” on the London Underground. It would also be unfair not to mention Battenberg the rat as “Toby”. He’s a real star in the making! The ensemble as a whole provided excellent figures for interpretive movements on the stage that reminded me of a kooky drama class exercise, but it wasn’t tacky or laughable – it worked. The interpretation was so cleverly directed by Marianne Elliott, the result was an accomplished and exceptional production, for which she should be recognised.Overall, this play is totally original and unlike anything else I have ever seen. The lighting and set design were the highlights for me because of their originality, but that is in no way a discredit to Graham Butler who was utterly magnificent. I could quote my favourite moments all day, but we would be here…all day. The script is magnificent and the actors provide such heart to the words, you need to see it from them, rather than read it from me.I cannot implore you enough to go and see this remarkable piece of theatre. My audience was utterly captivated throughout the duration; I heard laughter, and gasps of shock and “awwws” many many times, but never any whispers of confusion or talking during some of the long silences on stage. I cried near the end because of how much love there is in this production. It might be a total cliché but it was the heart of this play that brought me, and several others around me, to tears. It was exhausting on my eyes, loud in my ears, and an emotional roller-coaster in my heart, but it was truly spectacular. Trust me when I say that this is a play not to be missed. It is sensational.'}, ignore_index=True)
df_theatre_v2.head(3)

Unnamed: 0,Newspaper,Date,Reviewer,Title,Theatre,Rating,Review
0,Pocket Size Theatre,26/10/2020,Nick Wayne,Lone Flyer,"the Watermill, Newbury",★★★★,The Watermill continues its indoor reopening s...
1,Pocket Size Theatre,17/10/2020,Mark Swale,Buyer and Cellar,Above the Stag,★★★★,"Originally performed by Michael Urie, Buyer an..."
2,Pocket Size Theatre,26/09/2020,Nick Wayne,Bloodshot,the Watermill Theatre,★★★★★,After a very successful outdoor summer season ...


In [196]:
df_theatre_v2.tail(3)

Unnamed: 0,Newspaper,Date,Reviewer,Title,Theatre,Rating,Review
325,Pocket Size Theatre,16/09/2014,Sophie Tergeist,Ghosts from a Perfect Place,the Arcola Theatre,★★★★,"This month, and until 11 October, the innovati..."
326,Pocket Size Theatre,11/09/2014,,Great Britain,Britain,★★★★,"This play is a look at society and the media, ..."
327,\tPocket Size Theatre,10/07/2014,Harriet Langdown,The Curious Incident of the Dog in the Night-Time,the Gielgud Theatre,★★★★★,Mark Haddon’s “The Curious Incident of the Dog...


## V)Export to csv and excel format

In [197]:
df_theatre_v2.to_csv('20201108_pocketsizetheatre_v1.csv')
df_theatre_v2.to_excel('20201108_pocketsizetheatre_v1.xlsx')