In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes review text from an opentable.com restaurant page
def url_to_transcript(url, timeout=30):
    '''Download a page and return the text of its matching elements.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of strings, one per element whose CSS class matches the
        hard-coded class name below (empty if nothing matches).

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or an HTTP error status (4xx/5xx).
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")  # parse the body as HTML
    # NOTE(review): this looks like a generated class name and may change when
    # the site is redeployed - verify if the result comes back empty.
    text = [p.text for p in soup.find_all(class_="f3cNr0xDcwvMaBkpjQj5")]
    print(url)  # progress marker: printed once the page has been parsed
    return text
# Presumably other CSS class names seen on the page; not used by any code here:
# t9JcvSL3Bsj1lxMSi3pz h_kb2PFOoyZe1skyGiz9 Ti64w3n01MDTYZb59n6Q
# URLs of the pages to scrape
urls = ['https://www.opentable.com/r/piatti-san-antonio?corrid=4cec5420-b025-46d0-9c85-c7593b9a80fb&avt=eyJ2IjoyLCJtIjowLCJwIjowLCJzIjowLCJuIjowfQ&p=2&sd=2022-05-31T21%3A00%3A00']

# Short name for each URL, in the same order as `urls`; used below as the
# dict key and the pickle file name for that URL's scraped text
webpage = ['opentable']

In [2]:
# Fetch and parse every page listed in `urls`, one network request per URL
# (originally noted as taking a few minutes to run)
transcripts = list(map(url_to_transcript, urls))

https://www.opentable.com/r/piatti-san-antonio?corrid=4cec5420-b025-46d0-9c85-c7593b9a80fb&avt=eyJ2IjoyLCJtIjowLCJwIjowLCJzIjowLCJuIjowfQ&p=2&sd=2022-05-31T21%3A00%3A00


In [3]:
# Pickle the scraped text for later use
import os

# Create the output directory. exist_ok=True keeps the cell safe to re-run and
# works on every platform, unlike the shell `mkdir`, which reports an error
# when the directory already exists.
os.makedirs("transcripts", exist_ok=True)

# webpage[idx] names the file that stores transcripts[idx]. The files contain
# binary pickle data despite the ".txt" suffix; the name is kept unchanged
# because the loading cell reads the same paths.
for idx, name in enumerate(webpage):
    with open("transcripts/" + name + ".txt", "wb") as file:
        pickle.dump(transcripts[idx], file)

A subdirectory or file transcripts already exists.


In [4]:
# Load the pickled files back into a dict keyed by source name
data = {}
for c in webpage:
    # NOTE: pickle.load can execute arbitrary code from the file. That is
    # acceptable here only because these files are written by this notebook.
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Double check that the dict has one entry per source name in `webpage`
data.keys()

dict_keys(['opentable'])

In [6]:
# Display everything loaded under the 'opentable' key
# (the whole list is shown, so the output can be long)
data['opentable']

['Excellent service great experience. Enjoyed appetizer, dinner, wine and dessert for under 100$',
 'First time here and it was amazing. The food was great! Staff was friendly and overall really enjoyed this place. Definitely recommend',
 'Josh our waiter was the best part of the night.  He could not have been more accommodating.  The martinis were great and the food predictably good.',
 "Very loud, then we ordered what was on the menu, but when the food arrived some of it was not included in our order... When we asked the why question,  we were told that corporate had changed the menu but they didn't have time to update the menu just yet, so those items were not included in what we just ordered... What?!  We insisted that we should receive what we had ordered, and what was shown on the menu, so they relented and brought the items, but of course it was later, more than half way through our meal.  The wait person told us we were not the only table to complain.  Also, there is a 3% surch

In [7]:
# Print the loaded content of each source, one print call per source
for reviews in data.values():
    print(reviews)



In [8]:
import pandas as pd

# Use the fully qualified option name. The bare 'max_colwidth' shorthand only
# works while pandas can match it to exactly one option.
pd.set_option('display.max_colwidth', 150)

# Build the frame from `data` (one column per key) and order the rows by index.
data_df = pd.DataFrame.from_dict(data).sort_index()
# Rename the single column; this assignment requires exactly one key in `data`.
data_df.columns = ['transcript']
data_df

Unnamed: 0,transcript
0,"Excellent service great experience. Enjoyed appetizer, dinner, wine and dessert for under 100$"
1,First time here and it was amazing. The food was great! Staff was friendly and overall really enjoyed this place. Definitely recommend
2,Josh our waiter was the best part of the night. He could not have been more accommodating. The martinis were great and the food predictably good.
3,"Very loud, then we ordered what was on the menu, but when the food arrived some of it was not included in our order... When we asked the why quest..."
4,Food was delicious. Service was wonderful. One of our favorite restaurants.
5,I ordered a chicken panini and it was not good. The chicken was dry so I only ate less the half of it. There were flies buzzing all around us (and...
6,Outstanding service. Great atmosphere and terrific good. My Old Fashioned from the bar was excellent too.
7,Celebrated My friend’s birthday and the two of us had a lovely dinner. Food and service were excellent as usual.
8,Went for lunch. Increased prices and no lunch specials make it a less appealing option especially when the food itself is nothing spectacular. The...
9,Birthday dinner for our daughter. Great food; incredible service. Love Piatti at the Quarry!


In [9]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters:
        text: the string to clean.

    Returns:
        The cleaned string. Removed pieces are deleted rather than replaced by
        a space, so surrounding whitespace is left as it was and words joined
        only by punctuation are merged. Only the ASCII characters in
        string.punctuation are removed; typographic quotes are left in place.
    '''
    text = text.lower()
    # Raw strings keep regex escapes such as \[ and \w from being read as
    # (invalid) Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Plain alias kept for the existing `.apply(round1)` calls; the lambda wrapper
# added nothing over the function itself.
round1 = clean_text_round1

In [10]:
# Run the first cleaning pass over every row of the 'transcript' column,
# then display the cleaned frame
cleaned_round1 = data_df['transcript'].apply(round1)
data_clean = pd.DataFrame(cleaned_round1)
data_clean

Unnamed: 0,transcript
0,excellent service great experience enjoyed appetizer dinner wine and dessert for under
1,first time here and it was amazing the food was great staff was friendly and overall really enjoyed this place definitely recommend
2,josh our waiter was the best part of the night he could not have been more accommodating the martinis were great and the food predictably good
3,very loud then we ordered what was on the menu but when the food arrived some of it was not included in our order when we asked the why question ...
4,food was delicious service was wonderful one of our favorite restaurants
5,i ordered a chicken panini and it was not good the chicken was dry so i only ate less the half of it there were flies buzzing all around us and we...
6,outstanding service great atmosphere and terrific good my old fashioned from the bar was excellent too
7,celebrated my friend’s birthday and the two of us had a lovely dinner food and service were excellent as usual
8,went for lunch increased prices and no lunch specials make it a less appealing option especially when the food itself is nothing spectacular the f...
9,birthday dinner for our daughter great food incredible service love piatti at the quarry


In [11]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.

    Parameters:
        text: the string to clean.

    Returns:
        The string with typographic single and double quotes, the ellipsis
        character and newline characters deleted.
    '''
    # Every target is a single character that is deleted outright, so one
    # character class does the work of the two separate substitutions.
    return re.sub('[‘’“”…\n]', '', text)

# Plain alias kept for the existing `.apply(round2)` calls; the lambda wrapper
# added nothing over the function itself.
round2 = clean_text_round2

In [12]:
# Run the second cleaning pass over the already-cleaned 'transcript' column,
# then display the result
cleaned_round2 = data_clean['transcript'].apply(round2)
data_clean = pd.DataFrame(cleaned_round2)
data_clean

Unnamed: 0,transcript
0,excellent service great experience enjoyed appetizer dinner wine and dessert for under
1,first time here and it was amazing the food was great staff was friendly and overall really enjoyed this place definitely recommend
2,josh our waiter was the best part of the night he could not have been more accommodating the martinis were great and the food predictably good
3,very loud then we ordered what was on the menu but when the food arrived some of it was not included in our order when we asked the why question ...
4,food was delicious service was wonderful one of our favorite restaurants
5,i ordered a chicken panini and it was not good the chicken was dry so i only ate less the half of it there were flies buzzing all around us and we...
6,outstanding service great atmosphere and terrific good my old fashioned from the bar was excellent too
7,celebrated my friends birthday and the two of us had a lovely dinner food and service were excellent as usual
8,went for lunch increased prices and no lunch specials make it a less appealing option especially when the food itself is nothing spectacular the f...
9,birthday dinner for our daughter great food incredible service love piatti at the quarry


In [13]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the cleaned text, dropping common English stop words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# Densify the sparse counts into a frame: one row per document (sharing the
# index of data_clean) and one column per vocabulary term
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    index=data_clean.index,
    columns=cv.get_feature_names_out(),
)
data_dtm

Unnamed: 0,able,accommodating,acknowledgement,added,additional,admin,advised,air,alfredo,amazing,...,wet,whatseems,whatsoever,wine,wonderful,wouldnt,wrong,years,youre,yukon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Pickle the document-term matrix for later use
data_dtm.to_pickle("dtm_sentiment_analysis.pkl")

In [15]:
# Persist the cleaned text and the fitted vectorizer for later use
data_clean.to_pickle('data_clean_sentiment_analysis.pkl')
# The with-block closes the file handle (and flushes the data) as soon as the
# dump finishes, instead of leaving an open handle behind.
with open("cv_sentiment_analysis.pkl", "wb") as file:
    pickle.dump(cv, file)

In [16]:
# Pickle the uncleaned review frame (the corpus) for later use
data_df.to_pickle("corpus_sentiment_analysis.pkl")