In [2]:
import requests

In [3]:
from bs4 import BeautifulSoup

In [4]:
# Download the Yelp business page. The explicit timeout keeps the cell from
# blocking forever if the server stops responding.
url = requests.get(
    "https://www.yelp.com/biz/tesla-san-francisco?osq=Tesla+Dealership",
    timeout=30,
)

In [5]:
url.status_code

200

In [6]:
# Preview only the beginning of the HTML; the full page is too long to be useful as cell output.
url.text[:500]

'<!DOCTYPE html><html lang="en-US" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(/\x08no-js\x08/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-US" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel="mask-icon" sizes="any" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b2bb2fb0ec9c/assets/img/logos/yelp_burst.svg" content="#FF1A1A"><link rel="shortcut icon" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b05852393ae5/assets/img/logos/favicon.ico"><script> window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;window.ygaPageStartTime=new Date().getTime();</script><script async src="https://www.google-analytics.com/ana

In [7]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(markup=url.text, features="html.parser")

In [8]:
# Collect every element whose class attribute is exactly this string (the review
# paragraphs on the page at scrape time). find_all is the current BeautifulSoup 4
# method name; findAll is a deprecated legacy alias.
divs = soup.find_all(class_="comment__09f24__gu0rG css-qgunke")
print(divs)

[<p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">Consider the Tesla Model 3. Mine is fun, which brings joy and contentment.  I look forward to each drive. <br/><br/>Over the air updates and a good autopilot system keeps the car renewable and removes some of the mind numbing heavy traffic.</span></p>, <p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">This is for our amazing experience with the service center.  They went above and beyond for us and it was the best repair experience we've ever had with any car.<br/><br/>We had a warranty repair on our Model Y that was going to take a long time to complete.  The shop was incredibly understanding and helpful about our kids and car seats and worked with us to plan ahead.  <br/><br/>When the day came they fixed a major issue for us 40% faster than expected, and they fixed a minor issue even though we didn't think the parts were in stock!  Finally, they were ultra flexib

In [9]:
# Text of the first <span> inside each matched element, in page order.
reviews = [block.find('span').text for block in divs]

In [10]:
reviews[0]

'Consider the Tesla Model 3. Mine is fun, which brings joy and contentment. \xa0I look forward to each drive. Over the air updates and a good autopilot system keeps the car renewable and removes some of the mind numbing heavy traffic.'

# Analysing the data

In [11]:
import pandas as pd
import numpy as np

In [12]:
# One row per scraped review. Building the frame straight from the Python list
# avoids a needless NumPy round-trip (which would also yield a float64 column
# if the list were empty).
df = pd.DataFrame({"review": reviews})

In [13]:
len(df['review'])

10

In [14]:
# Number of whitespace-separated tokens in each review.
df["word_count"] = df["review"].str.split().str.len()

In [15]:
# Length of each review in characters (whitespace and punctuation included).
df['char_count'] = df['review'].str.len()

In [16]:
def avg_word(review):
  """Return the mean length, in characters, of the whitespace-separated words in `review`.

  Args:
    review: The text to measure.

  Returns:
    The average word length as a float. Returns 0.0 when `review` contains no
    words (empty or whitespace-only text) instead of raising ZeroDivisionError.
  """
  words = review.split()
  if not words:
    # Nothing to average; avoid dividing by zero.
    return 0.0
  return sum(len(word) for word in words) / len(words)

# Average word length (characters per word) for each review
df['avg_word'] = df['review'].apply(avg_word)

In [17]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756
1,This is for our amazing experience with the se...,110,587,4.3
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307
3,Do you love having your car held hostage for t...,59,309,4.254237
4,I am appalled by the poor service at this Tesl...,134,696,4.171642


In [78]:
!pip3 install nltk

import nltk
# Fetch the NLTK data packages this notebook relies on: the stop-word lists,
# plus WordNet and the Open Multilingual WordNet data (presumably required by
# the lemmatization step — TODO confirm).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to /home/wsuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [41]:
# Import stopwords
from nltk.corpus import stopwords 

In [42]:
# English stop-word list from NLTK.
stop_words = stopwords.words('english')
# Count the tokens of each review that appear verbatim in the stop-word list
# (the membership test is case-sensitive and punctuation is not stripped).
df['stopword_count'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_words)
)

In [43]:
df.describe()

Unnamed: 0,word_count,char_count,avg_word,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,110.2,601.9,4.377593,46.5,0.432095
std,90.554833,498.24948,0.427467,36.539932,0.04643
min,13.0,56.0,3.384615,6.0,0.341463
25%,44.25,249.75,4.265678,21.0,0.408327
50%,84.5,454.5,4.388823,37.5,0.439145
75%,132.25,701.25,4.593144,57.75,0.458359
max,293.0,1571.0,4.981481,121.0,0.5


In [44]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836


In [45]:
# Fraction of each review's tokens that are stop words.
df["stopword_rate"] = df["stopword_count"].div(df["word_count"])

In [46]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836


In [47]:
df.sort_values(by="stopword_rate")

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463
8,Typically I go to the Tesla Berkeley service c...,231,1328,4.753247,90,0.38961
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678
9,Horrible customer service:1) Placed order via ...,293,1571,4.327645,121,0.412969
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819
5,Dear dad can you buy me a Tesla to drive at gr...,13,56,3.384615,6,0.461538
6,I ended up filing a complaint against them wit...,54,322,4.981481,26,0.481481
7,Had impossible time with this service center a...,40,217,4.45,20,0.5


# Data Cleaning

In [51]:
# Lower-case each review and collapse every run of whitespace to a single space.
df["lowercase"] = df["review"].apply(lambda text: " ".join(text.lower().split()))

In [56]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because the pattern is a regular expression:
# relying on the default triggers a FutureWarning on older pandas and, on newer
# versions where the default is regex=False, the pattern would be treated as a
# literal string and nothing would be removed. The raw string avoids invalid
# escape-sequence warnings for \w and \s.
df["Punctuation"] = df["lowercase"].str.replace(r'[^\w\s]', "", regex=True)

  df["Punctuation"] = df["lowercase"].str.replace('[^\w\s]',"")


In [61]:
# Keep only the tokens that are not in the stop-word list.
df["Stopwords"] = df["Punctuation"].apply(
    lambda text: " ".join([token for token in text.split() if token not in stop_words])
)

In [62]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,kevin and bridget were amazing. i purchased a ...,kevin and bridget were amazing i purchased a u...,kevin bridget amazing purchased used tesla abs...
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,do you love having your car held hostage for t...,do you love having your car held hostage for t...,love car held hostage things arent fault locat...
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,i am appalled by the poor service at this tesl...,i am appalled by the poor service at this tesl...,appalled poor service tesla location first can...


In [64]:
# Pool the stop-word-free text of all reviews and show the 30 most frequent tokens.
all_tokens = " ".join(df['Stopwords']).split()
pd.Series(all_tokens).value_counts().head(30)

service        15
tesla          11
car            11
appointment     7
day             6
time            6
experience      6
us              5
get             5
didnt           5
melissa         5
app             5
model           5
would           4
center          4
upgrade         4
repair          4
order           4
weeks           3
back            3
location        3
part            3
go              3
ap              3
advise          3
poor            3
took            3
kevin           3
bridget         3
say             3
dtype: int64

In [65]:
# Additional words to filter out beyond the standard stop-word list:
# frequent filler tokens and first names that show up in the reviews.
other_stop_words = [
    "us", "get", "would", "go", "ap",
    "melissa", "kevin", "bridget", "say",
]

In [66]:
len(other_stop_words)

9

In [68]:
# Remove the hand-picked extra stop words from the already filtered text.
df["Clean Review"] = df["Stopwords"].apply(
    lambda text: " ".join([token for token in text.split() if token not in other_stop_words])
)

In [70]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,kevin and bridget were amazing. i purchased a ...,kevin and bridget were amazing i purchased a u...,kevin bridget amazing purchased used tesla abs...,amazing purchased used tesla abs picked rockli...
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,do you love having your car held hostage for t...,do you love having your car held hostage for t...,love car held hostage things arent fault locat...,love car held hostage things arent fault locat...
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,i am appalled by the poor service at this tesl...,i am appalled by the poor service at this tesl...,appalled poor service tesla location first can...,appalled poor service tesla location first can...


# Lemmatize Text

In [72]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 16.9 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [73]:
#Import textblob
from textblob import Word

In [81]:
# Lemmatize every token with TextBlob's Word.lemmatize() and re-join with single spaces.
df["lemmatized"] = df["Clean Review"].apply(
    lambda text: " ".join([Word(token).lemmatize() for token in text.split()])
)

In [83]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review,lemmatized
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,amazing experience service center went beyond ...
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,kevin and bridget were amazing. i purchased a ...,kevin and bridget were amazing i purchased a u...,kevin bridget amazing purchased used tesla abs...,amazing purchased used tesla abs picked rockli...,amazing purchased used tesla ab picked rocklin...
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,do you love having your car held hostage for t...,do you love having your car held hostage for t...,love car held hostage things arent fault locat...,love car held hostage things arent fault locat...,love car held hostage thing arent fault locati...
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,i am appalled by the poor service at this tesl...,i am appalled by the poor service at this tesl...,appalled poor service tesla location first can...,appalled poor service tesla location first can...,appalled poor service tesla location first can...


# Sentiment Analysis

In [84]:
from textblob import TextBlob

In [85]:
# Polarity score of each lemmatized review (first field of TextBlob's sentiment tuple).
df["polarity"] = df["lemmatized"].apply(lambda text: TextBlob(text).sentiment.polarity)

In [86]:
# Subjectivity score of each lemmatized review (second field of TextBlob's sentiment tuple).
df["subjectivity"] = df["lemmatized"].apply(lambda text: TextBlob(text).sentiment.subjectivity)

In [87]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review,lemmatized,polarity,subjectivity
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,0.4,0.375
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,0.213542,0.458333
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,kevin and bridget were amazing. i purchased a ...,kevin and bridget were amazing i purchased a u...,kevin bridget amazing purchased used tesla abs...,amazing purchased used tesla abs picked rockli...,amazing purchased used tesla ab picked rocklin...,0.044318,0.49375
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,do you love having your car held hostage for t...,do you love having your car held hostage for t...,love car held hostage things arent fault locat...,love car held hostage things arent fault locat...,love car held hostage thing arent fault locati...,-0.066667,0.566667
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,i am appalled by the poor service at this tesl...,i am appalled by the poor service at this tesl...,appalled poor service tesla location first can...,appalled poor service tesla location first can...,appalled poor service tesla location first can...,-0.039773,0.462085


In [91]:
# Drop the intermediate text-cleaning columns. Reassigning the result (rather
# than inplace=True) together with errors="ignore" keeps the cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df = df.drop(
    columns=["lowercase", "Punctuation", "Stopwords", "Clean Review", "lemmatized"],
    errors="ignore",
)

In [94]:
df.sort_values(by="polarity")

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,polarity,subjectivity
7,Had impossible time with this service center a...,40,217,4.45,20,0.5,-0.07619,0.527143
3,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,-0.066667,0.566667
9,Horrible customer service:1) Placed order via ...,293,1571,4.327645,121,0.412969,-0.065909,0.6
6,I ended up filing a complaint against them wit...,54,322,4.981481,26,0.481481,-0.054545,0.251515
4,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,-0.039773,0.462085
5,Dear dad can you buy me a Tesla to drive at gr...,13,56,3.384615,6,0.461538,0.0,0.0
2,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,0.044318,0.49375
8,Typically I go to the Tesla Berkeley service c...,231,1328,4.753247,90,0.38961,0.097917,0.41875
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,0.213542,0.458333
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,0.4,0.375
