In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Download the Yelp business page whose reviews are analysed below.
# An explicit timeout is passed because requests otherwise waits indefinitely
# when the server stops responding, which would hang the notebook.
# NOTE: despite its name, `url` holds the HTTP response (later cells read
# `url.status_code` and `url.text`), not the address string.
url = requests.get(
    "https://www.yelp.com/biz/tesla-san-francisco?osq=Tesla+Dealership",
    timeout=30,
)

In [4]:
url.status_code

200

In [5]:
url.text

'<!DOCTYPE html><html lang="en-US" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(/\x08no-js\x08/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-US" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel="mask-icon" sizes="any" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b2bb2fb0ec9c/assets/img/logos/yelp_burst.svg" content="#FF1A1A"><link rel="shortcut icon" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b05852393ae5/assets/img/logos/favicon.ico"><script> window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;window.ygaPageStartTime=new Date().getTime();</script><script async src="https://www.google-analytics.com/ana

In [6]:
# Parse the body of the downloaded page using the "html.parser" backend.
soup = BeautifulSoup(markup=url.text, features="html.parser")

In [7]:
# Collect every element carrying this class string; in the recorded run these
# are the <p> tags that wrap each review's text.
# `find_all` is the current method name; `findAll` is a deprecated alias.
# NOTE(review): the class names look auto-generated and may change when the
# site is redeployed - confirm the selector still matches before relying on it.
divs = soup.find_all(class_="comment__09f24__gu0rG css-qgunke")
print(divs)

[<p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">Consider the Tesla Model 3. Mine is fun, which brings joy and contentment.  I look forward to each drive. <br/><br/>Over the air updates and a good autopilot system keeps the car renewable and removes some of the mind numbing heavy traffic.</span></p>, <p class="comment__09f24__gu0rG css-qgunke"><span class="raw__09f24__T4Ezm" lang="en">This is for our amazing experience with the service center.  They went above and beyond for us and it was the best repair experience we've ever had with any car.<br/><br/>We had a warranty repair on our Model Y that was going to take a long time to complete.  The shop was incredibly understanding and helpful about our kids and car seats and worked with us to plan ahead.  <br/><br/>When the day came they fixed a major issue for us 40% faster than expected, and they fixed a minor issue even though we didn't think the parts were in stock!  Finally, they were ultra flexib

In [8]:
# Pull the text of the first <span> inside each matched element.
reviews = [tag.find('span').text for tag in divs]

In [9]:
reviews[0]

'Consider the Tesla Model 3. Mine is fun, which brings joy and contentment. \xa0I look forward to each drive. Over the air updates and a good autopilot system keeps the car renewable and removes some of the mind numbing heavy traffic.'

# Analysing the data

In [10]:
import pandas as pd
import numpy as np

In [11]:
# One row per scraped review. The list is handed to pandas directly: going
# through np.array() first only adds a fixed-width string copy, and for an
# empty list it would produce a float64 column instead of a text column.
df = pd.DataFrame({"review": reviews})

In [12]:
len(df['review'])

10

In [13]:
# Number of whitespace-separated tokens in each review.
df["word_count"] = df["review"].str.split().str.len()

In [14]:
# Total number of characters (including spaces and punctuation) in each review.
df['char_count'] = df['review'].str.len()

In [15]:
def avg_word(review):
  """Return the mean length, in characters, of the words in ``review``.

  Words are the whitespace-separated tokens produced by ``str.split()``, so
  attached punctuation counts towards a word's length.

  Args:
    review: The review text.

  Returns:
    The average token length as a float, or ``0.0`` when ``review`` contains
    no tokens (empty or whitespace-only text).
  """
  words = review.split()
  # Guard against division by zero for empty / whitespace-only reviews.
  if not words:
    return 0.0
  return sum(len(word) for word in words) / len(words)

# Average word length (in characters) of each review
df['avg_word'] = df['review'].apply(avg_word)

In [16]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756
1,This is for our amazing experience with the se...,110,587,4.3
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219


In [17]:
!pip3 install nltk

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 18.1 MB/s eta 0:00:01
Installing collected packages: nltk
Successfully installed nltk-3.7


[nltk_data] Downloading package stopwords to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /home/wsuser/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [18]:
# Import stopwords
from nltk.corpus import stopwords 

In [19]:
# English stop-word list from NLTK; reused by the cleaning step later on.
stop_words = stopwords.words('english')

# Count how many whitespace-separated tokens of each raw review appear in the list.
# NOTE(review): the comparison is exact, so tokens that differ from the list
# entries by capitalisation or attached punctuation are not counted - confirm
# this is the intended definition of the metric.
df['stopword_count'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_words)
)

In [20]:
df.describe()

Unnamed: 0,word_count,char_count,avg_word,stopword_count
count,10.0,10.0,10.0,10.0
mean,78.9,419.9,4.291305,33.6
std,41.364101,218.327201,0.461689,19.132288
min,13.0,56.0,3.384615,6.0
25%,54.75,312.25,4.148282,21.75
50%,66.0,337.5,4.277119,26.5
75%,118.25,612.5,4.593144,52.75
max,134.0,703.0,4.981481,58.0


In [21]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14
1,This is for our amazing experience with the se...,110,587,4.3,49
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21


In [22]:
# Share of each review's tokens that are stop words.
df["stopword_rate"] = df["stopword_count"].div(df["word_count"])

In [23]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671


In [24]:
df.sort_values(by="stopword_rate")

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463
5,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678
7,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281
6,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819
8,Dear dad can you buy me a Tesla to drive at gr...,13,56,3.384615,6,0.461538
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684
9,I ended up filing a complaint against them wit...,54,322,4.981481,26,0.481481


# Data Cleaning

In [25]:
# Lower-case every token and re-join with single spaces (this also collapses
# any run of whitespace in the original text).
df["lowercase"] = df["review"].apply(lambda text: " ".join(map(str.lower, text.split())))

In [26]:
# Strip everything that is neither a word character nor whitespace.
# `regex=True` is required: without it current pandas treats the pattern as a
# literal string and removes nothing (older versions only emitted a warning).
# The pattern is a raw string so that `\w` / `\s` are not parsed as string escapes.
df["Punctuation"] = df["lowercase"].str.replace(r'[^\w\s]', "", regex=True)

  df["Punctuation"] = df["lowercase"].str.replace('[^\w\s]',"")


In [27]:
# Remove every token that appears in the NLTK stop-word list.
df["Stopwords"] = df["Punctuation"].apply(
    lambda text: " ".join([token for token in text.split() if token not in stop_words])
)

In [28]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281,today i went to the san francisco tesla dealer...,today i went to the san francisco tesla dealer...,today went san francisco tesla dealer first ti...
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls bottom model exposi...
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671,"well, i had an issue with my tesla. took it in...",well i had an issue with my tesla took it into...,well issue tesla took service center thursday ...


In [29]:
# Pool the tokens of all cleaned reviews and show the 30 most frequent ones.
all_tokens = " ".join(df['Stopwords']).split()
pd.Series(all_tokens).value_counts().iloc[:30]

tesla         13
service        8
model          7
car            7
today          5
experience     5
us             4
went           4
new            4
repair         4
issue          3
amazing        3
would          3
time           3
kevin          3
location       3
bridget        3
warranty       3
get            3
didnt          3
back           3
drive          3
questions      3
poor           3
like           2
speak          2
features       2
way            2
gave           2
app            2
dtype: int64

In [30]:
# Additional words to filter out on top of the NLTK stop-word list.
# NOTE(review): "ap" does not show up in the frequency table above while "app"
# does - possibly a typo; confirm before changing.
other_stop_words = [
    "us",
    "get",
    "would",
    "go",
    "ap",
    "melissa",
    "kevin",
    "bridget",
    "say",
]

In [31]:
len(other_stop_words)

9

In [32]:
# Second filtering pass: drop the hand-picked extra stop words.
df["Clean Review"] = df["Stopwords"].apply(
    lambda text: " ".join([token for token in text.split() if token not in other_stop_words])
)

In [33]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281,today i went to the san francisco tesla dealer...,today i went to the san francisco tesla dealer...,today went san francisco tesla dealer first ti...,today went san francisco tesla dealer first ti...
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls bottom model exposi...,crappy plastic piece falls bottom model exposi...
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671,"well, i had an issue with my tesla. took it in...",well i had an issue with my tesla took it into...,well issue tesla took service center thursday ...,well issue tesla took service center thursday ...


# Lemmatize Text

In [34]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 9.8 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [35]:
#Import textblob
from textblob import Word

In [36]:
# Lemmatise each token of the cleaned review with TextBlob's Word.lemmatize()
# and join the results back into one string.
df["lemmatized"] = df["Clean Review"].apply(
    lambda text: " ".join([Word(token).lemmatize() for token in text.split()])
)

In [37]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review,lemmatized
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,amazing experience service center went beyond ...
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281,today i went to the san francisco tesla dealer...,today i went to the san francisco tesla dealer...,today went san francisco tesla dealer first ti...,today went san francisco tesla dealer first ti...,today went san francisco tesla dealer first ti...
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls bottom model exposi...,crappy plastic piece falls bottom model exposi...,crappy plastic piece fall bottom model exposin...
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671,"well, i had an issue with my tesla. took it in...",well i had an issue with my tesla took it into...,well issue tesla took service center thursday ...,well issue tesla took service center thursday ...,well issue tesla took service center thursday ...


# Sentiment Analysis

In [38]:
from textblob import TextBlob

In [39]:
# First field of TextBlob's sentiment tuple: the polarity score.
df["polarity"] = df["lemmatized"].apply(lambda text: TextBlob(text).sentiment.polarity)

In [40]:
# Second field of TextBlob's sentiment tuple: the subjectivity score.
df["subjectivity"] = df["lemmatized"].apply(lambda text: TextBlob(text).sentiment.subjectivity)

In [41]:
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,lowercase,Punctuation,Stopwords,Clean Review,lemmatized,polarity,subjectivity
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,"consider the tesla model 3. mine is fun, which...",consider the tesla model 3 mine is fun which b...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,consider tesla model 3 mine fun brings joy con...,0.4,0.375
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,this is for our amazing experience with the se...,this is for our amazing experience with the se...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,amazing experience service center went beyond ...,0.213542,0.458333
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281,today i went to the san francisco tesla dealer...,today i went to the san francisco tesla dealer...,today went san francisco tesla dealer first ti...,today went san francisco tesla dealer first ti...,today went san francisco tesla dealer first ti...,0.369453,0.619012
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls off the bottom of t...,crappy plastic piece falls bottom model exposi...,crappy plastic piece falls bottom model exposi...,crappy plastic piece fall bottom model exposin...,0.125,0.725
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671,"well, i had an issue with my tesla. took it in...",well i had an issue with my tesla took it into...,well issue tesla took service center thursday ...,well issue tesla took service center thursday ...,well issue tesla took service center thursday ...,0.0,0.413333


In [42]:
# Remove the intermediate text-cleaning columns, keeping the raw review, the
# count statistics and the sentiment scores.
# `errors="ignore"` makes the cell safe to re-run: the previous in-place drop
# raised KeyError on a second execution because the columns were already gone.
df = df.drop(
    columns=["lowercase", "Punctuation", "Stopwords", "Clean Review", "lemmatized"],
    errors="ignore",
)

In [43]:
df.sort_values(by="polarity")

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,polarity,subjectivity
5,Do you love having your car held hostage for t...,59,309,4.254237,24,0.40678,-0.066667,0.566667
9,I ended up filing a complaint against them wit...,54,322,4.981481,26,0.481481,-0.054545,0.251515
7,I am appalled by the poor service at this Tesl...,134,696,4.171642,58,0.432836,-0.039773,0.462085
4,"Well, I had an issue with my Tesla. Took it in...",73,350,3.808219,21,0.287671,0.0,0.413333
8,Dear dad can you buy me a Tesla to drive at gr...,13,56,3.384615,6,0.461538,0.0,0.0
6,Kevin and Bridget were amazing. I purchased a ...,127,703,4.543307,57,0.448819,0.044318,0.49375
3,Crappy plastic piece falls off the bottom of t...,57,325,4.719298,27,0.473684,0.125,0.725
1,This is for our amazing experience with the se...,110,587,4.3,49,0.445455,0.213542,0.458333
2,Today I went to the San Francisco Tesla Dealer...,121,621,4.140496,54,0.446281,0.369453,0.619012
0,"Consider the Tesla Model 3. Mine is fun, which...",41,230,4.609756,14,0.341463,0.4,0.375
