# Web Scraping

In [None]:
import pandas as pd
import numpy as np

In [None]:
import requests

In [None]:
from bs4 import BeautifulSoup

In [None]:
#pd.set_option('display.max_rows', 5000)

In [None]:
# Fetch the Yelp business page. The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get('https://www.yelp.com/biz/tesla-san-francisco?osq=Tesla+Dealership', timeout=30)
# Fail loudly on a 4xx/5xx response so later cells never parse an error page as if it held reviews.
r.raise_for_status()

In [None]:
r.status_code

200

In [None]:
r.text

'<!DOCTYPE html><html lang="en-US" prefix="og: http://ogp.me/ns#" style="margin: 0;padding: 0; border: 0; font-size: 100%; font: inherit; vertical-align: baseline;"><head><script>document.documentElement.className=document.documentElement.className.replace(/\x08no-js\x08/,"js");</script><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><meta http-equiv="Content-Language" content="en-US" /><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><link rel="mask-icon" sizes="any" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/b2bb2fb0ec9c/assets/img/logos/yelp_burst.svg" content="#FF1A1A"><link rel="shortcut icon" href="https://s3-media0.fl.yelpcdn.com/assets/srv0/yelp_large_assets/dcfe403147fc/assets/img/logos/favicon.ico"><script> window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;window.ygaPageStartTime=new Date().getTime();</script><script>\n            window.yelp = window.yelp || {};\

In [None]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=r.text, features='html.parser')

In [None]:
# Collect every element carrying the review-comment CSS classes.
# find_all is the current Beautiful Soup spelling; findAll is only a legacy alias.
# NOTE(review): these class names look build-generated and will probably change when
# Yelp redeploys - confirm the selector still matches before trusting an empty result.
p = soup.find_all(class_="comment__09f24__D0cxf yelp-emotion-19a3fsb")

In [None]:
p

[<p class="comment__09f24__D0cxf yelp-emotion-19a3fsb"><span class="raw__09f24__T4Ezm" lang="en">Staff is very professional and courteous.  Once you get inside, it's beautiful and spacious.  The waiting room has free drinks, coffee and snacks.  Plenty of tables and couches to relax or get on your iPad:  <br/><br/>I would give them 5 stars, but I'm. It crazy about the location on Van Ness.</span></p>,
 <p class="comment__09f24__D0cxf yelp-emotion-19a3fsb"><span class="raw__09f24__T4Ezm" lang="en">Came here to get warranty work done as my driver side headlight went bad.  Came in 30min early to my appointment in the morning and the service guys greeted me quickly and took me in early no issues.  They told me it would take roughly an hour to get the job done. The work ended up taking 55min exactly so pretty accurate estimate.  Got my car back with everything working and not a single scratch or dent from service guys.  A+ work thank you!</span></p>,
 <p class="comment__09f24__D0cxf yelp-emo

In [None]:
# Keep only the text of the first <span> found inside each matched review element.
reviews = [review_tag.find('span').text for review_tag in p]

In [None]:
reviews

["Staff is very professional and courteous.  Once you get inside, it's beautiful and spacious.  The waiting room has free drinks, coffee and snacks.  Plenty of tables and couches to relax or get on your iPad:  I would give them 5 stars, but I'm. It crazy about the location on Van Ness.",
 'Came here to get warranty work done as my driver side headlight went bad.  Came in 30min early to my appointment in the morning and the service guys greeted me quickly and took me in early no issues.  They told me it would take roughly an hour to get the job done. The work ended up taking 55min exactly so pretty accurate estimate.  Got my car back with everything working and not a single scratch or dent from service guys.  A+ work thank you!',
 'I was told I would get an IRS tax rebate, even thought I told the salesmen my income.  I cross-verified that information with another salesman who said, "just write it in and you will get the rebate."  I had told them my annual income, and that I wold only bu

In [None]:
reviews[0]

"Staff is very professional and courteous.  Once you get inside, it's beautiful and spacious.  The waiting room has free drinks, coffee and snacks.  Plenty of tables and couches to relax or get on your iPad:  I would give them 5 stars, but I'm. It crazy about the location on Van Ness."

# Analysing the Data

In [None]:
# One row per scraped review. Building the frame straight from the list avoids the
# NumPy round-trip, which would create a fixed-width unicode array (every string padded
# to the longest review) and a float64 column when the list is empty.
df = pd.DataFrame({"review": reviews})

In [None]:
df.head()

Unnamed: 0,review
0,Staff is very professional and courteous. Onc...
1,Came here to get warranty work done as my driv...
2,"I was told I would get an IRS tax rebate, even..."
3,I am giving one star for the young man Kenny. ...
4,Helena KElon Musk!Is climbing the highest moun...


In [None]:
len(df['review'])

10

In [None]:
# Number of whitespace-separated tokens in each review.
df['word count'] = df['review'].str.split().str.len()

In [None]:
df.head()

Unnamed: 0,review,word count
0,Staff is very professional and courteous. Onc...,51
1,Came here to get warranty work done as my driv...,84
2,"I was told I would get an IRS tax rebate, even...",92
3,I am giving one star for the young man Kenny. ...,59
4,Helena KElon Musk!Is climbing the highest moun...,125


In [None]:
# Total number of characters (spaces and punctuation included) in each review.
df['char count'] = df['review'].str.len()

In [None]:
df.head()

Unnamed: 0,review,word count,char count
0,Staff is very professional and courteous. Onc...,51,284
1,Came here to get warranty work done as my driv...,84,445
2,"I was told I would get an IRS tax rebate, even...",92,484
3,I am giving one star for the young man Kenny. ...,59,303
4,Helena KElon Musk!Is climbing the highest moun...,125,745


In [None]:
def average_words(x):
  """Return the mean number of characters per whitespace-separated token in ``x``.

  Tokens come from ``str.split()`` with no arguments, so punctuation attached to a
  word counts towards that word's length.

  Args:
    x: Text to measure.

  Returns:
    Total token characters divided by the token count, or ``0.0`` when ``x``
    contains no tokens (empty or whitespace-only text).
  """
  words = x.split()
  # Guard against ZeroDivisionError for empty / whitespace-only reviews.
  if not words:
    return 0.0
  return sum(len(word) for word in words) / len(words)

In [None]:
# Mean token length per review; the function is passed directly, no wrapper lambda needed.
df['average_word_length'] = df['review'].map(average_words)

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length
0,Staff is very professional and courteous. Onc...,51,284,4.509804
1,Came here to get warranty work done as my driv...,84,445,4.261905
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25
3,I am giving one star for the young man Kenny. ...,59,303,4.152542
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96


In [None]:
from nltk.corpus import stopwords

In [None]:
!pip install nltk



In [None]:
import nltk

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = stopwords.words('english')

In [None]:
len(stop_words)

179

In [None]:
# Build the lookup set once: testing membership against the list rescans it for every token.
stop_word_lookup = set(stop_words)
# Count tokens whose lower-cased form is a stop word. Tokens are split on whitespace only,
# so a word with attached punctuation (e.g. "it.") is not counted.
df['stopword_count'] = df['review'].apply(
    lambda text: sum(token.lower() in stop_word_lookup for token in text.split())
)

In [None]:
# Share of each review's tokens that are stop words.
df['stopword_rate'] = df['stopword_count'].div(df['word count'])

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384


In [None]:
df.sort_values(by='stopword_rate')

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384
6,Tesla comes with self drive as long as you hav...,42,219,4.238095,18,0.428571
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098
8,In a nutshell: Tesla sucks! I leased one of th...,136,734,4.404412,65,0.477941
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087
9,I waited for 25 mins and no one even acknowled...,59,278,3.728814,31,0.525424
5,I took my car to Tesla for a recall. First- th...,96,454,3.739583,51,0.53125
7,Wow! The best tesla service center I have ever...,109,531,3.880734,58,0.53211


In [None]:
df.describe()

Unnamed: 0,word count,char count,average_word_length,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,85.3,447.7,4.212589,40.3,0.473057
std,32.152588,184.841103,0.374128,15.635075,0.058883
min,42.0,219.0,3.728814,18.0,0.380952
25%,59.0,288.75,3.948686,30.25,0.434174
50%,88.0,449.5,4.244048,39.5,0.493208
75%,105.75,519.25,4.368785,50.25,0.521785
max,136.0,745.0,4.96,65.0,0.53211


# Data Cleaning

In [None]:
df['review']

0    Staff is very professional and courteous.  Onc...
1    Came here to get warranty work done as my driv...
2    I was told I would get an IRS tax rebate, even...
3    I am giving one star for the young man Kenny. ...
4    Helena KElon Musk!Is climbing the highest moun...
5    I took my car to Tesla for a recall. First- th...
6    Tesla comes with self drive as long as you hav...
7    Wow! The best tesla service center I have ever...
8    In a nutshell: Tesla sucks! I leased one of th...
9    I waited for 25 mins and no one even acknowled...
Name: review, dtype: object

In [None]:
# Lower-case every review and collapse each run of whitespace to a single space.
df['lowercase'] = df['review'].str.lower().str.split().str.join(" ")

In [None]:
# Strip every character that is neither a word character nor whitespace.
# regex=True is required: without it pandas treats the pattern as a literal string
# and no punctuation is removed. The raw string avoids invalid-escape warnings for \w and \s.
df['punctuation'] = df['lowercase'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words('english')

In [None]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Build the lookup set once: testing membership against the list rescans it for every token.
stop_word_lookup = set(stop_words)
# Rebuild each review from the tokens that are not stop words.
df['stopwords'] = df['punctuation'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_word_lookup)
)

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098,staff is very professional and courteous. once...,staff is very professional and courteous. once...,"staff professional courteous. get inside, beau..."
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952,came here to get warranty work done as my driv...,came here to get warranty work done as my driv...,came get warranty work done driver side headli...
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087,"i was told i would get an irs tax rebate, even...","i was told i would get an irs tax rebate, even...","told would get irs tax rebate, even thought to..."
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475,i am giving one star for the young man kenny. ...,i am giving one star for the young man kenny. ...,giving one star young man kenny. great persona...
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing highest mount wo...


In [None]:
# Pool the tokens of every stop-word-filtered review and show the 30 most frequent.
stopword_free_tokens = " ".join(df['stopwords']).split()
pd.Series(stopword_free_tokens).value_counts().head(30)

tesla          11
get             9
one             7
told            6
even            5
service         5
work            4
car             4
take            4
would           4
call            3
working         3
appointment     3
say             3
back            3
buy             2
early           2
issues.         2
number          2
took            2
another         2
salesman        2
side            2
best            2
hands           2
got             2
need            2
wheel.          2
salesmen        2
going           2
Name: count, dtype: int64

In [None]:
# Hand-picked extra filler words, removed in addition to the NLTK English stop words.
# NOTE(review): presumably chosen from the token-frequency table; confirm 'next' belongs here.
other_stop_words = ['get', 'even', 'next', 'back', 'took','take','would','going']

In [None]:
len(other_stop_words)

8

In [None]:
def _drop_other_stop_words(text):
  """Return ``text`` without the tokens listed in ``other_stop_words``."""
  kept_tokens = [token for token in text.split() if token not in other_stop_words]
  return " ".join(kept_tokens)

df['cleaned_review'] = df['stopwords'].map(_drop_other_stop_words)

In [None]:
# Pool the tokens of every cleaned review and show the 30 most frequent.
cleaned_tokens = " ".join(df['cleaned_review']).split()
pd.Series(cleaned_tokens).value_counts().head(30)

tesla          11
one             7
told            6
service         5
work            4
car             4
call            3
appointment     3
say             3
working         3
hands           2
early           2
number          2
issues.         2
got             2
need            2
wheel.          2
side            2
salesmen        2
another         2
salesman        2
buy             2
able            2
check           2
make            2
man             2
great           2
insurance       2
best            2
person          2
Name: count, dtype: int64

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleaned_review
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098,staff is very professional and courteous. once...,staff is very professional and courteous. once...,"staff professional courteous. get inside, beau...","staff professional courteous. inside, beautifu..."
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952,came here to get warranty work done as my driv...,came here to get warranty work done as my driv...,came get warranty work done driver side headli...,came warranty work done driver side headlight ...
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087,"i was told i would get an irs tax rebate, even...","i was told i would get an irs tax rebate, even...","told would get irs tax rebate, even thought to...","told irs tax rebate, thought told salesmen inc..."
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475,i am giving one star for the young man kenny. ...,i am giving one star for the young man kenny. ...,giving one star young man kenny. great persona...,giving one star young man kenny. great persona...
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing highest mount wo...,helena kelon musk!is climbing highest mount wo...


In [None]:
# Number of tokens left in each review after cleaning.
df['cleaned_review_word_count'] = df['cleaned_review'].str.split().str.len()

In [None]:
# Fraction of the original tokens that survive cleaning.
df['clean_rate'] = df['cleaned_review_word_count'].div(df['word count'])

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleaned_review,cleaned_review_word_count,clean_rate
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098,staff is very professional and courteous. once...,staff is very professional and courteous. once...,"staff professional courteous. get inside, beau...","staff professional courteous. inside, beautifu...",25,0.490196
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952,came here to get warranty work done as my driv...,came here to get warranty work done as my driv...,came get warranty work done driver side headli...,came warranty work done driver side headlight ...,46,0.547619
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087,"i was told i would get an irs tax rebate, even...","i was told i would get an irs tax rebate, even...","told would get irs tax rebate, even thought to...","told irs tax rebate, thought told salesmen inc...",40,0.434783
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475,i am giving one star for the young man kenny. ...,i am giving one star for the young man kenny. ...,giving one star young man kenny. great persona...,giving one star young man kenny. great persona...,29,0.491525
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing highest mount wo...,helena kelon musk!is climbing highest mount wo...,74,0.592


# Lemmatization

In [None]:
!pip install textblob



In [None]:
!pip install nltk



In [None]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [None]:
!pip show textblob

Name: textblob
Version: 0.17.1
Summary: Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more.
Home-page: https://github.com/sloria/TextBlob
Author: Steven Loria
Author-email: sloria1@gmail.com
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: nltk
Required-by: 


In [None]:
import textblob
dir(textblob)

['Blobber',
 'PACKAGE_DIR',
 'Sentence',
 'TextBlob',
 'Word',
 'WordList',
 '__all__',
 '__author__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__license__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_text',
 'base',
 'blob',
 'compat',
 'decorators',
 'en',
 'exceptions',
 'inflect',
 'mixins',
 'np_extractors',
 'os',
 'parsers',
 'sentiments',
 'taggers',
 'tokenizers',
 'translate',
 'utils']

In [None]:
!pip install --upgrade textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/626.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/626.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m626.3/626.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: textblob
  Attempting uninstall: textblob
    Found existing installation: textblob 0.17.1
    Uninstalling textblob-0.17.1:
      Successfully uninstalled textblob-0.17.1
Successfully installed textblob-0.18.0.post0


In [None]:
from textblob import Word

In [None]:
def _lemmatize_text(text):
  """Lemmatize each whitespace-separated token of ``text`` and re-join with spaces."""
  # lemmatize() is called without a part-of-speech argument, so TextBlob's default applies.
  return " ".join(Word(token).lemmatize() for token in text.split())

df['lemmatized'] = df['cleaned_review'].map(_lemmatize_text)

In [None]:
df.head()

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,stopwords,cleaned_review,cleaned_review_word_count,clean_rate,lemmatized
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098,staff is very professional and courteous. once...,staff is very professional and courteous. once...,"staff professional courteous. get inside, beau...","staff professional courteous. inside, beautifu...",25,0.490196,"staff professional courteous. inside, beautifu..."
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952,came here to get warranty work done as my driv...,came here to get warranty work done as my driv...,came get warranty work done driver side headli...,came warranty work done driver side headlight ...,46,0.547619,came warranty work done driver side headlight ...
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087,"i was told i would get an irs tax rebate, even...","i was told i would get an irs tax rebate, even...","told would get irs tax rebate, even thought to...","told irs tax rebate, thought told salesmen inc...",40,0.434783,"told irs tax rebate, thought told salesman inc..."
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475,i am giving one star for the young man kenny. ...,i am giving one star for the young man kenny. ...,giving one star young man kenny. great persona...,giving one star young man kenny. great persona...,29,0.491525,giving one star young man kenny. great persona...
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing the highest moun...,helena kelon musk!is climbing highest mount wo...,helena kelon musk!is climbing highest mount wo...,74,0.592,helena kelon musk!is climbing highest mount wo...


# Sentiment Analysis

In [None]:
from textblob import TextBlob

In [None]:
# Polarity: how negative or positive each review reads
# (per the TextBlob docs, a float from -1.0 = most negative to 1.0 = most positive).
# The named field replaces the positional sentiment[0] lookup.
df['polarity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [None]:
# Subjectivity: how opinionated rather than factual each review reads
# (per the TextBlob docs, a float from 0.0 = very objective to 1.0 = very subjective).
# The named field replaces the positional sentiment[1] lookup.
df['subjectivity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [None]:
# Drop the intermediate text columns now that the sentiment scores are computed.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the cell
# safe to re-run after the columns are already gone.
df = df.drop(
    columns=['lowercase', 'punctuation', 'stopwords', 'cleaned_review', 'lemmatized'],
    errors='ignore',
)

In [None]:
df.sort_values(by = 'polarity')

Unnamed: 0,review,word count,char count,average_word_length,stopword_count,stopword_rate,cleaned_review_word_count,clean_rate,polarity,subjectivity
8,In a nutshell: Tesla sucks! I leased one of th...,136,734,4.404412,65,0.477941,68,0.5,-0.2025,0.545833
5,I took my car to Tesla for a recall. First- th...,96,454,3.739583,51,0.53125,42,0.4375,-0.1475,0.556667
9,I waited for 25 mins and no one even acknowled...,59,278,3.728814,31,0.525424,26,0.440678,-0.081818,0.45
1,Came here to get warranty work done as my driv...,84,445,4.261905,32,0.380952,46,0.547619,-0.005612,0.502041
2,"I was told I would get an IRS tax rebate, even...",92,484,4.25,47,0.51087,40,0.434783,0.1,0.475
7,Wow! The best tesla service center I have ever...,109,531,3.880734,58,0.53211,47,0.431193,0.203958,0.379167
6,Tesla comes with self drive as long as you hav...,42,219,4.238095,18,0.428571,22,0.52381,0.225,0.7
0,Staff is very professional and courteous. Onc...,51,284,4.509804,23,0.45098,25,0.490196,0.27,0.76
4,Helena KElon Musk!Is climbing the highest moun...,125,745,4.96,48,0.384,74,0.592,0.352778,0.674074
3,I am giving one star for the young man Kenny. ...,59,303,4.152542,30,0.508475,29,0.491525,0.5,0.4125
