In [64]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [65]:
# Define the base URL
Dunkin_url = "https://www.consumeraffairs.com/food/dunkin_donuts.html"
Starbucks_url = "https://www.consumeraffairs.com/food/starbucks.html"
Ihop_url = "https://www.consumeraffairs.com/food/ihop.html"
Waffle_url = "https://www.consumeraffairs.com/food/waffle_house.html"

In [66]:
# Create an empty list to store all review
all_pages_reviews =[]

In [67]:
# Create a Scraper function
def scraper():
	# Web scraping - fetching the reviews from the webpage using BeautifulSoup

	# loop through a range of page numbers 
	for i in range(1,6): # fetching reviews from five pages

		# Creating an empty list to store the reviews of each page
		pagewise_reviews = [] 

		# Query parameter
		query_parameter = "?page="+str(i)

		# Constructing the URL
		url = Ihop_url + query_parameter

		
		# Send HTTP request to the URL
		response = requests.get(url)

		# Create a soup object and parse the HTML page
		soup = bs(response.content, 'html.parser') 

		# Finding all the elements having reviews using class attribute
		rev_div = soup.findAll("div",attrs={"class","rvw-bd"}) 

		# loop through all the divs and append 
		for j in range(len(rev_div)):
			# finding all the p tags to fetch only the review text
			pagewise_reviews.append(rev_div[j].find("p").text)

		# writing all the reviews into a list
		for k in range(len(pagewise_reviews)):
			all_pages_reviews.append(pagewise_reviews[k]) 

	# return the final list of reviews
	return all_pages_reviews

# Driver code
reviews = scraper()

# Storing in a dataframe
i = range(1, len(reviews)+1)
reviews_df = pd.DataFrame({'review':reviews}, index=i)

# Writing to a text file
reviews_df.to_csv('reviews.txt', sep='\t')

In [68]:
reviews_df

Unnamed: 0,review
1,Me and my family went to IHOP in Temple Terrac...
2,Went to IHOP at 12755 SW 88 street in Miami Fl...
3,Rude management. Went to pick up an order was ...
4,We waited for our food for a long time. My fia...
5,Our son took us out to eat as a gift to his Da...
...,...
86,This is the only local restaurant where I feel...
87,The IHOP on Staten Island shut down because it...
88,The IHOP I go to is real good as far as gettin...
89,They have good breakfast food but are overpric...


Data Cleaning

In [69]:
import re
from nltk.corpus import stopwords

In [70]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [71]:
stop = stopwords.words('english')
reviews_df['review'] = reviews_df['review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
reviews_df['review'].head(20)

1     Me family went IHOP Temple Terrace, Florida. W...
2     Went IHOP 12755 SW 88 street Miami Florida eve...
3     Rude management. Went pick order told I leave ...
4     We waited food long time. My fiancee hash brow...
5     Our son took us eat gift Daddy. We arrived noo...
6     My family I ate IHOP location 2250 E Lincoln A...
7     It horrific service planet earth. I certain I ...
8     After asked wait text seated, waiting 45 minut...
9     After waiting 15 minutes even seated waited an...
10    Every time I get food IHOP I always get sick. ...
11    I purchased meal family Uber Eats. Now I know ...
12    We got seated right away nice came order took ...
13    Poor service, dirty floor, practicing social d...
14                                     Got order 3 hrs.
15    I always loved place last time disgrace, waite...
16    I signed free stack pancakes 10/31/20. In IHOP...
17    The IHOP Bellmead, TX place I would recommend....
18    I Ihop last night October 21, 2020 Culpepe

In [72]:
reviews_df['review'] = reviews_df['review'].str.replace('[^\w\s]','')

  reviews_df['review'] = reviews_df['review'].str.replace('[^\w\s]','')


In [73]:
reviews_df.head()

Unnamed: 0,review
1,Me family went IHOP Temple Terrace Florida We ...
2,Went IHOP 12755 SW 88 street Miami Florida eve...
3,Rude management Went pick order told I leave f...
4,We waited food long time My fiancee hash brown...
5,Our son took us eat gift Daddy We arrived noon...


Vader

In [74]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
SIA = SentimentIntensityAnalyzer()



In [75]:
list = []
for row in reviews_df['review']:
    vs = SIA.polarity_scores(row)
    list.append(vs)

df = pd.DataFrame(list)
df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.21,0.71,0.08,-0.7506
1,0.036,0.867,0.098,0.6808
2,0.165,0.696,0.14,-0.0335
3,0.088,0.912,0.0,-0.4019
4,0.069,0.832,0.099,0.3612


In [76]:
df_desc = pd.concat([reviews_df.reset_index(drop=True), df], axis=1)

In [77]:
df_desc.head(20)

Unnamed: 0,review,neg,neu,pos,compound
0,Me family went IHOP Temple Terrace Florida We ...,0.21,0.71,0.08,-0.7506
1,Went IHOP 12755 SW 88 street Miami Florida eve...,0.036,0.867,0.098,0.6808
2,Rude management Went pick order told I leave f...,0.165,0.696,0.14,-0.0335
3,We waited food long time My fiancee hash brown...,0.088,0.912,0.0,-0.4019
4,Our son took us eat gift Daddy We arrived noon...,0.069,0.832,0.099,0.3612
5,My family I ate IHOP location 2250 E Lincoln A...,0.202,0.753,0.046,-0.9608
6,It horrific service planet earth I certain I e...,0.176,0.677,0.147,-0.6115
7,After asked wait text seated waiting 45 minute...,0.0,0.935,0.065,0.3182
8,After waiting 15 minutes even seated waited an...,0.156,0.775,0.069,-0.8853
9,Every time I get food IHOP I always get sick I...,0.093,0.765,0.142,-0.0772


Export CSV

In [78]:
df_desc.to_csv(r'C:\\Users\\claud\\BIG DATA COURSE\\NEW TERM\\[2.01 SOCIAL DATA MINING]\\Test.csv', index = False)

# Data storage in MongoDB

In [79]:
# Data storage Libraries
import pymongo
from pymongo import MongoClient

In [80]:
# Load the dataset
data = df_desc

In [81]:
# Create new dataframe with sentiment analysis
df1= pd.DataFrame(list)
df_desc = pd.concat([reviews_df.reset_index(drop=True), df1], axis=1)
df_desc['sentiment'] = df_desc['compound'].apply(lambda score: 'positive' if score>=0.01 else 'negative' if score<=0.01 else 'neutral')



In [82]:
# Connect to MongoDB
client = MongoClient("mongodb+srv://yuxinliu:liu@cluster0.myrqi.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")    
db = client["Final_assignment"]   
collection = db["Ihop"]

In [83]:
data.reset_index(inplace=True)
data_dict = data.to_dict("records")

In [84]:
# Insert collection
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x7fd70d62f940>