# Exploratory Data Analysis

In [12]:
import pandas as pd
import plotly.express as px

In [13]:
# Load annotated reviews DataFrame
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
# The path is relative, so it resolves against the kernel's working directory.

data = pd.read_pickle("../pickled_files/annotated_reviews_df2.pkl")

In [14]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,text,terms,aspects,fixed_review
0,But the staff was so horrible to us.,[staff],[service],But the staff was so horrible to us.
1,"To be completely fair, the only redeeming fact...",[food],"[food, anecdotes/miscellaneous]","To be completely fair, the only redeeming fact..."
2,"The food is uniformly exceptional, with a very...","[food, kitchen, menu]",[food],"The food is uniformly exceptional, with a very..."
3,Where Gabriela personaly greets you and recomm...,[],[service],Where Gabriela personaly greets you and recomm...
4,"For those that go once and don't enjoy it, all...",[],[anecdotes/miscellaneous],"For those that go once and don't enjoy it, all..."


# Aspect value counts in reviews

In [15]:
# Frequency of each distinct aspect *list*. The same labels in a different
# order (e.g. [food, price] vs [price, food]) are tallied as separate values,
# so these are counts of combinations, not of individual aspects.
data["aspects"].value_counts()

[anecdotes/miscellaneous]                             992
[food]                                                798
[service]                                             336
[ambience]                                            234
[food, price]                                         109
[price]                                               108
[food, service]                                       106
[food, ambience]                                       48
[service, ambience]                                    40
[food, service, ambience]                              37
[food, anecdotes/miscellaneous]                        28
[anecdotes/miscellaneous, ambience]                    23
[anecdotes/miscellaneous, food]                        22
[anecdotes/miscellaneous, price]                       18
[price, food]                                          17
[anecdotes/miscellaneous, service]                     13
[service, food]                                        13
[service, anec

# Fixing value count for aspects
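Some reviews cover multiple aspects, so counting whole aspect lists (as above) spreads them over many combination buckets and even treats `[food, price]` and `[price, food]` as different values. The code below instead counts each aspect label individually, so a multi-aspect review contributes once to every aspect it mentions.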

In [16]:
# Count how many reviews mention each aspect. A review tagged with several
# aspects contributes once to every aspect in its list.
# explode() gives each list element its own row (an empty list becomes NaN,
# which value_counts() drops), so the tally needs no explicit Python loop.
aspect_counts = data["aspects"].explode().value_counts()

# Validate the label set explicitly: a catch-all branch would silently book a
# typo or a new category under the wrong aspect and inflate its count.
expected_aspects = {"anecdotes/miscellaneous", "food", "service", "ambience", "price"}
unexpected_aspects = set(aspect_counts.index) - expected_aspects
assert not unexpected_aspects, "unexpected aspect labels: {}".format(
    sorted(map(str, unexpected_aspects))
)

# Plain ints (0 when a label never occurs) under the same names the following
# cells read.
anecdote = int(aspect_counts.get("anecdotes/miscellaneous", 0))
food = int(aspect_counts.get("food", 0))
service = int(aspect_counts.get("service", 0))
ambience = int(aspect_counts.get("ambience", 0))
price = int(aspect_counts.get("price", 0))

In [17]:
# Show the per-aspect totals as (anecdote, food, service, ambience, price).
(anecdote, food, service, ambience, price)

(1133, 1233, 597, 432, 319)

# Create new DataFrame using aspect names and fixed values

In [18]:
# Aspect labels and their per-aspect review counts; the two lists are aligned
# by position, so their order must match.
names = ["anecdotes/miscellaneous", "food", "service", "ambience", "price"]
values = [anecdote, food, service, ambience, price]

# Two-column frame with one row per aspect, built in a single step.
new_data = pd.DataFrame({"names": names, "values": values})

In [19]:
# Preview the per-aspect summary frame (first five rows).
new_data.head(n=5)

Unnamed: 0,names,values
0,anecdotes/miscellaneous,1133
1,food,1233
2,service,597
3,ambience,432
4,price,319


In [None]:
# Bar chart of the per-aspect review counts: one bar per aspect, coloured by
# its count, with the raw column names relabelled for display.
fig = px.bar(
    new_data,
    x="names",
    y="values",
    color="values",
    hover_data=["values"],
    labels={"names": "aspects", "values": "Review Count"},
    title="Review Frequency for separate aspects",
    height=400,
)

fig.show()

In [None]:
# Pie chart of the per-aspect counts, each slice sized by its share of the total.
fig = px.pie(
    new_data,
    names="names",
    values="values",
    color="values",
)
fig.update_layout(title="Aspects distribution in total reviews")
fig.show()