In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import spacy
from textblob import TextBlob
from wordcloud import WordCloud

import plotly
import plotly.express as px
import plotly.graph_objects as go

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import dash_bootstrap_components as dbc
from dash.dependencies import Input, Output

In [None]:
def review_scaler(rating):
    """Cap a rating at 5.

    Values greater than 5 are returned as 5; every other value is returned
    unchanged (there is no lower bound).
    """
    return 5 if rating > 5 else rating

def hotel_type(rating):
    """Translate a numeric rating into its category label.

    5 -> "Great", 4 -> "Good", 3 -> "Neutral", 2 -> "Bad"; any other value
    (including 1, 0 and non-integral ratings) falls through to "Worse".
    """
    labelled_scores = ((5, "Great"), (4, "Good"), (3, "Neutral"), (2, "Bad"))
    for score, label in labelled_scores:
        if rating == score:
            return label
    return "Worse"

def year_extraction(date):
    """Return the text that precedes the first "-" in ``date``.

    The whole string is returned when it contains no "-".
    """
    pieces = date.split("-")
    return pieces[0]

def clean_text(text):
    """Normalise a review to lowercase ASCII letters and spaces.

    Steps, in order:
      1. lowercase the text;
      2. turn every ``*`` and non-breaking space (``\\xa0``) into a space;
      3. drop bracketed spans such as ``[edited]`` (non-greedy, single line);
      4. remove every character that is not an ASCII letter or a space.

    Args:
        text: the raw review string.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = text.lower()
    text = re.sub("[*\xa0*]", " ", text)
    # Raw string: "\[" and "\]" are invalid escapes in a normal string literal
    # and trigger a DeprecationWarning/SyntaxWarning on current Python versions.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)

    return text

def get_language(review):
    """Return the language code ``detect`` reports for ``review``.

    When ``detect`` raises ``LangDetectException`` the placeholder
    "Unidentified" is returned instead.
    """
    try:
        return detect(review)
    except LangDetectException:
        return "Unidentified"

def get_word_counts(corpus, stopwords, ngram):
    """Count n-gram occurrences across a corpus, most frequent first.

    Args:
        corpus: iterable of documents (strings) passed to ``CountVectorizer``.
        stopwords: forwarded as ``stop_words`` (e.g. ``"english"`` or ``None``).
        ngram: n-gram size; used as both bounds of ``ngram_range``.

    Returns:
        A list of ``(term, count)`` tuples sorted by count in descending order.
    """
    vec = CountVectorizer(stop_words=stopwords, ngram_range=(ngram, ngram))
    # fit_transform learns the vocabulary and builds the count matrix in one
    # pass; the previous fit(...) + transform(...) tokenised the corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # 1 x n_terms row of column totals
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)

    return words_freq

In [None]:
# Working directory that holds the raw export. Set the HOTEL_REVIEWS_DIR
# environment variable to run on another machine; the fallback is the
# original author's location.
path = os.environ.get(
    "HOTEL_REVIEWS_DIR",
    "C:/Users/manis/Desktop/PythonRepositories/Hotel_Reviews",
)
os.chdir(path)

df = pd.read_csv("Datafiniti_Hotel_Reviews.csv")

# Keep only the columns used by the analysis and give them short names.
df = df[["address", "categories", "city", "country", "latitude", "longitude", "name",
         "postalCode", "province", "reviews.date", "reviews.rating", "reviews.text"]]

df.columns = ["Address", "Categories", "City", "Country", "Latitude", "Longitude",
              "Name", "PostalCode", "Province", "Date", "Ratings", "Reviews"]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame that was read from disk.
df = df.dropna().copy()

# Keep English reviews only. The boolean filter returns a slice, so copy again
# before adding columns (avoids SettingWithCopyWarning / chained assignment).
df["Language"] = df["Reviews"].apply(get_language)
df = df[df["Language"] == "en"].copy()

# Ratings become integers (fractional parts are truncated) capped at 5.
df["Ratings"] = df["Ratings"].astype(int)
df["Ratings"] = df["Ratings"].apply(review_scaler)

df["Clean_Reviews"] = df["Reviews"].apply(clean_text)

df["Year"] = df["Date"].apply(year_extraction)

# Per-review features computed from the original (uncleaned) text.
df['Polarity'] = df['Reviews'].map(lambda text: TextBlob(text).sentiment.polarity)
df['Character_Count'] = df['Reviews'].astype(str).apply(len)
df['Word_Count'] = df['Reviews'].apply(lambda x: len(str(x).split()))

In [None]:
# Load the table that the dashboard cells below read. This rebinds `df`, so the
# frame built by the preprocessing cell above is discarded at this point.
# NOTE(review): nothing in the visible notebook writes "new.csv" - presumably it
# is a saved copy of the preprocessed frame; confirm and add the export step so
# Restart & Run All works from a clean checkout.
df = pd.read_csv("new.csv")
df.head()

In [None]:
# One dropdown entry per distinct province, followed by the "All" sentinel that
# the callbacks compare against.
drop_down_options = [
    {'label': province, 'value': province} for province in df["Province"].unique()
] + [{"label": "All", "value": "All"}]

In [None]:
# Dash application; the Bootstrap stylesheet is loaded as an external stylesheet.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

# Filter panel: a state dropdown and a rating radio group, both defaulting to "All".
controls = dbc.Card(
    [
        dbc.FormGroup(
            [
                dbc.Label("State", style={"color": "white"}),
                dcc.Dropdown(
                    id="dropdown-filter",
                    options=drop_down_options,
                    value='All',
                ),
            ]
        ),
        dbc.FormGroup(
            [
                dbc.Label("Rating", style={"color": "white"}),
                dbc.RadioItems(
                    id="radioitems-inline-input",
                    # Ratings 1-5 carry integer values; "All" stays a string sentinel.
                    options=[{'label': str(stars), 'value': stars} for stars in range(1, 6)]
                    + [{'label': 'All', 'value': "All"}],
                    value="All",
                    inline=True,
                    style={"color": "white"},
                ),
            ]
        ),
    ],
    body=True,
    style={"marginRight": "0", "backgroundColor": "#111111"},
)

# Map panel: the main bubble map stacked above the strip of mini maps. Each
# graph sits in its own Div > FormGroup wrapper with the mode bar hidden.
maps = dbc.Card(
    [
        html.Div(
            [
                dbc.FormGroup(
                    [dcc.Graph(id=graph_id, config={'displayModeBar': False})]
                )
            ],
        )
        for graph_id in ("bubble-graph", "mini-graph")
    ],
    body=True,
    style={"backgroundColor": "#111111"},
)

# Table card; its rows are supplied through the 'data' property of "data-table".
hotel_table = dbc.Card(
    [
        html.H6("Top Rated Hotels", style={"text-align": "center", "color": "white"}),
        dbc.FormGroup(
            [
                dash_table.DataTable(
                    id="data-table",
                    # Column ids must match the keys of the row dicts.
                    columns=[
                        dict(name="Hotel", id="Name"),
                        dict(name="Average Rating", id="Average_Rating"),
                    ],
                    style_table=dict(height='300px', width='438px'),
                    style_header=dict(backgroundColor='rgb(30, 30, 30)'),
                    style_cell=dict(
                        textAlign='center',
                        backgroundColor='rgb(50, 50, 50)',
                        color='white',
                    ),
                )
            ]
        ),
    ],
    body=True,
    style={"height": "16rem", "backgroundColor": "#111111"},
)

def _stat_card(title, value_id):
    """Build a dark 10rem card: centred white heading, rule, and an empty H1.

    The H1 carries ``value_id`` so its children can be set by a callback.
    """
    return dbc.Card(
        [
            dbc.FormGroup(
                [
                    html.H5(title, style={"text-align": "center", "color": "white"}),
                    html.Hr(),
                    html.H1(id=value_id, style={"text-align": "center", "color": "white"}),
                ]
            )
        ],
        body=True,
        style={"height": "10rem", "backgroundColor": "#111111"},
    )


# Headline figures shown under the table.
count = _stat_card("Number of Reviews", "review-count")

mean = _stat_card("Average Rating", "mean-rating")

# Controls for the n-gram section: n-gram size (1-4) and stop-word handling.
gram_filter1 = dbc.Card(
    [
        dbc.FormGroup(
            [
                html.H5("N-Gram Analysis"),
                dbc.Label("Choose Gram"),
                dbc.RadioItems(
                    id="gram-radio",
                    options=[
                        {'label': gram_label, 'value': size}
                        for size, gram_label in enumerate(
                            ('Uni-Gram', 'Bi-Gram', 'Tri-Gram', 'Quad-Gram'), start=1
                        )
                    ],
                    value=1,
                    inline=False,
                ),
            ]
        ),
        dbc.FormGroup(
            [
                dbc.Label("Choose Stopword Option"),
                dbc.RadioItems(
                    id="stopword-radio",
                    # The option value is passed on as the stop-word setting.
                    options=[
                        {'label': 'Without Stopwords', 'value': 'english'},
                        {'label': 'With Stopwords', 'value': None},
                    ],
                    value='english',
                    inline=True,
                ),
            ]
        ),
    ],
    body=True,
)


# Word-cloud card; the image source is set through the 'src' property of "word-cloud".
gram_cloud = dbc.Card(
    children=[
        dbc.FormGroup(
            children=[
                html.H5("Word Cloud", style={"text-align": "center"}),
                html.Img(id="word-cloud", style={"padding": "10px 0px 0px 15px"}),
            ]
        )
    ],
    style={"height": "19.6rem"},
    body=True,
)

# Frequency chart card; the figure is set through the 'figure' property of "freq-bar".
gram_bar = dbc.Card(
    children=[
        dbc.FormGroup(
            children=[
                html.H5("Frequency Bar Chart", style={"text-align": "center"}),
                dcc.Graph(id="freq-bar", config={'displayModeBar': False}),
            ]
        )
    ],
    style={"height": "19.6rem"},
    body=True,
)

# Second n-gram selector. Its RadioItems has no id, so no callback can read it.
# NOTE(review): looks like a placeholder for sections still to be built - confirm.
gram_filter2 = dbc.Card(
    [
        dbc.FormGroup(
            [
                html.H5("N-Gram Analysis"),
                dbc.Label("Choose Gram"),
                dbc.RadioItems(
                    options=[
                        {'label': gram_label, 'value': size}
                        for size, gram_label in enumerate(
                            ('Uni-Gram', 'Bi-Gram', 'Tri-Gram', 'Quad-Gram'), start=1
                        )
                    ],
                    value=1,
                    inline=False,
                ),
            ]
        )
    ],
    body=True,
)


# Left column of the first row: filters, table, then the two headline cards
# placed side by side as inline-block Divs.
_summary_column = dbc.Col(
    [
        html.Div([controls]),
        html.Br(),
        html.Div([hotel_table]),
        html.Br(),
        html.Div(
            [
                html.Div(count, style={'width': '47%', 'display': 'inline-block'}),
                html.Div(
                    mean,
                    style={'width': '53%', 'display': 'inline-block', "padding": "0px 0px 0px 30px"},
                ),
            ]
        ),
    ],
    width=4,
)

# First row: summary column (4/12) next to the maps (8/12).
_overview_row = dbc.Row([_summary_column, dbc.Col([maps], width=8)])

# Second row: n-gram controls, word cloud and frequency chart (3 + 4 + 5 = 12).
_ngram_row = dbc.Row(
    [
        dbc.Col([gram_filter1], width=3),
        dbc.Col([gram_cloud], width=4),
        dbc.Col([gram_bar], width=5),
    ]
)

# Three further rows, each preceded by a line break and each holding the same
# gram_filter2 card.
_trailing_rows = [
    component
    for _row_number in range(3)
    for component in (html.Br(), dbc.Row([dbc.Col([gram_filter2], width=2)]))
]

app.layout = dbc.Container(
    [
        html.H1("Hotel Review Analysis", style={"text-align": "center", "color": "white"}),
        html.Hr(),
        _overview_row,
        html.Br(),
        _ngram_row,
    ]
    + _trailing_rows,
    fluid=True,
    style={"background-color": "#323233"},
)

@app.callback(
    Output('bubble-graph', 'figure'),
    [Input('dropdown-filter', 'value'),
     Input('radioitems-inline-input', 'value')])
def update_geomap(selected_state, selected_rating):
    """Build the bubble map of hotels for the chosen state and rating.

    Reviews are optionally restricted to ``selected_state``, averaged per
    (Latitude, Longitude, Name), and the mean is cast to int (fractions are
    truncated). Each hotel is labelled with ``hotel_type`` and, unless
    ``selected_rating`` is "All", only hotels whose truncated average equals
    it are kept.

    Returns:
        A Plotly figure: one marker per hotel, coloured by type and sized by
        the truncated average rating.
    """
    if selected_state == "All":
        reviews = df
    else:
        reviews = df[df["Province"] == selected_state]

    hotels = (
        reviews
        .groupby(["Latitude", "Longitude", "Name"])
        .agg(Average_Rating=pd.NamedAgg(column="Ratings", aggfunc="mean"))
        .reset_index()
    )
    hotels["Average_Rating"] = hotels["Average_Rating"].astype(int)
    hotels["Type"] = hotels["Average_Rating"].apply(hotel_type)

    if selected_rating != "All":
        hotels = hotels[hotels["Average_Rating"] == selected_rating]

    fig = px.scatter_geo(
        hotels,
        lat="Latitude",
        lon="Longitude",
        scope='usa',
        color="Type",
        size="Average_Rating",
        opacity=0.7,
        size_max=10,
        template='plotly_dark',
    )
    fig.update_layout(
        autosize=False,
        width=900,
        height=400,
        margin=dict(l=50, r=0, b=0, t=0, pad=20),
    )

    return fig

@app.callback(
    Output('mini-graph', 'figure'),
    [Input('dropdown-filter', 'value')])
def upate_minimaps(selected_state):
    """Build a strip of five small US maps, one per average rating 1-5.

    Reviews are optionally restricted to ``selected_state``, averaged per
    (Latitude, Longitude, Name) and truncated to int. Map ``i`` shows the
    hotels whose truncated average equals ``i`` and occupies the i-th fifth
    of the figure width.

    Returns:
        A Plotly figure with five ``scattergeo`` traces, each on its own geo
        subplot ("geo1" to "geo5").
    """
    # groupby does not mutate its input, so no defensive copy is needed.
    if selected_state == "All":
        reviews = df
    else:
        reviews = df[df["Province"] == selected_state]

    hotels = (
        reviews
        .groupby(["Latitude", "Longitude", "Name"])
        .agg(Average_Rating=pd.NamedAgg(column="Ratings", aggfunc="mean"))
        .reset_index()
    )
    hotels["Average_Rating"] = hotels["Average_Rating"].astype(int)

    n_maps = 5
    data = []
    layout = dict()
    for i in range(1, n_maps + 1):
        plot_df = hotels[hotels["Average_Rating"] == i]
        geo_key = "geo" + str(i)
        mapdata = dict(
            type="scattergeo",
            lon=plot_df['Longitude'],
            lat=plot_df['Latitude'],
            mode='markers',
            # The trace is named after the rating it shows. Previously this was
            # str() of the whole Series, i.e. a multi-line pandas repr.
            name=str(i),
            marker=dict(
                size=5,
                opacity=0.8,
                reversescale=True,
                autocolorscale=False,
                symbol='circle',
                line=dict(width=1, color='white')
            ),
            opacity=0.7,
            geo=geo_key,
        )
        data.append(mapdata)
        # Horizontal slot computed from the index rather than accumulated, so
        # the bounds are free of floating-point drift and end exactly at 1.0.
        layout[geo_key] = dict(
            scope='usa',
            domain=dict(x=[(i - 1) / n_maps, i / n_maps], y=[0.0, 1.0]),
        )

    fig = go.Figure(data=data, layout=layout)
    fig.update_layout(template='plotly_dark', height=189, width=950,
                      margin=dict(l=0, r=0, b=0, t=0, pad=0), showlegend=False)
    return fig

@app.callback(
    Output('data-table', 'data'),
    [Input('dropdown-filter', 'value')])
def update_datatabel(selected_state):
    """Return the five best-rated hotels for the chosen state as table rows.

    Args:
        selected_state: a value of the "Province" column, or "All" for no filter.

    Returns:
        A list of at most five dicts with keys "Name" and "Average_Rating"
        (mean of "Ratings" per hotel name), highest average first.
    """
    if selected_state == "All":
        reviews = df
    else:
        reviews = df[df["Province"] == selected_state]

    top_hotels = (
        reviews
        .groupby("Name")
        .agg(Average_Rating=pd.NamedAgg(column="Ratings", aggfunc="mean"))
        .reset_index()
        # Sort for every selection: the "All" branch used to skip this step
        # and returned the first five hotel names in group order instead.
        .sort_values("Average_Rating", ascending=False)
        [["Name", "Average_Rating"]]
        .head()
    )

    # 'records' is the documented orient; the old 'rows' spelling is rejected
    # by pandas 2.x.
    return top_hotels.to_dict('records')

@app.callback(
    Output('review-count', 'children'),
    [Input('dropdown-filter', 'value')])
def update_count(selected_state):
    """Return the number of review rows for ``selected_state`` ("All" = every row)."""
    if selected_state == "All":
        return len(df)

    state_reviews = df[df["Province"] == selected_state]
    return len(state_reviews)

@app.callback(
    Output('mean-rating', 'children'),
    [Input('dropdown-filter', 'value')])
def update_count(selected_state):
    """Return the mean rating for ``selected_state``, truncated to an integer.

    NOTE(review): this function shares the name ``update_count`` with the
    review-count callback defined earlier in this file. Both are registered by
    their decorators, but the module-level name ends up bound to this one;
    consider renaming it.

    Args:
        selected_state: a value of the "Province" column, or "All" for no filter.

    Returns:
        ``int(mean)`` of the matching "Ratings", or the string "N/A" when no
        rows match.
    """
    if selected_state == "All":
        ratings = df["Ratings"]
    else:
        ratings = df.loc[df["Province"] == selected_state, "Ratings"]

    mean = ratings.mean()
    # The dropdown can be cleared, which sends None and matches no rows; the
    # mean of an empty selection is NaN and int(NaN) raises ValueError.
    if pd.isna(mean):
        return "N/A"

    return int(mean)

@app.callback(
    [Output('word-cloud', 'src'),
     Output('freq-bar', 'figure')],
    [Input('stopword-radio', 'value'),
     Input('gram-radio', 'value')])
def update_cloud(selected_stopword, selected_gram):
    """Regenerate the word-cloud image and the top-10 frequency bar chart.

    Args:
        selected_stopword: "english" to remove English stop words; any other
            value (the radio sends None) keeps them.
        selected_gram: n-gram size forwarded to ``get_word_counts``.

    Returns:
        ``(image_path, figure)``: the relative path of the JPEG written under
        ``static/`` and a Plotly bar chart of the ten most frequent n-grams.
    """
    # Normalise the radio value so only the two supported settings reach the
    # vectorizer.
    stop_words = "english" if selected_stopword == "english" else None
    common_words = get_word_counts(df['Reviews'], stop_words, selected_gram)

    frequencies = dict(common_words)

    # WordCloud renders and saves the image itself. The former plt.imshow /
    # plt.axis calls only mutated matplotlib's global figure inside a server
    # callback and contributed nothing to the saved file.
    wordcloud = WordCloud(background_color="black").generate_from_frequencies(frequencies=frequencies)

    variant = "without" if stop_words == "english" else "with"
    cloud = "static/cloud_" + variant + str(selected_gram) + ".jpeg"
    # to_file fails if the target directory is missing.
    os.makedirs(os.path.dirname(cloud), exist_ok=True)
    wordcloud.to_file(cloud)

    # common_words is already sorted by count, so only the first ten entries
    # are needed for the chart.
    frequency_df = pd.DataFrame(common_words[:10], columns=['Word', 'Count'])
    fig = px.bar(frequency_df, x="Word", y="Count")
    fig.update_layout(autosize=False, width=550, height=200, margin=dict(l=0, r=0, b=0, t=0, pad=0),
                      paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')

    return cloud, fig

# Start the Dash development server when this code runs as the main module.
if __name__ == "__main__":
    app.run_server()