# Data Cleaning & Preparation

In [46]:
## import required modules
import pandas as pd # imports pandas module
import numpy as np # imports numpy module
import matplotlib.pyplot as plt # imports matpolib pyplot module
%matplotlib inline

# import plotly modules
import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go

# set plotly user credentials
tls.set_credentials_file(username='kamparia', api_key='')

# import plotly offline and required dependencies
# import plotly.offline as py
# import plotly.graph_objs as go
# from plotly.graph_objs import *

In [47]:
## load csv data
df = pd.read_csv('./data/raw/hotel_reviews/hotel_reviews.csv') # loads the data into a dataframe

In [48]:
## rename column names
old_names = ['reviews.date', 'reviews.rating', 'reviews.title', 'reviews.text'] 
new_names = ['date', 'rating', 'title', 'text']
df.rename(columns=dict(zip(old_names, new_names)), inplace=True)

In [49]:
## subset multiple columns of our dataframe
df = df[['latitude', 'longitude', 'name', 'address', 'postalCode', 'categories', 'city', 'country', 'date', 'rating', 'title', 'text',]] 

In [50]:
## drop rows with NAs'
df = df[pd.notnull(df['name'])]
df = df[pd.notnull(df['latitude'])]
df = df[pd.notnull(df['longitude'])]
df = df[pd.notnull(df['rating'])]
df = df[pd.notnull(df['date'])]

In [51]:
## export cleaned data to csv
file_name = './data/cleaned/hotel_reviews/hotel_reviews.csv'
df.to_csv(file_name, sep=',')

# Data Analysis & Exploration

### Q1. Which hotel has the lowest number of reviews

In [52]:
q1 = df.groupby('name').count().reset_index().min().iloc[0]
print(q1)

1785 Inn


### Q2. Which hotel has the highest average ratings

In [53]:
q2 = df.groupby('name')['rating'].mean().reset_index().max().iloc[0]
print(q2)

Yakima Inn


### Q3. Which City has the highest number of hotels

In [54]:
q3 = df.groupby(['country', 'city', 'name'])['rating'].mean().reset_index()
q3 = df.groupby('city').count().reset_index().max().iloc[0]
print(q3)

Yellville


### Q4. What is the relationship between total number of reviews per hotel and average rating of the hotel

In [75]:
group_name = df.groupby(['name'])['rating'].mean().reset_index()

group_count = df.groupby('name').count().reset_index()
old_names = ['latitude'] 
new_names = ['count']
group_count.rename(columns=dict(zip(old_names, new_names)), inplace=True)
group_count = group_count[['name', 'count']] 

q4 = pd.merge(group_name, group_count, left_index=True, right_index=True)[['name_x', 'rating', 'count']] 

#q4.plot.scatter(x='rating', y='count') plots using matplotlip

x = (q4['rating']).values.reshape((618,1))
y = (q4['count']).values.reshape((618,1))

# Create a trace
trace = go.Scatter(
    x = x,
    y = y,
    mode = 'markers'
)

data = [trace]

# Plot and embed in ipython notebook!
py.iplot(data, filename='hotels-reviews-scatter')

### Q5. Plot a map of the hotels using latitude and longitude

In [60]:
q5 = df.groupby(['name', 'latitude', 'longitude'])['rating'].mean().reset_index()
lat = q5.latitude
lon = q5.longitude
name = q5.name

mapbox_access_token = 'pk.eyJ1Ijoia2FtcGFyaWEiLCJhIjoib0JLTExtSSJ9.6ahf835RV3kBUnC3cQ-SnA'
data = Data([
    Scattermapbox(
        lat=lat,
        lon=lon,
        mode='markers',
        marker=Marker(
            size=17,
            color='rgb(255, 0, 0)',
            opacity=0.7
        ),
        text=name,
        hoverinfo='text'
    ),
    Scattermapbox(
        lat=lat,
        lon=lon,
        mode='markers',
        marker=Marker(
            size=8,
            color='rgb(242, 177, 172)',
            opacity=0.7
        ),
        hoverinfo='none'
    )]
)
        
layout = Layout(
    title='INTERACTIVE MAP SHOWING LOCATION OF REVIEWED HOTELS.',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=38,
            lon=-94
        ),
        pitch=0,
        zoom=3,
        style='light'
    ),
)

fig = dict(data=data, layout=layout)

py.iplot(fig, filename='hotel-reviews-map')