## EDA of the Yelp dataset: 2018-2019 section

In [None]:
# importing basic eda tools
import pandas as pd
import numpy as np
import json
import matplotlib as mp

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#time and warnings
import time
import warnings

#settings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and pandas
# copy/view warnings raised by later cells - consider narrowing it.
warnings.filterwarnings("ignore")
# Draw matplotlib figures inside the notebook output.
%matplotlib inline
# seaborn 'poster' plotting context with the font scale halved.
sns.set_context('poster', font_scale=0.5)

In [None]:
# Load the 2018-2019 review extract. The CSV was produced from the converted
# .json files - see 'jw_importing_full_review.ipynb'.
# The business and users tables are read later, right before they are used.
review_csv = '/Volumes/Samsung_T5/Data_Science_BootCamp/capstone/Yelp-Capstone/data/review_1819.csv'
review = pd.read_csv(review_csv)

Starting with the review data

In [None]:
# (rows, columns) of the review table
review.shape

In [None]:
# dtypes, non-null counts and memory usage per column
review.info()

In [None]:
# list of column labels
review.columns

In [None]:
# look at ten randomly drawn review texts (no random_state, so the sample differs per run)
review.text.sample(10)

In [None]:
# number of rows per star rating
review.stars.value_counts()

In [None]:
# frequency of each distinct 'useful' value
review.useful.value_counts()

In [None]:
# frequency of each distinct 'funny' value
review.funny.value_counts()

In [None]:
# frequency of each distinct 'cool' value
review.cool.value_counts()

In [None]:
# number of rows per user_id, most frequent first
review.user_id.value_counts()

In [None]:
# number of rows per business_id, most frequent first
review.business_id.value_counts()

In [None]:
# count of distinct user ids in the review table
review.user_id.nunique()

In [None]:
# count of distinct business ids in the review table
review.business_id.nunique()

In [None]:
#exploring the number of missing values per feature in percentage
# total number of missing cells across the whole frame
print('Number of missing values: ', review.isnull().values.sum())
print('Percent of missing values per feature: ') 
# share of missing values per column, in percent (displayed as the cell output)
review.isnull().sum() * 100 / len(review)

In [None]:
review.date = pd.to_datetime(review.date)

In [None]:
review['month'] = review.date.dt.month

In [None]:
# re-inspect dtypes after the date conversion and the added 'month' column
review.info()

Plotting the review data

In [None]:
# Distribution of star ratings over all reviews, ordered by rating value.
star_counts = review['stars'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 4))
# seaborn >= 0.12 accepts the data vectors only as keyword arguments;
# the old positional form sns.barplot(index, values) raises a TypeError there.
sns.barplot(x=star_counts.index, y=star_counts.values, alpha=0.8, ax=ax)
ax.set_title("Star Rating Distribution")
# The bars count rows of the review table, i.e. reviews - not businesses.
ax.set_ylabel('# of reviews', fontsize=12)
ax.set_xlabel('Star Ratings ', fontsize=12)

# Write the exact count just above each bar.
for rect, label in zip(ax.patches, star_counts.values):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

In [None]:
# Per-user summary: review count, first and last review date, summed
# useful/funny/cool values and mean star rating.
# Passing lists of function names yields two-level column labels such as
# ('review_id', 'count').
user_stats_spec = {
    'review_id': ['count'],
    'date': ['min', 'max'],
    'useful': ['sum'],
    'funny': ['sum'],
    'cool': ['sum'],
    'stars': ['mean'],
}
user_agg = review.groupby('user_id').agg(user_stats_spec)

In [None]:
# order users by their number of reviews (the ('review_id','count') column), largest first
user_agg=user_agg.sort_values([('review_id','count')],ascending=False)
print("          Top 10 Users in Yelp")
# display the ten users with the most reviews
user_agg.head(10)

In [None]:
# Number of reviews per calendar year.
fig,ax= plt.subplots(figsize=(15,10))
plt.yticks(fontsize=16)

# The review table has no 'year' column (only 'month' is derived from the
# date), so `review.year` raises AttributeError. Take the year directly from
# the date column; pd.to_datetime is a no-op if it is already parsed.
# The series name becomes the x-axis label.
years = pd.to_datetime(review['date']).dt.year.rename('year')

sns.countplot(x=years,
              palette="GnBu",
              linewidth=3,
              ax=ax)

ax.set_title("Count of reviews per year",
             font="Serif",
             size=20,color='k')

In [None]:
# Number of reviews per calendar month.
fig, ax = plt.subplots(figsize=(15, 10))
plt.yticks(fontsize=16)

# Kept under the original global name `years`, although it holds months.
years = review['month']

sns.countplot(x=years, palette="GnBu", linewidth=3)

plt.title("Count of reviews per month", font="Serif", size=20, color='k')

Business

In [None]:
# load the business table from its CSV export
business =  pd.read_csv('/Volumes/Samsung_T5/Data_Science_BootCamp/capstone/Yelp-Capstone/data/business.csv')

In [None]:
# (rows, columns) of the business table
business.shape

In [None]:
# dtypes, non-null counts and memory usage per column
business.info()

In [None]:
# number of rows per state, most frequent first
business.state.value_counts()

In [None]:
# count of distinct states
business.state.nunique()

In [None]:
# number of rows per city, most frequent first
business.city.value_counts()

In [None]:
# count of distinct city strings
business.city.nunique()

In [None]:
# frequency of each distinct value of the categories column
business.categories.value_counts()

In [None]:
# count of distinct values of the categories column
business.categories.nunique()

In [None]:
business.drop(columns='hours', inplace=True)

In [None]:
business.drop(columns='attributes',inplace=True)

In [None]:
business.info()

World View

In [None]:
#map section
# Requires the third-party packages imageio, folium and basemap
# (e.g. `%pip install imageio folium basemap` in the kernel's environment).
import imageio
import folium
import folium.plugins as plugins
# Basemap is a class inside the mpl_toolkits.basemap package; there is no
# top-level module named "Basemap", so `import Basemap` fails with
# ModuleNotFoundError.
from mpl_toolkits.basemap import Basemap

In [None]:
import folium
from folium.plugins import HeatMap,MarkerCluster

# Interactive base map centred on the continental United States.
# 'OpenStreetMap' is folium's canonical name for this built-in tile set.
m1=folium.Map(location = (37,-95),
              max_zoom=12,min_zoom=4,zoom_start=6,
              tiles='OpenStreetMap')

# folium validates marker coordinates and raises on NaN, so drop rows with a
# missing latitude/longitude and pass plain [lat, lon] pairs.
locations = business[['latitude','longitude']].dropna().values.tolist()
cluster = MarkerCluster(locations=locations).add_to(m1)

# display the map as the cell output
m1

Cities with the most businesses

In [None]:
# Number of businesses per city, limited to the 20 largest cities.
# value_counts() already returns the counts in descending order.
city_counts = business['city'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(16, 4))
# seaborn >= 0.12 accepts the data vectors only as keyword arguments.
sns.barplot(x=city_counts.index, y=city_counts.values, alpha=0.8, ax=ax)
# The bars count rows of the business table, so the title speaks of
# businesses rather than reviews.
ax.set_title("Which city has the most businesses?")
# Tilt the city names so neighbouring labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('# businesses', fontsize=12)
ax.set_xlabel('City', fontsize=12)

# Write the exact count just above each bar.
for rect, label in zip(ax.patches, city_counts.values):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

Popularity of a single business = stars * review count

In [None]:
#get all ratings data
rating_data=business[['latitude','longitude','stars','review_count']]
# Creating a custom column popularity using stars*no_of_reviews
rating_data['popularity']=rating_data['stars']*rating_data['review_count']

In [None]:
rating_data

Users

In [None]:
# load the users table from its CSV export
users =     pd.read_csv('/Volumes/Samsung_T5/Data_Science_BootCamp/capstone/Yelp-Capstone/data/users.csv')

In [None]:
# (rows, columns) of the users table
users.shape

In [None]:
# dtypes, non-null counts and memory usage per column
users.info()

In [None]:
#graph section
#!pip install networkx
import networkx as nx
import heapq  # for getting top n number of things from list,dict

In [None]:
# reference timestamp for the "Took ... s" message printed once the graph is built
start_time=time.time()
# NOTE(review): `color` is not referenced by any other visible cell
color = sns.color_palette()
# switch seaborn to its "dark" style for the following plots
sns.set_style("dark")
# ignore all warnings (repeats the filter from the settings cell)
warnings.filterwarnings("ignore")

In [None]:
# Build one (user_id, friend) row per friendship for a sample of users.

#subset users who have atleast one friend
# Friend-less users are stored as the string 'None'; depending on the pandas
# version read_csv may already have turned that into NaN, so exclude both.
has_friends = users['friends'].notna() & (users['friends'] != 'None')
#user has given atleast 10 reviews
is_active = users['review_count'] >= 10
subset_users = users.loc[has_friends & is_active, ['user_id', 'friends']]
#stopping at 6k due to space constraints
subset_users = subset_users.iloc[0:6000]

# Split the comma separated friend string and strip the whitespace around
# every id, so that 'a, b' yields 'b' rather than ' b' (which would become a
# different graph node than the user 'b').
subset_users = subset_users.assign(
    list_friends=subset_users['friends'].apply(
        lambda friends: [friend.strip() for friend in str(friends).split(',')]
    )
)[['user_id', 'list_friends']]

# explode() produces one row per friend without the wide, mostly-NaN
# intermediate frame that .apply(pd.Series).stack() has to build.
res = subset_users.set_index('user_id')['list_friends'].explode()
# Restore the (user_id, position) MultiIndex and the unnamed values that
# stack() used to produce, so reset_index() still yields three columns.
res.index = pd.MultiIndex.from_arrays(
    [res.index, res.groupby(level=0).cumcount().to_numpy()],
    names=['user_id', None],
)
res.name = None


In [None]:
# turn the index levels of `res` into ordinary columns
network_data=res.reset_index()
#checks
# display the last five rows
network_data.tail()

In [None]:
# Rename the columns to the 'source'/'target' names networkx looks for.
network_data.columns = ['source', 'level_1', 'target']

# Each (user_id, friend) row becomes one edge of the graph.
graph = nx.from_pandas_edgelist(network_data, source='source', target='target')

# Report the wall-clock time elapsed since start_time was taken.
end_time = time.time()
elapsed = end_time - start_time
print("Took", elapsed, "s")

In [None]:
#credits https://www.kaggle.com/crailtap/basic-network-analysis-tutorial
#basic info
# nx.info() was removed in networkx 3.0, so report the key figures with
# methods that exist in both 2.x and 3.x.
print("Number of nodes:", graph.number_of_nodes())
print("Number of edges:", graph.number_of_edges())
#check density
print("The density of the graph is ",nx.density(graph))