# Review EDA

In [1]:
# Mount Google Drive so the dataset files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls -a

.  ..  .config	drive  sample_data


In [3]:
# Instalamos las dependencias de nuestro entorno de trabajo 
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://apache.osuosl.org/spark/spark-3.1.3/spark-3.1.3-bin-hadoop2.7.tgz
!tar xf spark-3.1.3-bin-hadoop2.7.tgz
!pip install -q findspark
! pip install koalas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting koalas
  Downloading koalas-1.8.2-py3-none-any.whl (390 kB)
[K     |████████████████████████████████| 390 kB 7.2 MB/s 
Installing collected packages: koalas
Successfully installed koalas-1.8.2


In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.3-bin-hadoop2.7"

In [5]:
from pathlib import Path
import databricks.koalas as ks
import findspark
from datetime import datetime
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
def import_convert(path:Path, format:str): 
    """
    Load a dataset with the notebook's Spark session and return it as a Koalas DataFrame.

    :: param path: location of the file to read; a plain string or a pathlib.Path
    :: param format: name of the Spark data source format (e.g. "json")

    Return:
    ---------
    The Koalas DataFrame produced by calling `to_koalas()` on the loaded Spark DataFrame.
    """
    # The Spark reader is documented for string (or list of strings) paths, so a
    # pathlib.Path is converted to text first. Other values are passed through untouched.
    if isinstance(path, Path):
        path = str(path)
    spark_df = spark.read.load(path, format=format)
    return spark_df.to_koalas()

In [7]:
k_review = import_convert("/content/drive/MyDrive/HENRY_TRABAJO_GRUPAL/Repository/Data/review.json", "json")

## Now we start gathering some basic info about the 'review' dataset

## --------- GENERAL INFO ---------

In [None]:
k_review.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')

In [None]:
k_review.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,2018-07-07 22:09:11,0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",0,mh_-eMZ6K5RLWhZyISBhwA
1,7ATYjTIgM3jUlt4UM3IypQ,1,2012-01-03 15:28:18,0,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,1,OyoGAe7OKpv6SyGZT5g77Q
2,YjUWPpI6HXG530lwP-fb2A,0,2014-02-05 20:30:30,0,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,0,8g_iMtfSiwikVnbP2etR0A
3,kxX2SOes4o-D3ZQBkiMRfA,1,2015-01-04 00:01:03,0,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",1,_7bHUi9Uuf5__HHc_Q8guQ
4,e4Vwtrqf-wpJfwesgvdgxQ,1,2017-01-14 20:54:15,0,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,1,bcjbaE6dDog4jkNY91ncLQ


In [None]:
k_review.shape

(6990280, 9)

In [None]:
# Count of rows duplicated

k_review.duplicated().sum()

0

In [None]:
k_review.info()

<class 'databricks.koalas.frame.DataFrame'>
Int64Index: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   business_id  6990280 non-null  object 
 1   cool         6990280 non-null  int64  
 2   date         6990280 non-null  object 
 3   funny        6990280 non-null  int64  
 4   review_id    6990280 non-null  object 
 5   stars        6990280 non-null  float64
 6   text         6990280 non-null  object 
 7   useful       6990280 non-null  int64  
 8   user_id      6990280 non-null  object 
dtypes: float64(1), int64(3), object(5)

In [None]:
k_review.describe()

Unnamed: 0,cool,funny,stars,useful
count,6990280.0,6990280.0,6990280.0,6990280.0
mean,0.4986175,0.3265596,3.748584,1.184609
std,2.17246,1.688729,1.478705,3.253767
min,-1.0,-1.0,1.0,-1.0
25%,0.0,0.0,3.0,0.0
50%,0.0,0.0,4.0,0.0
75%,0.0,0.0,5.0,1.0
max,404.0,792.0,5.0,1182.0


In [None]:
k_review.nunique().plot(kind = 'bar', title='Cantidad de Valores únicos')

In [None]:
# Count of null per column
k_review.isnull().sum()

business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64

In [None]:
# Count of null per column
k_review.isna().sum()

business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64

In [None]:
k_review.max()

cool       404.0
funny      792.0
stars        5.0
useful    1182.0
dtype: float64

In [None]:
k_review.min()

cool     -1.0
funny    -1.0
stars     1.0
useful   -1.0
dtype: float64

In [None]:
k_review.mean()

cool      0.498618
funny     0.326560
stars     3.748584
useful    1.184609
dtype: float64

In [None]:
k_review.std()

cool      2.172460
funny     1.688729
stars     1.478705
useful    3.253767
dtype: float64

### Now we start a per column analysis. For this, we will define some useful functions.

In [None]:
def is_empty(value):
  '''
  Flag strings that hold nothing but whitespace.

  Returns 1 when `value` is an empty string once leading and trailing
  whitespace is stripped, and 0 otherwise.
  '''
  return 1 if value.strip() == '' else 0

## --------- COLUMN: review_id ---------

// string, 22 characters, review's id

"review_id": "zdSx_SD6obEhz9VrW9uAWA"

In [None]:
# Total different review_id's

print(f"Number of different id's:{len(k_review.review_id.unique())}\nTotal rows: {k_review.shape[0]}")

Number of different id's:6990280
Total rows: 6990280


Now we check if all id's are in fact 22 characters strings

In [None]:
# Length of every review_id, computed with the vectorized string accessor
# instead of a per-row Python lambda (which also shadowed the builtin `id`).
k_review['chars_in_id'] = k_review.review_id.str.len()



In [None]:
k_review.chars_in_id.value_counts()

22    6990280
Name: chars_in_id, dtype: int64

All review_id's contain 22 characters. We can drop the created column.

In [None]:
k_review = k_review.drop('chars_in_id', axis=1)
k_review.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')

Now we check if after dropping the review_id column there are still no duplicates

In [None]:
review_noid = k_review.drop('review_id', axis=1)
review_noid.columns

Index(['business_id', 'cool', 'date', 'funny', 'stars', 'text', 'useful',
       'user_id'],
      dtype='object')

In [None]:
review_noid.duplicated().sum()

7

There are 7 duplicated records if we ignore their review_id's.

## --------- COLUMN: user_id ---------

// string, 22-character unique user id, refers to user in user.json

"user_id": "Ha3iJu77CxlrFm-vQRs_8g"

In [None]:
# Amount of users making reviews

len(k_review.user_id.unique())

1987929

Now we will check if all user_id's are 22 char strings

In [None]:
# Length of every user_id, computed with the vectorized string accessor
# instead of a per-row Python lambda (which also shadowed the builtin `id`).
k_review['chars_in_userid'] = k_review.user_id.str.len()



In [None]:
k_review.chars_in_userid.value_counts()

22    6990280
Name: chars_in_userid, dtype: int64

All user_id's contain 22 characters. We can drop the created column.

In [None]:
k_review = k_review.drop('chars_in_userid', axis=1)
k_review.columns

Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')

In [None]:
# Average amount of reviews per user

k_review.user_id.value_counts().mean()

3.51636300894046

In [None]:
# Standard deviation in the amount of reviews posted per user

k_review.user_id.value_counts().std()

12.770877900860583

In [None]:
# Top users by amount of reviews posted

k_review.user_id.value_counts().head(15)

_BcWyKQL16ndpBdggh2kNA    3048
Xw7ZjaGfr0WNVt6s_5KZfA    1840
0Igx-a1wAstiBDerGxXk2A    1747
-G7Zkl1wIWBBmD0KRy_sCw    1682
ET8n-r7glWYqZhuR6GcdNw    1653
bYENop4BuQepBjM1-BI3fA    1578
1HM81n6n4iPIFU5d2Lokhw    1554
fr1Hz2acAb3OaL3l6DyKNg    1447
wXdbkFZsfDR7utJvbWElyA    1396
Um5bfs5DH6eizgjH3xZsvg    1391
qjfMBIZpQT9DDtw_BWCopQ    1324
VL12EhEdT4OWqGq0nIqkzw    1308
bJ5FtCtZX3ZZacz2_2PJjA    1298
pou3BbKsIozfH50rxmnMew    1247
ouODopBKF3AqfCkuQEnrDg    1129
Name: user_id, dtype: int64

In [None]:
k_review.user_id.value_counts().head(50)

_BcWyKQL16ndpBdggh2kNA    3048
Xw7ZjaGfr0WNVt6s_5KZfA    1840
0Igx-a1wAstiBDerGxXk2A    1747
-G7Zkl1wIWBBmD0KRy_sCw    1682
ET8n-r7glWYqZhuR6GcdNw    1653
bYENop4BuQepBjM1-BI3fA    1578
1HM81n6n4iPIFU5d2Lokhw    1554
fr1Hz2acAb3OaL3l6DyKNg    1447
wXdbkFZsfDR7utJvbWElyA    1396
Um5bfs5DH6eizgjH3xZsvg    1391
qjfMBIZpQT9DDtw_BWCopQ    1324
VL12EhEdT4OWqGq0nIqkzw    1308
bJ5FtCtZX3ZZacz2_2PJjA    1298
pou3BbKsIozfH50rxmnMew    1247
ouODopBKF3AqfCkuQEnrDg    1129
B-s-8VUnuBjGTP3d01jsyw    1087
-kLVfaJytOJY2-QdQoCcNQ    1076
vHc-UrI9yfL_pnnc6nJtyQ    1071
CfX4sTIFFNaRchNswqhVfg    1047
AHRrG3T1gJpHvtpZ-K0G_g    1041
I2XpWCHAom1JRyHXZQrnfg    1023
ppsm8EDKjA1fp1yTCP3RrQ     978
AaJ9d4OrFmgc4S_U2QiSZg     964
zYFGMy1_thjMnvQLX6JNBw     954
PnwOegp7RXfMeNAyO9fQhQ     947
2iS1vg5TYpV_iEiNC8osTg     938
XzpJ4uHkxARCFQiZ9bffyg     927
eTvp_hYnsrI5-ow_sQ31_g     922
lYQk0R6sPfo3WeX-l_5BuA     917
vffKQc_WQMYFGY4JS5VAOw     900
6s-g2vFu12OemhiK3FJuOQ     888
lAW03ccfMEuRZiDqvVgc0w     866
0YI3p9o-

In [None]:
# Bar chart of the 25 users with the most reviews. The title now describes this
# chart; the previous one was copied from the unique-values plot.
k_review.user_id.value_counts().head(25).plot(kind = 'bar', title='Top 25 usuarios por cantidad de reviews')

In [None]:
# Statistics from user with the most posts

k_review[k_review.user_id == '_BcWyKQL16ndpBdggh2kNA'].describe()

Unnamed: 0,cool,funny,stars,useful
count,3048.0,3048.0,3048.0,3048.0
mean,2.090879,1.097113,3.637795,3.623031
std,2.214004,1.554317,0.913762,3.383771
min,0.0,0.0,1.0,0.0
25%,1.0,0.0,3.0,1.0
50%,2.0,1.0,4.0,3.0
75%,3.0,2.0,4.0,5.0
max,25.0,17.0,5.0,54.0


In [None]:
# Amount of reviews written by most popular user (the one with the most fans)

len(k_review[k_review.user_id == '37cpUoM8hlkSQfReIEBd-Q'])

59

In [None]:
# Statistics from most popular user

k_review[k_review.user_id == '37cpUoM8hlkSQfReIEBd-Q'].describe()

Unnamed: 0,cool,funny,stars,useful
count,59.0,59.0,59.0,59.0
mean,2.864407,2.576271,4.220339,5.559322
std,1.80476,1.621019,0.81087,3.415337
min,0.0,1.0,2.0,1.0
25%,1.0,1.0,4.0,3.0
50%,3.0,2.0,4.0,5.0
75%,4.0,4.0,5.0,7.0
max,9.0,9.0,5.0,17.0


#### ID VALIDATION

Now we will check whether the user_id's reference a record in the 'user' dataset or not. We start by importing the 'user' table and checking if all of its user_id's have 22 chars.

In [None]:
user = spark.read.load("/content/drive/MyDrive/HENRY_TRABAJO_GRUPAL/Repository/Data/user.json", format="json")
k_user = user.to_koalas()

In [None]:
# Length of every user_id in the 'user' table, computed with the vectorized
# string accessor instead of a per-row Python lambda.
k_user['chars_in_user_id'] = k_user.user_id.str.len()



In [None]:
k_user.chars_in_user_id.value_counts()

22    1987897
Name: chars_in_user_id, dtype: int64

In [None]:
k_user = k_user.drop('chars_in_user_id', axis=1)

 We can see below that there are more unique id's in 'review' than in 'user', so we proceed to check which user_id's from 'review' are not in 'user'.

In [None]:
rev_usids = k_review.user_id.unique().to_list()

In [None]:
type(rev_usids), len(rev_usids)

(list, 1987929)

In [None]:
usr_usids = k_user.user_id.unique().to_list()

In [None]:
type(usr_usids), len(usr_usids)

(list, 1987897)

In [None]:
# Reconcile the two id lists with set lookups. Both lists come from `.unique()`,
# so every id appears at most once and membership tests give the same result as
# the previous remove-one-by-one loop, without the quadratic cost of list.remove
# and without a bare `except` hiding unrelated errors.
usr_usid_set = set(usr_usids)
rev_usid_set = set(rev_usids)

# Users listed in 'user' that never appear in 'review'
no_review_users = [uid for uid in usr_usids if uid not in rev_usid_set]

# Keep in rev_usids only the ids that are missing from 'user'. This is done in
# place because the following cell reads the leftovers from rev_usids.
rev_usids[:] = [uid for uid in rev_usids if uid not in usr_usid_set]

In [None]:
not_found_users = rev_usids
len(not_found_users)

32

In [None]:
len(no_review_users)

0

In [None]:
with open("user_not_found.txt", "w") as output:
    output.write(str(not_found_users))

## --------- COLUMN: business_id ---------

// string, 22 characters business id, refers to business in business.json

"business_id": "tnhfDv5Il8EaGSXZGiuQGg"


In [None]:
# Amount of businesses being reviewed

len(k_review.business_id.unique())

150346

Now we will check if all business_id's are 22 char strings

In [None]:
# Most reviewed businesses

k_review.business_id.value_counts().head(15)

_ab50qdWOk0DdB6XOrBitw    7673
ac1AeYqs8Z4_e2X5M3if2A    7516
GXFMD0Z4jEVZBCsbPf4CTQ    6160
ytynqOUb3hjKeJfRj5Tshw    5778
oBNrLz4EDhiscSlbOl8uAw    5264
iSRTaT9WngzB8JJ2YKJUig    5254
VQcCL9PiNL_wkGf-uF3fjg    5146
_C7QiQQc47AOEv4PE3Kong    4969
GBTPC53ZrG1ZBY3DT8Mbcw    4661
6a4gLLFSgr-Q6CZXDLzBGQ    4480
PP3BBaVxZLcJU54uP_wL6Q    4293
1b5mnK8bMnnju_cvU65GqQ    4247
I_3LMZ_1m2mzR0oLIOePIg    4093
VaO-VW3e1kARkU9bP1E7Fw    4034
qb28j-FNX1_6xm7u372TZA    3971
Name: business_id, dtype: int64

In [None]:
business = spark.read.load("/content/drive/MyDrive/HENRY_TRABAJO_GRUPAL/Repository/Data/business.json", format="json")
k_business = business.to_koalas()

In [None]:
# Businesses with the most reviews (by id)

ks.sql("select b.business_id, first(b.name) Business, count(r.business_id) Reviews from {k_business} b join {k_review} r on r.business_id==b.business_id group by b.business_id order by reviews desc limit 15")

Unnamed: 0,business_id,Business,Reviews
0,_ab50qdWOk0DdB6XOrBitw,Acme Oyster House,7673
1,ac1AeYqs8Z4_e2X5M3if2A,Oceana Grill,7516
2,GXFMD0Z4jEVZBCsbPf4CTQ,Hattie B’s Hot Chicken - Nashville,6160
3,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,5778
4,oBNrLz4EDhiscSlbOl8uAw,Ruby Slipper - New Orleans,5264
5,iSRTaT9WngzB8JJ2YKJUig,Mother's Restaurant,5254
6,VQcCL9PiNL_wkGf-uF3fjg,Royal House,5146
7,_C7QiQQc47AOEv4PE3Kong,Commander's Palace,4969
8,GBTPC53ZrG1ZBY3DT8Mbcw,Luke,4661
9,6a4gLLFSgr-Q6CZXDLzBGQ,Cochon,4480


In [None]:
k_review[k_review.business_id == '_ab50qdWOk0DdB6XOrBitw'].stars.mean()

4.124983709109865

In [None]:
# Businesses with the most reviews (by brand)

ks.sql("select b.name, count(r.business_id) reviews from {k_business} b join {k_review} r on r.business_id==b.business_id group by b.name order by reviews desc limit 15")

Unnamed: 0,name,reviews
0,Starbucks,21575
1,McDonald's,18210
2,Dunkin',10312
3,Chipotle Mexican Grill,9763
4,First Watch,9317
5,Taco Bell,8636
6,Acme Oyster House,8491
7,Chick-fil-A,8378
8,Panera Bread,7565
9,Oceana Grill,7516


In [None]:
k_business.columns

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'postal_code',
       'review_count', 'stars', 'state'],
      dtype='object')

In [None]:
stars_mean = k_review.stars.mean()

In [None]:
stars_mean

3.74858374771826

In [None]:
ks.sql("select b.name, avg(r.stars) stars_avg, count(r.business_id) reviews from {k_business} b join {k_review} r on r.business_id==b.business_id group by b.name having reviews>50 order by stars_avg desc limit 15")

Unnamed: 0,name,stars_avg,reviews
0,Sugar Bar Salon,5.0,55
1,ByCherry Photography,5.0,73
2,Truong & Company Jeweler,5.0,61
3,Matt Glynn - Schumacher Mortgage,5.0,60
4,Waxing by Cody Marie,5.0,53
5,Jeramie Lu Photography,5.0,77
6,Delacruz Chiropractic,5.0,67
7,Gators Parasail,5.0,66
8,Twisted Twig Fine Florals,5.0,74
9,Stephen The Spectacular,5.0,53


• compliments (useful+funny+cool) by business


In [None]:
k_business.shape

(150346, 14)

In [None]:
len(k_business.name.unique())

114117

In [None]:
# Number of business rows whose name is "starbucks" (case-insensitive), computed
# as a vectorized comparison instead of looping over the collected column.
starbucks_count = int((k_business.name.str.lower() == 'starbucks').sum())
print(starbucks_count)

724


In [None]:
# Number of business rows whose name is "mcdonald's" (case-insensitive), computed
# as a vectorized comparison instead of looping over the collected column.
mcdonalds_count = int((k_business.name.str.lower() == "mcdonald's").sum())
print(mcdonalds_count)

703


#### ID VALIDATION

Now we will check whether the business_id's reference a record in the business dataset or not. As we have already imported the business dataset, we proceed to check if all of its business_id's have 22 chars.

In [None]:
# Length of every business_id, computed with the vectorized string accessor
# instead of a per-row Python lambda (which also shadowed the builtin `id`).
k_business['chars_in_business_id'] = k_business.business_id.str.len()



In [None]:
k_business['chars_in_business_id'].value_counts()

22    150346
Name: chars_in_business_id, dtype: int64

In [None]:
k_business = k_business.drop('chars_in_business_id', axis=1)

Although there is the same number of unique 'business_id' values in 'review' as in 'business', we will check that the two columns actually correspond.

In [None]:
rev_buids = k_review.business_id.unique().to_list()
bus_buids = k_business.business_id.unique().to_list()

In [None]:
def no_review_businesses(ids_review, ids_in_dim):
  '''
  Match the ids of a dimension table against the ids found in the reviews.

  Every id in `ids_in_dim` cancels one occurrence (the earliest remaining one)
  of the same id in `ids_review`. Ids from `ids_in_dim` with nothing left to
  cancel are collected separately.

  :: param ids_review: list of hashable ids taken from the reviews; it is
     modified in place and ends up holding only the ids that were not matched
  :: param ids_in_dim: iterable of hashable ids taken from the dimension table

  Return:
  ---------
  A tuple `(ids_review, unmatched)`: the same `ids_review` list object after
  removing the matched ids, and the list of ids from `ids_in_dim` that had no
  counterpart in `ids_review` (in their original order).
  '''
  from collections import Counter

  # Counting occurrences gives the same result as calling list.remove() once
  # per id, but in linear time and without a bare `except` that would also
  # swallow unrelated errors.
  available = Counter(ids_review)
  to_drop = Counter()
  unmatched = []
  for dim_id in ids_in_dim:
    if available[dim_id] > 0:
      available[dim_id] -= 1
      to_drop[dim_id] += 1
    else:
      unmatched.append(dim_id)

  leftover = []
  for review_id in ids_review:
    if to_drop[review_id] > 0:
      # list.remove() dropped the earliest occurrence, so skip from the front
      to_drop[review_id] -= 1
    else:
      leftover.append(review_id)

  # Keep mutating the caller's list, as the original implementation did
  ids_review[:] = leftover
  return ids_review, unmatched

In [None]:
# Compare the review ids against the ids of the 'business' table (bus_buids).
# The second result gets its own name so the helper function is not overwritten.
rev_buids, no_review_buids = no_review_businesses(rev_buids, bus_buids)

In [None]:
len(rev_buids)

0

From the output above, we can see that all business_id values correspond.

## --------- COLUMN: date ---------

// string, date in YYYY-MM-DD format (the values loaded above also include the time, e.g. 2018-07-07 22:09:11)

"date": "2016-03-09"

In [None]:
k_review['datetime'] = k_review.date.astype(datetime)

In [None]:
k_review.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,datetime
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,2018-07-07 22:09:11,0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",0,mh_-eMZ6K5RLWhZyISBhwA,2018-07-07 22:09:11
1,7ATYjTIgM3jUlt4UM3IypQ,1,2012-01-03 15:28:18,0,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,1,OyoGAe7OKpv6SyGZT5g77Q,2012-01-03 15:28:18
2,YjUWPpI6HXG530lwP-fb2A,0,2014-02-05 20:30:30,0,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,0,8g_iMtfSiwikVnbP2etR0A,2014-02-05 20:30:30
3,kxX2SOes4o-D3ZQBkiMRfA,1,2015-01-04 00:01:03,0,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",1,_7bHUi9Uuf5__HHc_Q8guQ,2015-01-04 00:01:03
4,e4Vwtrqf-wpJfwesgvdgxQ,1,2017-01-14 20:54:15,0,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,1,bcjbaE6dDog4jkNY91ncLQ,2017-01-14 20:54:15


In [None]:
min_datetime = k_review.datetime.min()
min_datetime

Timestamp('2005-02-16 03:23:22')

In [None]:
max_datetime = k_review.datetime.max()
max_datetime

Timestamp('2022-01-19 19:48:45')

In [None]:
min_datetime.time()

datetime.time(3, 23, 22)

In [None]:
max_datetime.date()

datetime.date(2022, 1, 19)

Now we separate the hour and date from the datetime column in different columns

In [None]:
# Replace the original 'date' strings with the calendar date of each timestamp,
# using the datetime accessor instead of a per-row Python lambda.
k_review['date'] = k_review.datetime.dt.date



In [None]:
def get_time(datetime):
  '''
  Return the hour component of every timestamp in a datetime Series.

  :: param datetime: Series of timestamps exposing the `.dt` accessor

  Return:
  ---------
  A Series with the hour of each timestamp.
  '''
  # `hour` is a property of the `.dt` accessor, not a method, so it must not be called
  return datetime.dt.hour

In [None]:
k_review['hour'] = k_review.datetime.dt.hour

In [None]:
k_review.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,datetime,hour
0,XQfwVwDr-v0ZS3_CbbE5Xw,0,2018-07-07,0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",0,mh_-eMZ6K5RLWhZyISBhwA,2018-07-07 22:09:11,22
1,7ATYjTIgM3jUlt4UM3IypQ,1,2012-01-03,0,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,1,OyoGAe7OKpv6SyGZT5g77Q,2012-01-03 15:28:18,15
2,YjUWPpI6HXG530lwP-fb2A,0,2014-02-05,0,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,0,8g_iMtfSiwikVnbP2etR0A,2014-02-05 20:30:30,20
3,kxX2SOes4o-D3ZQBkiMRfA,1,2015-01-04,0,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",1,_7bHUi9Uuf5__HHc_Q8guQ,2015-01-04 00:01:03,0
4,e4Vwtrqf-wpJfwesgvdgxQ,1,2017-01-14,0,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,1,bcjbaE6dDog4jkNY91ncLQ,2017-01-14 20:54:15,20


## --------- COLUMN: text ---------

// string, the review text

"text": "Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks."

In [None]:
len(k_review.text.unique())

6974127

In [None]:
k_review['empty_text'] = k_review.text.apply(is_empty)



In [None]:
k_review.empty_text.value_counts()

0    6990280
Name: empty_text, dtype: int64

In [9]:
def get_word_count(str):
  '''
  Count the whitespace-separated tokens in a piece of text.
  '''
  return len(str.split())

In [10]:
k_review['word_count'] = k_review.text.apply(get_word_count)



In [11]:
k_review['word_count'].describe()

count    6.990280e+06
mean     1.047763e+02
std      9.792227e+01
min      1.000000e+00
25%      4.200000e+01
50%      7.500000e+01
75%      1.330000e+02
max      1.070000e+03
Name: word_count, dtype: float64

In [12]:
len(k_review[k_review.word_count >= 500])

65090

## --------- COLUMN: stars ---------

// integer, star score from 1 to 5

"stars": 4

In [None]:
# Get the amount of reviews for each possible score

k_review.stars.value_counts()

5.0    3231627
4.0    1452918
1.0    1069561
3.0     691934
2.0     544240
Name: stars, dtype: int64

## --------- COLUMN: useful ---------

// integer, number of votes considering the review 'useful'

"useful": 0

In [None]:
# Most common values for 'useful' calification on a review

k_review.useful.value_counts().head(15)

0     3840492
1     1539953
2      687425
3      343742
4      186984
5      112204
6       71214
7       47679
8       34000
9       24783
10      18475
11      14319
12      11103
13       8751
14       7112
Name: useful, dtype: int64

In [None]:
k_review.useful.max()

1182

## --------- COLUMN: funny ---------

// integer, number of votes considering the review 'funny'
"funny": 0


In [None]:
# Most common values for 'funny' calification on a review

k_review.funny.value_counts().head(15)

0     5894117
1      691994
2      195290
3       82111
4       42254
5       24723
6       15545
7       10178
8        7147
9        5223
10       3739
11       2992
12       2367
13       1896
14       1419
Name: funny, dtype: int64

In [None]:
str(k_review[k_review.funny == k_review.funny.max()].text)

'6885693    Went there for a birthday dinner and had reser...\nName: text, dtype: object'

## --------- COLUMN: cool ---------


// integer, number of votes considering the review 'cool'.

"cool": 0

In [None]:
# Most common values for the 'cool' rating on a review

k_review.cool.value_counts().head(15)

0     5377964
1     1016736
2      296999
3      114763
4       56609
5       32352
6       21530
7       15010
8       11028
9        8085
10       6349
11       4981
12       4011
13       3125
14       2549
Name: cool, dtype: int64

In [None]:
def get_dtypes(dataset):
  '''
  Collect the Python types of the values stored in every column of a dataset.

  :: param dataset: Pandas or Koalas DataFrame

  Return:
  ---------
  Returns a dictionary that maps each column name to the set of types found
  among that column's values.
  '''
  types_dict = {}
  for col in dataset.columns:
    column_values = dataset[col]
    # Koalas Series do not support direct iteration, so they are converted to
    # pandas first. Note that this brings the whole column to the driver.
    if hasattr(column_values, 'to_pandas'):
      column_values = column_values.to_pandas()
    types_dict[col] = {type(value) for value in column_values}
  return types_dict

In [None]:
len(k_review[k_review.business_id == 'IvQs2hhXKZ4NgZhVZ5jjjQ'])

5

In [None]:
k_review[k_review.business_id == 'IvQs2hhXKZ4NgZhVZ5jjjQ']

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,datetime
4440011,IvQs2hhXKZ4NgZhVZ5jjjQ,0,2008-06-20 13:55:12,0,_wxTSWQpQqi2D2jIQOmhVw,4.0,What a little sandwiches and deli. It serves a...,0,x82wL_fePhJGUB7_ZbLX2g,2008-06-20 13:55:12
4592839,IvQs2hhXKZ4NgZhVZ5jjjQ,0,2010-04-17 18:16:28,0,WDN1QObUHMDgwD603RfBvQ,4.0,When I worked on Rodney Street (ie out of rang...,0,xA6GL6xQ-oGBxTa7UMRYAw,2010-04-17 18:16:28
4616218,IvQs2hhXKZ4NgZhVZ5jjjQ,0,2010-06-12 11:18:49,0,Lcwp3xHAew7mz9Adh1PMVQ,3.0,Pumpernickles is situated a stones throw away ...,0,ERyhMwXG1woQbBMuK68ZQg,2010-06-12 11:18:49
4685310,IvQs2hhXKZ4NgZhVZ5jjjQ,0,2010-06-24 11:26:30,0,WbRCJLvJdO65sHj1LZfOTw,4.0,Pumpernickles is situated in a vast student ar...,0,0pkjvlKAzNZjSVdfNK-GjA,2010-06-24 11:26:30
4738676,IvQs2hhXKZ4NgZhVZ5jjjQ,0,2010-06-10 14:52:08,0,gH-7TIQAeLx5grr6G2PHow,4.0,Pumpernickles. How very Dickensian. I feel as ...,0,t-dVBcTV67DqZM5EVy2SVQ,2010-06-10 14:52:08


## --------- BUSINESS ---------

In [None]:
# Load the business dataset with Spark and convert it to a Koalas DataFrame.
# NOTE(review): this reassigns `business`/`k_business`, which were already loaded
# earlier in the notebook from a different folder (.../Repository/Data/business.json).
# Confirm which path is the intended one.
business = spark.read.load("/content/drive/MyDrive/HENRY_TRABAJO_GRUPAL/Dataset_Yelp/business.json", format="json")
k_business = business.to_koalas()

In [None]:
k_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,"1616 Chapala St, Ste 2","(None, None, None, None, None, None, None, Non...",Pns2l4eNsfO8kk83dixA6A,"Doctors, Traditional Chinese Medicine, Naturop...",Santa Barbara,,0,34.426679,-119.711197,"Abby Rappoport, LAC, CMQ",93101,7,5.0,CA
1,87 Grasso Plaza Shopping Center,"(None, None, None, None, None, None, None, Non...",mpf3x-BjTdTEA3yCZrAYPw,"Shipping Centers, Local Services, Notaries, Ma...",Affton,"(8:0-18:30, 0:0-0:0, 8:0-14:0, None, 8:0-18:30...",1,38.551126,-90.335695,The UPS Store,63123,15,3.0,MO
2,5255 E Broadway Blvd,"(None, None, None, None, None, None, None, Tru...",tUFrWirKiKi_TAnsVWINQQ,"Department Stores, Shopping, Fashion, Home & G...",Tucson,"(8:0-23:0, 8:0-22:0, 8:0-23:0, 8:0-22:0, 8:0-2...",0,32.223236,-110.880452,Target,85711,22,3.5,AZ
3,935 Race St,"(None, None, u'none', None, None, None, None, ...",MTSW4McQd7CbVtyjqoe9mw,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",Philadelphia,"(7:0-21:0, 7:0-20:0, 7:0-21:0, 7:0-21:0, 7:0-2...",1,39.955505,-75.155564,St Honore Pastries,19107,80,4.0,PA
4,101 Walnut St,"(None, None, None, None, None, None, None, Tru...",mWMc6_wTdE0EUBKIGXDVfA,"Brewpubs, Breweries, Food",Green Lane,"(12:0-22:0, None, 12:0-22:0, 12:0-18:0, 16:0-2...",1,40.338183,-75.471659,Perkiomen Valley Brewery,18054,13,4.5,PA


In [None]:
k_business.dtypes

address          object
attributes       object
business_id      object
categories       object
city             object
hours            object
is_open           int64
latitude        float64
longitude       float64
name             object
postal_code      object
review_count      int64
stars           float64
state            object
dtype: object

In [None]:
k_business.categories

0      Doctors, Traditional Chinese Medicine, Naturop...
1      Shipping Centers, Local Services, Notaries, Ma...
2      Department Stores, Shopping, Fashion, Home & G...
3      Restaurants, Food, Bubble Tea, Coffee & Tea, B...
4                              Brewpubs, Breweries, Food
5      Burgers, Fast Food, Sandwiches, Food, Ice Crea...
6      Sporting Goods, Fashion, Shoe Stores, Shopping...
7                    Synagogues, Religious Organizations
8      Pubs, Restaurants, Italian, Bars, American (Tr...
9      Ice Cream & Frozen Yogurt, Fast Food, Burgers,...
10                  Department Stores, Shopping, Fashion
11            Vietnamese, Food, Restaurants, Food Trucks
12     American (Traditional), Restaurants, Diners, B...
13     General Dentistry, Dentists, Health & Medical,...
14           Food, Delis, Italian, Bakeries, Restaurants
15                     Sushi Bars, Restaurants, Japanese
16     Automotive, Auto Parts & Supplies, Auto Custom...
17     Vape Shops, Tobacco Shop

In [None]:
def values_type(dataframe, column): 
  '''
  Return the set of Python types found among the values of one column.

  The column is materialised with `to_numpy()` and the type of each element
  is recorded.
  '''
  return {type(item) for item in dataframe[column].to_numpy()}

{<class 'NoneType'>, <class 'pyspark.sql.types.Row'>}


In [None]:
values_type(k_business, 'hours')

In [None]:
type(k_business.hours[50])

pyspark.sql.types.Row

In [None]:
def extract_keys(dataframe, column:str):
  '''
  Gather every key used by the row-like values stored in a column.

  Each value is expected to expose `asDict()`; values that do not (for
  example None) are skipped.

  Returns the set of all keys seen.
  '''
  found_keys = set()
  for entry in dataframe[column].to_numpy():
    try:
      found_keys.update(entry.asDict().keys())
    except AttributeError:
      # No asDict() on this entry, so it contributes no keys
      continue
  return found_keys

{'Friday', 'Tuesday', 'Wednesday', 'Sunday', 'Thursday', 'Saturday', 'Monday'}


In [None]:
extract_keys(k_business, 'hours')

In [None]:
def get_schedule(value, day:str, time:str):
  '''
  Extract the opening or closing time of one day from a schedule value.

  :: param value: row-like object exposing `asDict()`, whose entries map a day
     name to a string of the form "<open>-<close>"; may be None
  :: param day: key of the day to read (e.g. 'Monday')
  :: param time: either 'open' or 'close'

  Return:
  ---------
  The text before the first '-' for 'open', or the text after it for 'close'.
  Returns 'NO DATA' when the value has no schedule, the day is missing or
  empty, or a closing time is requested but the entry has no '-' separator.

  Raises:
  ---------
  ValueError if `time` is neither 'open' nor 'close'. Previously such a call
  silently returned None.
  '''
  if time not in ('open', 'close'):
    raise ValueError(f"time must be 'open' or 'close', got {time!r}")
  try:
    # .get() makes a missing day behave like a day stored as None
    day_hours = value.asDict().get(day)
    parts = day_hours.split('-')
  except AttributeError:
    # Either the value has no asDict() (e.g. None) or the day entry is None
    return 'NO DATA'
  if time == 'open':
    return parts[0]
  return parts[1] if len(parts) > 1 else 'NO DATA'

In [None]:
k_business['monday_open'] = k_business.hours.apply(get_schedule, args=('Monday', 'open'))



In [None]:
k_business.columns

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'postal_code',
       'review_count', 'stars', 'state', 'monday_open'],
      dtype='object')