In [1]:
import numpy as np
import pandas as pd
import json

# YELP Data

In [2]:
# Load the raw Yelp export from disk into plain Python objects.
# JSON text is UTF-8 by specification, so the encoding is pinned here instead of
# depending on the platform default (which is not UTF-8 on every OS).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('yelp_data_berlin.json', encoding='utf-8') as f:
    yelp_data = json.load(f)

In [3]:
# Flatten the loaded records into a table; nested keys become dotted column
# names (e.g. coordinates.latitude, location.city).
yelp_df = pd.json_normalize(data=yelp_data)

In [4]:
# Inspect the dtype of every column in the flattened frame.
yelp_df.dtypes

id                           object
alias                        object
name                         object
image_url                    object
is_closed                      bool
url                          object
review_count                  int64
categories                   object
rating                      float64
transactions                 object
price                        object
phone                        object
display_phone                object
distance                    float64
coordinates.latitude        float64
coordinates.longitude       float64
location.address1            object
location.address2            object
location.address3            object
location.city                object
location.zip_code            object
location.country             object
location.state               object
location.display_address     object
dtype: object

In [5]:
# Keep only the first row seen for each business alias; later repeats are dropped.
# The original row labels are kept, so the index has gaps after this step.
yelp_df = yelp_df.drop_duplicates(subset='alias', keep='first')

In [6]:
# Expand each row's list of category entries into positional columns
# (0, 1, 2, ...); rows with fewer entries are padded with missing values.
cat_df = yelp_df['categories'].apply(pd.Series)

In [7]:
# Preview the expanded categories: one positional column per list entry.
cat_df

Unnamed: 0,0,1,2
0,"{'alias': 'coffee', 'title': 'Coffee & Tea'}","{'alias': 'bagels', 'title': 'Bagels'}",
1,"{'alias': 'coffee', 'title': 'Coffee & Tea'}",,
2,"{'alias': 'bakeries', 'title': 'Bakeries'}","{'alias': 'cafes', 'title': 'Cafes'}",
3,"{'alias': 'coffee', 'title': 'Coffee & Tea'}","{'alias': 'breakfast_brunch', 'title': 'Breakf...","{'alias': 'cocktailbars', 'title': 'Cocktail B..."
4,"{'alias': 'icecream', 'title': 'Ice Cream & Fr...",,
...,...,...,...
147400,"{'alias': 'parks', 'title': 'Parks'}",,
148268,"{'alias': 'landmarks', 'title': 'Landmarks & H...","{'alias': 'parks', 'title': 'Parks'}",
148429,"{'alias': 'landmarks', 'title': 'Landmarks & H...","{'alias': 'parks', 'title': 'Parks'}",
148464,"{'alias': 'parks', 'title': 'Parks'}",,


In [8]:
# Copy the positional columns of cat_df (0, 1, 2) into category1..category3.
for position in (0, 1, 2):
    yelp_df[f'category{position + 1}'] = cat_df[position]

In [9]:
# Each categoryN cell holds one {'alias': ..., 'title': ...} mapping (or is
# missing); expanding it through pd.Series yields 'alias' and 'title' columns.
category1_df, category2_df, category3_df = (
    yelp_df[column].apply(pd.Series)
    for column in ('category1', 'category2', 'category3')
)

In [10]:
# Overwrite category1..category3 with just the alias string of each entry.
for slot, unpacked in enumerate((category1_df, category2_df, category3_df), start=1):
    yelp_df[f'category{slot}'] = unpacked['alias']

In [11]:
# Add title1..title3 holding the human-readable title of each category entry.
for slot, unpacked in enumerate((category1_df, category2_df, category3_df), start=1):
    yelp_df[f'title{slot}'] = unpacked['title']

In [12]:
# Preview the frame with the category alias and title columns appended.
yelp_df

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,location.zip_code,location.country,location.state,location.display_address,category1,category2,category3,title1,title2,title3
0,j1KMoWRKHnDTqKBEVM45bw,cuccuma-berlin,Cuccuma,https://s3-media3.fl.yelpcdn.com/bphoto/SRfr9H...,False,https://www.yelp.com/biz/cuccuma-berlin?adjust...,94,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.5,[],...,10961,DE,BE,"[Zossener Str. 34, 10961 Berlin, Germany]",coffee,bagels,,Coffee & Tea,Bagels,
1,gYt8OaT4YGLYZHO13tCB_w,chapter-one-berlin-2,Chapter One,https://s3-media2.fl.yelpcdn.com/bphoto/9XqE1k...,False,https://www.yelp.com/biz/chapter-one-berlin-2?...,50,"[{'alias': 'coffee', 'title': 'Coffee & Tea'}]",4.5,[],...,10961,DE,BE,"[Mittenwalder Str. 30, 10961 Berlin, Germany]",coffee,,,Coffee & Tea,,
2,TP29h1ATpovLtBaud1tUYg,barcomis-berlin,Barcomi's,https://s3-media3.fl.yelpcdn.com/bphoto/11LH7s...,False,https://www.yelp.com/biz/barcomis-berlin?adjus...,165,"[{'alias': 'bakeries', 'title': 'Bakeries'}, {...",3.5,[],...,10961,DE,BE,"[Bergmannstr. 21, 10961 Berlin, Germany]",bakeries,cafes,,Bakeries,Cafes,
3,XIKten0K1qgmr9goF0KJZA,rubens-coffee-lounge-berlin,Rubens Coffee Lounge,https://s3-media4.fl.yelpcdn.com/bphoto/wbon56...,False,https://www.yelp.com/biz/rubens-coffee-lounge-...,114,"[{'alias': 'coffee', 'title': 'Coffee & Tea'},...",4.0,[],...,10961,DE,BE,"[Mehringdamm 65, 10961 Berlin, Germany]",coffee,breakfast_brunch,cocktailbars,Coffee & Tea,Breakfast & Brunch,Cocktail Bars
4,o-Mxr9J4socKoJ-jIedo3g,vanille-und-marille-berlin-16,vanille & marille,https://s3-media2.fl.yelpcdn.com/bphoto/DEUH5w...,False,https://www.yelp.com/biz/vanille-und-marille-b...,210,"[{'alias': 'icecream', 'title': 'Ice Cream & F...",4.5,[],...,10965,DE,BE,"[Hagelberger Str. 1, 10965 Berlin, Germany]",icecream,,,Ice Cream & Frozen Yogurt,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147400,VOhaJpFu1eLZJ_P-yXRfKw,steinstücken-zehlendorf-berlin,Steinstücken Zehlendorf,,False,https://www.yelp.com/biz/steinst%C3%BCcken-zeh...,2,"[{'alias': 'parks', 'title': 'Parks'}]",5.0,[],...,14109,DE,BE,"[Bernhard Beyer Str. 10, Zehlendorf Babelsber...",parks,,,Parks,,
148268,TdK5UjRYpOGodZcDFGorvg,freundschaftsinsel-potsdam,Freundschaftsinsel,https://s3-media1.fl.yelpcdn.com/bphoto/HArbOG...,False,https://www.yelp.com/biz/freundschaftsinsel-po...,26,"[{'alias': 'landmarks', 'title': 'Landmarks & ...",4.5,[],...,14469,DE,BB,"[Lange Brücke, 14469 Potsdam, Germany]",landmarks,parks,,Landmarks & Historical Buildings,Parks,
148429,b5URWoI-XrdK4607vEjY8Q,park-sanssouci-potsdam,Park Sanssouci,https://s3-media1.fl.yelpcdn.com/bphoto/d8vKcc...,False,https://www.yelp.com/biz/park-sanssouci-potsda...,85,"[{'alias': 'landmarks', 'title': 'Landmarks & ...",5.0,[],...,14469,DE,BB,"[Zur Historischen Mühle, 14469 Potsdam, Germany]",landmarks,parks,,Landmarks & Historical Buildings,Parks,
148464,b8YYCBveyZWDatuWI4S1Qw,ruinenberg-potsdam,Ruinenberg,https://s3-media2.fl.yelpcdn.com/bphoto/k4jgi0...,False,https://www.yelp.com/biz/ruinenberg-potsdam?ad...,4,"[{'alias': 'parks', 'title': 'Parks'}]",4.0,[],...,14473,DE,BB,"[Ruinenbergstraße, 14473 Potsdam, Germany]",parks,,,Parks,,


In [13]:
# Renumber the rows 0..n-1: dropping duplicate aliases left gaps in the index.
# The result is rebound instead of using inplace=True so that re-running the
# cell is harmless and the frame object displayed earlier is not mutated.
yelp_df = yelp_df.reset_index(drop=True)

In [14]:
# Re-check the column dtypes now that the category and title columns exist.
yelp_df.dtypes

id                           object
alias                        object
name                         object
image_url                    object
is_closed                      bool
url                          object
review_count                  int64
categories                   object
rating                      float64
transactions                 object
price                        object
phone                        object
display_phone                object
distance                    float64
coordinates.latitude        float64
coordinates.longitude       float64
location.address1            object
location.address2            object
location.address3            object
location.city                object
location.zip_code            object
location.country             object
location.state               object
location.display_address     object
category1                    object
category2                    object
category3                    object
title1                       object
title2                       object
title3                       object
dtype: object

In [15]:
# Tally the values of the is_closed flag. The recorded output shows only False,
# and the column is not among those kept by the later column selection.
yelp_df['is_closed'].value_counts()

False    19137
Name: is_closed, dtype: int64

In [16]:
# Keep only the columns used downstream, in presentation order: identity,
# rating info, address parts, then each category alias next to its title.
yelp_df = yelp_df[[
    # identity
    'id', 'alias', 'name', 'url',
    # popularity / pricing
    'review_count', 'rating', 'price',
    # address
    'location.address1', 'location.address2', 'location.address3',
    'location.city', 'location.zip_code', 'location.country',
    'location.state', 'location.display_address',
    # categories (alias followed by title)
    'category1', 'title1',
    'category2', 'title2',
    'category3', 'title3',
]]

In [17]:
# Preview the reordered, trimmed frame.
yelp_df

Unnamed: 0,id,alias,name,url,review_count,rating,price,location.address1,location.address2,location.address3,...,location.zip_code,location.country,location.state,location.display_address,category1,title1,category2,title2,category3,title3
0,j1KMoWRKHnDTqKBEVM45bw,cuccuma-berlin,Cuccuma,https://www.yelp.com/biz/cuccuma-berlin?adjust...,94,4.5,€,Zossener Str. 34,,,...,10961,DE,BE,"[Zossener Str. 34, 10961 Berlin, Germany]",coffee,Coffee & Tea,bagels,Bagels,,
1,gYt8OaT4YGLYZHO13tCB_w,chapter-one-berlin-2,Chapter One,https://www.yelp.com/biz/chapter-one-berlin-2?...,50,4.5,€,Mittenwalder Str. 30,,,...,10961,DE,BE,"[Mittenwalder Str. 30, 10961 Berlin, Germany]",coffee,Coffee & Tea,,,,
2,TP29h1ATpovLtBaud1tUYg,barcomis-berlin,Barcomi's,https://www.yelp.com/biz/barcomis-berlin?adjus...,165,3.5,€€,Bergmannstr. 21,,,...,10961,DE,BE,"[Bergmannstr. 21, 10961 Berlin, Germany]",bakeries,Bakeries,cafes,Cafes,,
3,XIKten0K1qgmr9goF0KJZA,rubens-coffee-lounge-berlin,Rubens Coffee Lounge,https://www.yelp.com/biz/rubens-coffee-lounge-...,114,4.0,€€,Mehringdamm 65,,,...,10961,DE,BE,"[Mehringdamm 65, 10961 Berlin, Germany]",coffee,Coffee & Tea,breakfast_brunch,Breakfast & Brunch,cocktailbars,Cocktail Bars
4,o-Mxr9J4socKoJ-jIedo3g,vanille-und-marille-berlin-16,vanille & marille,https://www.yelp.com/biz/vanille-und-marille-b...,210,4.5,€,Hagelberger Str. 1,,,...,10965,DE,BE,"[Hagelberger Str. 1, 10965 Berlin, Germany]",icecream,Ice Cream & Frozen Yogurt,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,VOhaJpFu1eLZJ_P-yXRfKw,steinstücken-zehlendorf-berlin,Steinstücken Zehlendorf,https://www.yelp.com/biz/steinst%C3%BCcken-zeh...,2,5.0,,Bernhard Beyer Str. 10,Zehlendorf Babelsberg Potsdam Kohlhasenbrück,,...,14109,DE,BE,"[Bernhard Beyer Str. 10, Zehlendorf Babelsber...",parks,Parks,,,,
19133,TdK5UjRYpOGodZcDFGorvg,freundschaftsinsel-potsdam,Freundschaftsinsel,https://www.yelp.com/biz/freundschaftsinsel-po...,26,4.5,,Lange Brücke,,,...,14469,DE,BB,"[Lange Brücke, 14469 Potsdam, Germany]",landmarks,Landmarks & Historical Buildings,parks,Parks,,
19134,b5URWoI-XrdK4607vEjY8Q,park-sanssouci-potsdam,Park Sanssouci,https://www.yelp.com/biz/park-sanssouci-potsda...,85,5.0,,Zur Historischen Mühle,,,...,14469,DE,BB,"[Zur Historischen Mühle, 14469 Potsdam, Germany]",landmarks,Landmarks & Historical Buildings,parks,Parks,,
19135,b8YYCBveyZWDatuWI4S1Qw,ruinenberg-potsdam,Ruinenberg,https://www.yelp.com/biz/ruinenberg-potsdam?ad...,4,4.0,,Ruinenbergstraße,,,...,14473,DE,BB,"[Ruinenbergstraße, 14473 Potsdam, Germany]",parks,Parks,,,,


In [18]:
# Count rows per city name.
# NOTE(review): the recorded output contains spelling variants (e.g. "Berlín")
# and district-level names, so city values are not normalised at this point.
yelp_df['location.city'].value_counts()

Berlin                 18039
Potsdam                  432
Wildau                    62
Erkner                    43
Schönefeld                40
                       ...  
Berlín                     1
Oranienburg                1
Neu Zittau                 1
Friedenau                  1
Eiche - Ahrensfelde        1
Name: location.city, Length: 113, dtype: int64

In [25]:
# Write the cleaned table to CSV without the row index.
# 'utf-8-sig' prefixes the file with a UTF-8 byte-order mark
# (presumably so spreadsheet tools detect the encoding — confirm with consumers).
yelp_df.to_csv(
    'yelp_data_cleaned.csv',
    encoding='utf-8-sig',
    index=False,
)