#### import packages

In [1]:
import pandas as pd
import sqlite3
import numpy as np

#### define variables

In [2]:
# SQLite3 connection: open the Yelp database file and create a cursor on that connection
conn = sqlite3.connect(database='SQLITE3/yelp_database.sqlite3')
c = conn.cursor()

### BUSINESS CATEGORIES
> As this column contains an array (a comma-separated list of categories), we have to convert the table to first normal form to be able to query and analyse it efficiently.

#### DF import & clean

In [3]:
# Load the business id and its raw category string from the SQLite table into a DataFrame
business_categories_df = pd.read_sql_query(
    sql="select business_id,categories from business_main",
    con=conn,
)

In [4]:
#convert strings to list in column "categories"
def split_list(input_string):
    """Split a ", "-separated category string into a list of category names.

    Parameters
    ----------
    input_string : str, list or None
        The raw cell value. ``None`` means the business has no categories.
        A list is accepted so that re-running the conversion on an already
        converted column does not fail.

    Returns
    -------
    list
        The category names, with surrounding whitespace removed and empty
        names dropped. Empty list for ``None`` or an empty string.
    """
    # Missing value: no categories for this business
    if input_string is None:
        return []
    # Already converted (e.g. the cell was executed a second time): return a
    # copy so callers never share the same list object between rows
    if isinstance(input_string, list):
        return list(input_string)
    # Keep ", " as the separator, but strip stray whitespace and skip blank
    # pieces so that "" does not produce a bogus '' category
    pieces = (piece.strip() for piece in input_string.split(", "))
    return [piece for piece in pieces if piece]

# Replace each raw category string in the column with the list produced by split_list
business_categories_df['categories'] = business_categories_df['categories'].map(split_list)

In [5]:
# Check the contents of the frame after the conversion: the bare last expression
# makes the notebook render business_categories_df as the cell output
business_categories_df

Unnamed: 0,business_id,categories
0,6iYb2HFDywm3zjuRg0shjw,"[Gastropubs, Food, Beer Gardens, Restaurants, ..."
1,tCbdrRPZA0oiIYSmHG3J0w,"[Salad, Soup, Sandwiches, Delis, Restaurants, ..."
2,bvN78flM8NLprQ1a1y5dRg,"[Antiques, Fashion, Used, Vintage & Consignmen..."
3,oaepsyvc0J17qwi8cfrOWg,"[Beauty & Spas, Hair Salons]"
4,PE9uqAjdw0E4-8mjGl3wVA,"[Gyms, Active Life, Interval Training Gyms, Fi..."
...,...,...
160580,D2mHoIDXx9N8mS1pGoKV9Q,"[Real Estate, Real Estate Services, Home Servi..."
160581,bQX-kwVTyZgcdZGEPzce6Q,"[Health Markets, Food, Specialty Food, Grocery]"
160582,wvFZ06nmPmQ2-IVoPqVYLA,"[Arts & Entertainment, Paint & Sip, Art Classe..."
160583,GB75wPibj3IjNauaoCxyGA,"[Cuban, Sandwiches, Restaurants, Cafes]"


In [6]:
# Sanity check: report the Python type of the "categories" value for the first 10 rows
for position, categories in enumerate(business_categories_df["categories"].head(10)):
    print("index", position, "is", type(categories))

index 0 is <class 'list'>
index 1 is <class 'list'>
index 2 is <class 'list'>
index 3 is <class 'list'>
index 4 is <class 'list'>
index 5 is <class 'list'>
index 6 is <class 'list'>
index 7 is <class 'list'>
index 8 is <class 'list'>
index 9 is <class 'list'>


#### METHOD 2: create a boolean table with one row per business_id and one column per unique category

In [7]:
# Build a dictionary of unique categories (= key) with their number of appearances (= value)
business_categories_dict = {}
for category_list in business_categories_df['categories']:
    # Rows without a category list contribute nothing to the counts
    if category_list is None:
        continue
    for category in category_list:
        # First sighting starts at 0, so every occurrence adds exactly 1
        business_categories_dict[category] = business_categories_dict.get(category, 0) + 1

In [8]:
# Dump the whole dictionary as plain text: every category name with its counter value
print(business_categories_dict)

{'Gastropubs': 502, 'Food': 29469, 'Beer Gardens': 157, 'Restaurants': 50763, 'Bars': 10741, 'American (Traditional)': 6541, 'Beer Bar': 738, 'Nightlife': 11990, 'Breweries': 749, 'Salad': 2649, 'Soup': 986, 'Sandwiches': 7272, 'Delis': 1850, 'Cafes': 2871, 'Vegetarian': 1482, 'Antiques': 603, 'Fashion': 6599, 'Used': 815, 'Vintage & Consignment': 815, 'Shopping': 26205, 'Furniture Stores': 1667, 'Home & Garden': 5499, 'Beauty & Spas': 16574, 'Hair Salons': 5900, 'Gyms': 2052, 'Active Life': 9231, 'Interval Training Gyms': 253, 'Fitness & Instruction': 4741, 'Thai': 1363, 'Dentists': 3139, 'Health & Medical': 15102, 'Orthodontists': 803, 'Breakfast & Brunch': 5505, 'Jewelry Repair': 354, 'Appraisal Services': 105, 'Local Services': 12192, 'Jewelry': 1788, 'Engraving': 39, 'Gold Buyers': 144, 'Barbers': 1733, 'Convenience Stores': 1692, 'Souvenir Shops': 92, 'Wigs': 94, 'Hair Extensions': 664, 'Blow Dry/Out Services': 543, 'Hair Stylists': 1634, 'Pizza': 5756, 'American (New)': 5458, 'B

In [9]:
#create dataframe with booleans
def boolean_df(item_lists, unique_items):
    """Build a boolean membership table: one row per entry, one column per item.

    Parameters
    ----------
    item_lists : pandas.Series
        Series whose values are iterables (lists) of hashable items.
    unique_items : iterable
        The items that become the columns of the result, in iteration order.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``item_lists``; cell ``[row, item]`` is ``True``
        when ``item`` occurs in that row's list. The row index is kept even
        when ``unique_items`` is empty, so an index-based merge with the
        source frame never silently drops rows.
    """
    # Convert every row to a set once, instead of scanning each list again
    # for every single column
    row_sets = [set(items) for items in item_lists]
    row_index = item_lists.index
    # One boolean column per item, aligned on the original row index
    bool_dict = {
        item: pd.Series([item in row for row in row_sets], index=row_index, dtype=bool)
        for item in unique_items
    }
    # Passing the index explicitly keeps the rows when there are no columns
    return pd.DataFrame(bool_dict, index=row_index)

In [10]:
# One boolean column per unique category, one row per business
business_categories_bool = boolean_df(
    item_lists=business_categories_df['categories'],
    unique_items=business_categories_dict.keys(),
)

In [11]:
# Attach the boolean category columns to the business rows by matching on the row index
final_df = pd.merge(
    business_categories_df,
    business_categories_bool,
    how="inner",
    left_index=True,
    right_index=True,
)
final_df

Unnamed: 0,business_id,categories,Gastropubs,Food,Beer Gardens,Restaurants,Bars,American (Traditional),Beer Bar,Nightlife,...,Court Reporters,Nephrologists,Mohels,Christmas Markets,Patent Law,Pickleball,Pita,Oriental,Free Diving,Fischbroetchen
0,6iYb2HFDywm3zjuRg0shjw,"[Gastropubs, Food, Beer Gardens, Restaurants, ...",True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,tCbdrRPZA0oiIYSmHG3J0w,"[Salad, Soup, Sandwiches, Delis, Restaurants, ...",False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,bvN78flM8NLprQ1a1y5dRg,"[Antiques, Fashion, Used, Vintage & Consignmen...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,oaepsyvc0J17qwi8cfrOWg,"[Beauty & Spas, Hair Salons]",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,PE9uqAjdw0E4-8mjGl3wVA,"[Gyms, Active Life, Interval Training Gyms, Fi...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160580,D2mHoIDXx9N8mS1pGoKV9Q,"[Real Estate, Real Estate Services, Home Servi...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
160581,bQX-kwVTyZgcdZGEPzce6Q,"[Health Markets, Food, Specialty Food, Grocery]",False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
160582,wvFZ06nmPmQ2-IVoPqVYLA,"[Arts & Entertainment, Paint & Sip, Art Classe...",False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
160583,GB75wPibj3IjNauaoCxyGA,"[Cuban, Sandwiches, Restaurants, Cafes]",False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
