#### import packages

In [1]:
import pandas as pd
import sqlite3
import numpy as np

#### define variables

In [2]:
#SQLite3 connection
conn = sqlite3.connect('SQLITE3/yelp_database.sqlite3')
c = conn.cursor()

### BUSINESS CATEGORIES
>as this column contains an array we have to convert the table to first normal form to be able to query/analyse efficiently

#### METHOD 1: create 1NF table with one row per unique pair business_id/category
> method 2 (boolean column per category) will be explored in different notebook

In [3]:
#import SQLite table into df 
business_categories_df = pd.read_sql_query("select business_id,categories from business_main", conn)

In [4]:
#convert strings to list in column "categories"
def split_list(input_string):
    if input_string is not None:
        output_list = input_string.split(", ")
    else:
        output_list = []
    return output_list

business_categories_df['categories'] = business_categories_df['categories'].apply(split_list)

In [5]:
#check contents of column
business_categories_df

Unnamed: 0,business_id,categories
0,6iYb2HFDywm3zjuRg0shjw,"[Gastropubs, Food, Beer Gardens, Restaurants, ..."
1,tCbdrRPZA0oiIYSmHG3J0w,"[Salad, Soup, Sandwiches, Delis, Restaurants, ..."
2,bvN78flM8NLprQ1a1y5dRg,"[Antiques, Fashion, Used, Vintage & Consignmen..."
3,oaepsyvc0J17qwi8cfrOWg,"[Beauty & Spas, Hair Salons]"
4,PE9uqAjdw0E4-8mjGl3wVA,"[Gyms, Active Life, Interval Training Gyms, Fi..."
...,...,...
160580,D2mHoIDXx9N8mS1pGoKV9Q,"[Real Estate, Real Estate Services, Home Servi..."
160581,bQX-kwVTyZgcdZGEPzce6Q,"[Health Markets, Food, Specialty Food, Grocery]"
160582,wvFZ06nmPmQ2-IVoPqVYLA,"[Arts & Entertainment, Paint & Sip, Art Classe..."
160583,GB75wPibj3IjNauaoCxyGA,"[Cuban, Sandwiches, Restaurants, Cafes]"


In [6]:
#test if column is correctly converted to dtype list for first 10 rows
for x, y in enumerate(business_categories_df["categories"].head(10)):
    print("index",x,"is",type(y))

index 0 is <class 'list'>
index 1 is <class 'list'>
index 2 is <class 'list'>
index 3 is <class 'list'>
index 4 is <class 'list'>
index 5 is <class 'list'>
index 6 is <class 'list'>
index 7 is <class 'list'>
index 8 is <class 'list'>
index 9 is <class 'list'>


In [7]:
#loop over categories list, where iterations are counted and can later be referenced with original index
index_data = []
value_data = []
for index,value in enumerate(business_categories_df["categories"]):
    for x in value:
        index_data.append(index)
        value_data.append(x)

In [8]:
#create dataframes out of the appended iteration lists
df_index = pd.DataFrame (index_data, columns = ['index'])
df_value = pd.DataFrame (value_data, columns = ['category'])
#merge both
df_merge_temp = df_index.merge(df_value, left_index=True, right_index=True)

In [9]:
#join our original table (with index) with our merged table (where 'count' corresponds to index)
df_final = business_categories_df.merge(df_merge_temp, how='left', left_index=True, right_on='index')
df_final = df_final.drop(['categories','index'], axis=1)
df_final

Unnamed: 0,business_id,category
0.0,6iYb2HFDywm3zjuRg0shjw,Gastropubs
1.0,6iYb2HFDywm3zjuRg0shjw,Food
2.0,6iYb2HFDywm3zjuRg0shjw,Beer Gardens
3.0,6iYb2HFDywm3zjuRg0shjw,Restaurants
4.0,6iYb2HFDywm3zjuRg0shjw,Bars
...,...,...
708963.0,ngmLL5Y5OT-bYHKU0kKrYA,Restaurants
708964.0,ngmLL5Y5OT-bYHKU0kKrYA,Middle Eastern
708965.0,ngmLL5Y5OT-bYHKU0kKrYA,Mediterranean
708966.0,ngmLL5Y5OT-bYHKU0kKrYA,Persian/Iranian


In [10]:
#) create and fill SQLite3 table (including FK index)
c.execute('DROP TABLE IF EXISTS business_categories')
conn.commit()

c.execute("""CREATE TABLE business_categories(
                business_id text
                ,category text
                ,CONSTRAINT fk_business_id  
                FOREIGN KEY (business_id)  
                REFERENCES business_main(business_id))""")
conn.commit()

c.execute('CREATE INDEX IDX_business_categories on business_categories(business_id)')
conn.commit()

df_final.to_sql('business_categories', conn, if_exists='append', index = False)
conn.commit()

### USERS
>as this column contains an array we have to convert the table to first normal form to be able to query/analyse efficiently

#### METHOD 1: create 1NF table with one row per unique pair business_id/category
> method 2 not feasible -> will result in millions of columns

In [11]:
#import SQLite table into df 
user_friends_df = pd.read_sql_query("select user_id,friends from user", conn)

In [12]:
#convert strings to list in column "categories"
def split_list(input_string):
    if input_string is not None:
        output_list = input_string.split(", ")
    else:
        output_list = []
    return output_list

user_friends_df['friends'] = user_friends_df['friends'].apply(split_list)

In [13]:
#check contents of column
user_friends_df

Unnamed: 0,user_id,friends
0,q_QQ5kBBwlCcbL1s4NVK3g,"[xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWt..."
1,dIIKEfOgo0KqUfGQvGikPg,"[XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPI..."
2,D6ErcUnFALnCQN4b1W_TlA,"[GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlX..."
3,JnPIjvC0cmooNDfsa9BmXg,"[HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDT..."
4,37Hc8hr3cw0iHLoPzLK6Ow,"[-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BO..."
...,...,...
2189452,OrXXOTSQG2hLEoZ4sw03Gg,[None]
2189453,pYZ4Dyx5I92u5gDfGiVTpQ,[None]
2189454,DhrXTJRTLhnvI9UI1q63mg,[None]
2189455,ka06dBaC9tvKhc7DJ9_7wQ,[None]


In [14]:
#test if column is correctly converted to dtype list for first 10 rows
for x, y in enumerate(user_friends_df["friends"].head(10)):
    print("index",x,"is",type(y))

index 0 is <class 'list'>
index 1 is <class 'list'>
index 2 is <class 'list'>
index 3 is <class 'list'>
index 4 is <class 'list'>
index 5 is <class 'list'>
index 6 is <class 'list'>
index 7 is <class 'list'>
index 8 is <class 'list'>
index 9 is <class 'list'>


In [15]:
#loop over friends list, where iterations are counted and can later be referenced with original index
index_data = []
value_data = []
for index,value in enumerate(user_friends_df["friends"]):
    for x in value:
        index_data.append(index)
        value_data.append(x)

In [16]:
#create dataframes out of the appended iteration lists
df_index = pd.DataFrame (index_data, columns = ['index'])
df_value = pd.DataFrame (value_data, columns = ['friend'])
#merge both
df_merge_temp = df_index.merge(df_value, left_index=True, right_index=True)

In [17]:
#join our original table (with index) with our merged table (where 'count' corresponds to index)
df_final = user_friends_df.merge(df_merge_temp, how='left', left_index=True, right_on='index')

In [18]:
df_final = df_final.drop(['friends','index'], axis=1)
df_final

Unnamed: 0,user_id,friend
0,q_QQ5kBBwlCcbL1s4NVK3g,xBDpTUbai0DXrvxCe3X16Q
1,q_QQ5kBBwlCcbL1s4NVK3g,7GPNBO496aecrjJfW6UWtg
2,q_QQ5kBBwlCcbL1s4NVK3g,gUfHciSP7BbxZd5gj-c4xw
3,q_QQ5kBBwlCcbL1s4NVK3g,NXw0bCLF5ZtFMfhcj7CFSw
4,q_QQ5kBBwlCcbL1s4NVK3g,OGjmMxPuIoLTJ3O-CO2A4g
...,...,...
116040968,OrXXOTSQG2hLEoZ4sw03Gg,
116040969,pYZ4Dyx5I92u5gDfGiVTpQ,
116040970,DhrXTJRTLhnvI9UI1q63mg,
116040971,ka06dBaC9tvKhc7DJ9_7wQ,


In [19]:
#) create and fill SQLite3 table (including FK index)
c.execute('DROP TABLE IF EXISTS user_friends')
conn.commit()

c.execute("""CREATE TABLE user_friends(
                user_id text
                ,friend text
                ,CONSTRAINT fk_user_id  
                FOREIGN KEY (user_id)  
                REFERENCES user(user_id))""")
conn.commit()

c.execute('CREATE INDEX IDX_user_friends on user_friends(user_id)')
conn.commit()

df_final.to_sql('user_friends', conn, if_exists='append', index = False)
conn.commit()