In [9]:
import pandas as pd
import psycopg
import json

In [3]:
def get_db_connection():
    """Establishes and returns a connection to the PostgreSQL database."""
    try:
        conn = psycopg.connect("postgresql://michellelin@localhost/yelp")
        return conn
    except Exception as e:
        print(f"Error connecting to the database: {e}")
        return None

# Denormalized Data

## Create the data schema

In [17]:
# Connect to the PostgreSQL database
conn = get_db_connection()
cursor = conn.cursor()

# Full SQL schema definition
schema_sql = """
DROP TABLE IF EXISTS review CASCADE;
DROP TABLE IF EXISTS checkin CASCADE;
DROP TABLE IF EXISTS tip CASCADE;
DROP TABLE IF EXISTS photo CASCADE;
DROP TABLE IF EXISTS business CASCADE;
DROP TABLE IF EXISTS yelp_user CASCADE;

CREATE TABLE IF NOT EXISTS business (
    business_id VARCHAR(22) PRIMARY KEY,
    name VARCHAR NOT NULL,
    address VARCHAR,
    city VARCHAR,
    state VARCHAR(2),
    postal_code VARCHAR(10),
    latitude NUMERIC,
    longitude NUMERIC,
    stars NUMERIC CHECK (stars BETWEEN 0 AND 5),
    review_count INTEGER,
    is_open INTEGER CHECK (is_open IN (0, 1)),
    attributes TEXT,
    categories TEXT, 
    hours TEXT
);

CREATE TABLE IF NOT EXISTS yelp_user (
    user_id VARCHAR(22) PRIMARY KEY,
    name TEXT,
    review_count INTEGER,
    yelping_since DATE,
    friends TEXT, 
    useful INTEGER,
    funny INTEGER,
    cool INTEGER,
    fans INTEGER,
    elite TEXT, 
    average_stars NUMERIC CHECK (average_stars BETWEEN 0 AND 5),
    compliment_hot INTEGER,
    compliment_more INTEGER,
    compliment_profile INTEGER,
    compliment_cute INTEGER,
    compliment_list INTEGER,
    compliment_note INTEGER,
    compliment_plain INTEGER,
    compliment_cool INTEGER,
    compliment_funny INTEGER,
    compliment_writer INTEGER,
    compliment_photos INTEGER
);

CREATE TABLE IF NOT EXISTS review (
    review_id VARCHAR(22) PRIMARY KEY,
    user_id VARCHAR(22) REFERENCES yelp_user (user_id) ON DELETE CASCADE,
    business_id VARCHAR(22) REFERENCES business (business_id) ON DELETE CASCADE,
    stars NUMERIC CHECK (stars BETWEEN 0 AND 5),
    date DATE NOT NULL,
    text TEXT,
    useful INTEGER,
    funny INTEGER,
    cool INTEGER
);

CREATE TABLE IF NOT EXISTS checkin (
    business_id VARCHAR(22) REFERENCES business (business_id) ON DELETE CASCADE,
    date_time TIMESTAMP NOT NULL,
    PRIMARY KEY (business_id, date_time)
);

CREATE TABLE IF NOT EXISTS tip (
    user_id VARCHAR(22) REFERENCES yelp_user (user_id) ON DELETE CASCADE,
    business_id VARCHAR(22) REFERENCES business (business_id) ON DELETE CASCADE,
    text TEXT NOT NULL,
    date DATE NOT NULL,
    compliment_count INTEGER DEFAULT 0,
    PRIMARY KEY (user_id, business_id, date)
);

CREATE TABLE IF NOT EXISTS photo (
    photo_id VARCHAR(22) PRIMARY KEY,
    business_id VARCHAR(22) REFERENCES business (business_id) ON DELETE CASCADE,
    caption TEXT,
    label TEXT
);
"""

# Execute the schema to create the tables
cursor.execute(schema_sql)

# Commit and close
conn.commit()
cursor.close()
conn.close()
print("Tables created successfully.")


Tables created successfully.


## Insert data

### business

In [19]:
# Step 1: Read the CSV file for 'business'
df_business = pd.read_csv('../sampled_data/business_sample.csv')

# Convert columns to plain text
df_business['attributes'] = df_business['attributes'].astype(str)
df_business['categories'] = df_business['categories'].astype(str)
df_business['hours'] = df_business['hours'].astype(str)

# Replace NaN or None with NULL in the DataFrame for columns that allow NULL
df_business['attributes'] = df_business['attributes'].replace({pd.NA: None, 'nan': None})
df_business['categories'] = df_business['categories'].replace({pd.NA: None, 'nan': None})
df_business['hours'] = df_business['hours'].replace({pd.NA: None, 'nan': None})

# View the first few rows to ensure everything looks correct
print(df_business.head())


              business_id                  name                 address  \
0  UVfYRQIr6u_WrFSY5Cv7xw               InCycle          736 Hanover Pl   
1  wH6QxQv31IJ-qQ-FKIc5TA        Roman Plumbing         6125 Grand Blvd   
2  K1NdCTaJWK1ezkZpPyB7-A    Desert Rain Coffee     1551 E Tangerine Rd   
3  juL8ovMlnjkXNACxZ8HLAQ  Sam Levitz Furniture  3750 W Orange Grove Rd   
4  2eN2pfPCear_ofmgQ0peCw     David Caplan, CPA     301 Andorra Glen Ct   

              city state postal_code   latitude   longitude  stars  \
0           Carmel    IN       46032  39.969726  -86.127815    5.0   
1  New Port Richey    FL       34652  28.247926  -82.719939    4.5   
2       Oro Valley    AZ       85755  32.429167 -110.948631    5.0   
3           Tucson    AZ       85741  32.324899 -111.044485    2.0   
4   Lafayette Hill    PA       19444  40.088457  -75.267412    4.5   

   review_count  is_open                                         attributes  \
0            32        1  {'GoodForKids': 'False'

In [32]:

def bulk_insert(conn, table_name, dataframe, conflict_column=None):
    """
    Inserts data from a Pandas DataFrame into a PostgreSQL table.

    Args:
        conn: PostgreSQL connection object.
        table_name: The name of the PostgreSQL table to insert into.
        dataframe: The Pandas DataFrame containing the data to be inserted.
        conflict_column: The column name to use in the ON CONFLICT clause.
    """
    try:
        # Get column names from the DataFrame
        columns = ', '.join(dataframe.columns)
        
        # Create a placeholder string for the VALUES portion of the SQL
        placeholders = ', '.join(['%s'] * len(dataframe.columns))
        
        # Full SQL insert query with dynamic conflict column
        if conflict_column:
            query = f"""
            INSERT INTO {table_name} ({columns}) 
            VALUES ({placeholders}) 
            ON CONFLICT ({conflict_column}) DO NOTHING;
            """
        else:
            query = f"""
            INSERT INTO {table_name} ({columns}) 
            VALUES ({placeholders});
            """
        
        # Convert DataFrame to a list of tuples
        data = [tuple(x) for x in dataframe.to_numpy()]
        
        with conn.cursor() as cursor:
            cursor.executemany(query, data)
        
        # Commit the changes
        conn.commit()
        print(f"Successfully inserted {len(data)} rows into {table_name}.")
        
    except Exception as e:
        print(f"Error inserting data into {table_name}: {e}")

# Connect to the PostgreSQL database
conn = get_db_connection()

# Call the bulk_insert function to load the data
bulk_insert(conn, 'business', df_business)

# Close the connection
conn.close()


Error inserting data into business: duplicate key value violates unique constraint "business_pkey"
DETAIL:  Key (business_id)=(UVfYRQIr6u_WrFSY5Cv7xw) already exists.


In [33]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [34]:
%sql postgresql://michellelin@localhost:5432/yelp

In [37]:
%%sql
SELECT * FROM business LIMIT 1;

business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
UVfYRQIr6u_WrFSY5Cv7xw,InCycle,736 Hanover Pl,Carmel,IN,46032,39.9697264,-86.1278146,5,32,1,"{'GoodForKids': 'False', 'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'BusinessParking': ""{'garage': True, 'street': True, 'validated': False, 'lot': True, 'valet': False}"", 'BikeParking': 'True', 'WheelchairAccessible': 'True'}","Active Life, Cycling Classes, Gyms, Fitness & Instruction, Barre Classes","{'Monday': '16:0-20:0', 'Tuesday': '16:0-20:0', 'Wednesday': '16:0-20:0', 'Thursday': '16:0-20:0', 'Friday': '16:0-20:0', 'Saturday': '7:30-12:0', 'Sunday': '8:0-14:0'}"


### yelp_user

In [35]:
# Step 1: Read the CSV file for 'yelp_user'
df_user = pd.read_csv('../sampled_data/users_sample.csv')

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the 'yelp_user' table
bulk_insert(conn, 'yelp_user', df_user, conflict_column='user_id')


Successfully inserted 42593 rows into yelp_user.


In [39]:
%%sql
SELECT COUNT(*) FROM yelp_user;

count
42593


### review

In [40]:
# Step 1: Read the CSV file for 'reviews_sample'
df_review = pd.read_csv('../sampled_data/reviews_sample.csv')

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the 'yelp_user' table
bulk_insert(conn, 'review', df_review, conflict_column='review_id')

Successfully inserted 48743 rows into review.


In [41]:
%%sql
SELECT COUNT(*) FROM review;

count
48743


In [42]:
%%sql
SELECT * FROM review LIMIT 1;

review_id,user_id,business_id,stars,date,text,useful,funny,cool
5wPq_FqKNx4NSdjW4MDCRQ,jHcTMCmyetM7HSC7weQTlg,y44MbCvvtmg1FpkNGSWisw,2,2015-08-30,"Hate to be harsh but dang! After having a limited selection at terminal F I finally decided to go with Tony Luke's. I was looking forward in trying there Philly Cheese Steak been in Philly and all. So I unraveled the sandwich and to no surprise the sandwich I got didn't look nothing like the picture on the overhead display of the sandwich. The part that got me even more was the meat. I don't know what it taste like to eat dog food, but I am sure I just paid $10 for it.",2,1,0


### checkin

In [58]:
# def bulk_insert(conn, table_name, dataframe, conflict_column=None):
#     """
#     Insert data from a DataFrame into a PostgreSQL table using psycopg2.
    
#     Args:
#         conn: Connection object to the database.
#         table_name: Name of the target table.
#         dataframe: Pandas DataFrame containing the data to be inserted.
#         conflict_column: Column(s) to handle ON CONFLICT clause (string or list of columns).
#     """
#     cursor = conn.cursor()
#     try:
#         # Create a list of column names for the INSERT statement
#         columns = ', '.join([f'"{col}"' for col in dataframe.columns])  # Quote column names to avoid reserved words
#         placeholders = ', '.join(['%s'] * len(dataframe.columns))
        
#         # Create the SQL insert query
#         insert_query = f"""
#         INSERT INTO {table_name} ({columns}) 
#         VALUES ({placeholders})
#         """
        
#         # Add ON CONFLICT clause
#         if conflict_column:
#             if isinstance(conflict_column, list):  # If it's a list of columns, create composite key
#                 conflict_column = ', '.join([f'"{col}"' for col in conflict_column])  # Add quotes
#             else:
#                 conflict_column = f'"{conflict_column}"'  # Handle single column key
            
#             insert_query += f"""
#             ON CONFLICT ({conflict_column}) DO NOTHING
#             """
        
#         # Use executemany to insert all the rows
#         cursor.executemany(insert_query, dataframe.values.tolist())
        
#         # Commit the changes
#         conn.commit()
        
#         print(f"Successfully inserted {len(dataframe)} rows into {table_name}.")
#     except Exception as e:
#         conn.rollback()
#         print(f"Error inserting data into {table_name}: {e}")
#     finally:
#         cursor.close()


In [47]:
# df_checkin = pd.read_csv('../sampled_data/checkins_sample.csv')
# df_checkin.rename(columns={'date': 'date_time'}, inplace=True)
# conn = get_db_connection()
# bulk_insert(conn, 'checkin', df_checkin, conflict_column=['business_id', 'date_time'])

Error inserting data into checkin: invalid input syntax for type timestamp: "2012-07-19 22:37:27, 2012-07-21 00:43:42, 2012-07-24 21:27:12, 2012-07-26 00:44:05, 2012-07-29 00:15:31, 2012-07-31 00:46:10, 2012-07-31 22:26:25, 2012-08-01 15:49:05, 2012-08-02 22:50:16, 2012-08-04 22:37:24, 2012-08-05 03:51:12, 2012-08-05 16:40:28, 2012-08-05 16:40:58, 2012-08-05 20:37:22, 2012-08-05 22:38:09, 2012-08-07 22:13:46, 2012-08-07 22:15:08, 2012-08-08 23:46:11, 2012-08-08 23:47:24, 2012-08-08 23:49:35, 2012-08-08 23:49:58, 2012-08-08 23:50:28, 2012-08-08 23:50:32, 2012-08-08 23:59:43, 2012-08-09 00:10:51, 2012-08-10 16:43:47, 2012-08-10 23:24:04, 2012-08-11 14:13:07, 2012-08-12 01:07:18, 2012-08-12 01:52:45, 2012-08-12 01:59:58, 2012-08-12 02:39:32, 2012-08-12 21:20:16, 2012-08-12 22:15:36, 2012-08-12 22:22:08, 2012-08-14 00:01:14, 2012-08-14 22:10:57, 2012-08-15 00:34:05, 2012-08-15 01:20:52, 2012-08-15 01:34:17, 2012-08-15 13:56:58, 2012-08-15 23:23:22, 2012-08-16 00:44:43, 2012-08-16 22:07:21,

### tip

In [48]:
# Step 1: Read the CSV file
df_tip = pd.read_csv('../sampled_data/tips_sample.csv')

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the table
bulk_insert(conn, 'tip', df_tip, conflict_column=['user_id', 'business_id', 'date'])

Successfully inserted 3171 rows into tip.


In [49]:
%%sql
SELECT * FROM tip LIMIT 1;

user_id,business_id,text,date,compliment_count
imhUgZdEXe-JLZT381S6-w,bp5Mk2d0qofUeF5uLauIbg,Everything on the menu is authentic!,2017-06-14,0


# Normalized Data

## Create the data schema

### normalized business data

In [73]:
# Connect to the PostgreSQL database
conn = get_db_connection()
cursor = conn.cursor()

# Full SQL schema definition
schema_sql = """
DROP TABLE IF EXISTS location CASCADE;
DROP TABLE IF EXISTS nor_business CASCADE;

CREATE TABLE IF NOT EXISTS nor_business (
    business_id VARCHAR(22) PRIMARY KEY,
    name VARCHAR NOT NULL,
    stars NUMERIC CHECK (stars BETWEEN 0 AND 5),
    review_count INTEGER,
    is_open INTEGER CHECK (is_open IN (0, 1)),
    attributes TEXT,
    categories TEXT, 
    hours TEXT
);

CREATE TABLE IF NOT EXISTS location (
    business_id VARCHAR(22) REFERENCES nor_business (business_id) ON DELETE CASCADE,
    address VARCHAR,
    city VARCHAR,
    state VARCHAR(2),
    postal_code VARCHAR(10),
    latitude NUMERIC,
    longitude NUMERIC
);

"""

# Execute the schema to create the tables
cursor.execute(schema_sql)

# Commit and close
conn.commit()
cursor.close()
conn.close()
print("Tables created successfully.")


Tables created successfully.


In [75]:
## normalized business
# Step 1: Read the CSV file
df_nor_business = df_business[['business_id', 'name', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']]
df_nor_business

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the table
bulk_insert(conn, 'nor_business', df_nor_business, conflict_column=['business_id'])

Successfully inserted 1000 rows into nor_business.


In [76]:
%%sql
SELECT * FROM nor_business LIMIT 5;

business_id,name,stars,review_count,is_open,attributes,categories,hours
UVfYRQIr6u_WrFSY5Cv7xw,InCycle,5.0,32,1,"{'GoodForKids': 'False', 'BusinessAcceptsCreditCards': 'True', 'ByAppointmentOnly': 'False', 'BusinessParking': ""{'garage': True, 'street': True, 'validated': False, 'lot': True, 'valet': False}"", 'BikeParking': 'True', 'WheelchairAccessible': 'True'}","Active Life, Cycling Classes, Gyms, Fitness & Instruction, Barre Classes","{'Monday': '16:0-20:0', 'Tuesday': '16:0-20:0', 'Wednesday': '16:0-20:0', 'Thursday': '16:0-20:0', 'Friday': '16:0-20:0', 'Saturday': '7:30-12:0', 'Sunday': '8:0-14:0'}"
wH6QxQv31IJ-qQ-FKIc5TA,Roman Plumbing,4.5,29,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppointmentOnly': 'True', 'BusinessAcceptsCreditCards': 'True'}","Plumbing, Water Heater Installation/Repair, Contractors, Home Services, Water Purification Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', 'Wednesday': '8:0-17:0', 'Thursday': '8:0-17:0', 'Friday': '8:0-17:0', 'Saturday': '8:0-17:0'}"
K1NdCTaJWK1ezkZpPyB7-A,Desert Rain Coffee,5.0,46,0,"{'BusinessAcceptsCreditCards': 'True', 'RestaurantsTakeOut': 'True', 'RestaurantsPriceRange2': '2', 'OutdoorSeating': 'True', 'BikeParking': 'True', 'Caters': 'False', 'WiFi': ""u'free'"", 'BusinessParking': ""{u'valet': False, u'garage': None, u'street': False, u'lot': True, u'validated': None}""}","Coffee & Tea, Food","{'Monday': '7:0-15:0', 'Tuesday': '7:0-15:0', 'Wednesday': '7:0-15:0', 'Thursday': '7:0-15:0', 'Friday': '7:0-15:0', 'Sunday': '8:0-12:0'}"
juL8ovMlnjkXNACxZ8HLAQ,Sam Levitz Furniture,2.0,186,1,"{'BusinessParking': ""{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}"", 'RestaurantsPriceRange2': '2', 'BusinessAcceptsCreditCards': 'True', 'BikeParking': 'False', 'RestaurantsTakeOut': 'None', 'RestaurantsDelivery': 'None'}","Shopping, Mattresses, Home & Garden, Furniture Stores","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0', 'Wednesday': '11:0-20:0', 'Thursday': '11:0-20:0', 'Friday': '11:0-20:0', 'Saturday': '11:0-20:0', 'Sunday': '11:0-18:0'}"
2eN2pfPCear_ofmgQ0peCw,"David Caplan, CPA",4.5,13,1,,"Professional Services, Accountants",


In [77]:
%%sql
EXPLAIN ANALYZE SELECT * FROM nor_business LIMIT 5;

QUERY PLAN
Limit (cost=0.00..0.21 rows=5 width=230) (actual time=0.009..0.011 rows=5 loops=1)
-> Seq Scan on nor_business (cost=0.00..104.80 rows=2480 width=230) (actual time=0.009..0.009 rows=5 loops=1)
Planning Time: 0.069 ms
Execution Time: 0.026 ms


In [78]:
df_business.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [79]:
## location
df_location = df_business[['business_id', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude']]
df_location

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the table
bulk_insert(conn, 'location', df_location)

Successfully inserted 1000 rows into location.


In [83]:
%%sql
SELECT * FROM location LIMIT 10;

business_id,address,city,state,postal_code,latitude,longitude
UVfYRQIr6u_WrFSY5Cv7xw,736 Hanover Pl,Carmel,IN,46032,39.9697264,-86.1278146
wH6QxQv31IJ-qQ-FKIc5TA,6125 Grand Blvd,New Port Richey,FL,34652,28.247926,-82.719939
K1NdCTaJWK1ezkZpPyB7-A,1551 E Tangerine Rd,Oro Valley,AZ,85755,32.429167,-110.948631
juL8ovMlnjkXNACxZ8HLAQ,3750 W Orange Grove Rd,Tucson,AZ,85741,32.3248992,-111.0444852
2eN2pfPCear_ofmgQ0peCw,301 Andorra Glen Ct,Lafayette Hill,PA,19444,40.0884567,-75.2674119
oUnC0Mcl_nozqkOXejTQXQ,1875 S Alvernon Way,Tucson,AZ,85711,32.20042127,-110.9089026
sfOczyUZLffzSOVeVBbJrw,108 N Wayne Ave,Wayne,PA,19087,40.0444568,-75.3881712
toMVWIsPJmS7-LiLAf2AfA,3400 S Lawrence St,Philadelphia,PA,19148,39.9049727,-75.1588247
UcQUBTXcEke44ymSpX3qKw,,Boise,ID,83709,43.5516566,-116.2987901
pJTtJv3pNQxFr1CLRXj7xQ,2441 Rte 206,Eastampton,NJ,8060,39.9869773,-74.7354294


### Explore yelp_user

In [89]:
df_user.isnull().sum()
# from the result we can see there are a lot of null values for elite, so we can normalize that as well

user_id                   0
name                      0
review_count              0
yelping_since             0
useful                    0
funny                     0
cool                      0
elite                 34553
friends                   0
fans                      0
average_stars             0
compliment_hot            0
compliment_more           0
compliment_profile        0
compliment_cute           0
compliment_list           0
compliment_note           0
compliment_plain          0
compliment_cool           0
compliment_funny          0
compliment_writer         0
compliment_photos         0
dtype: int64

In [96]:
# Step 1: Split the 'elite' column into multiple rows
df_elite = df_user.assign(elite=df_user['elite'].str.split(',')).explode('elite')

# Step 2: Remove any extra spaces and drop NaN years
df_elite['elite'] = df_elite['elite'].str.strip()
df_elite = df_elite.dropna(subset=['elite'])

# Step 3: Drop duplicates in case of redundancy
df_elite = df_elite[['user_id', 'elite']].drop_duplicates()

# Step 4: Rename 'elite' to 'elite_year' 
df_elite.rename(columns={'elite': 'elite_year'}, inplace=True)

print(df_elite)


                      user_id elite_year
0      j14WgRoU_-2ZE1aw1dXrJg       2009
0      j14WgRoU_-2ZE1aw1dXrJg       2010
0      j14WgRoU_-2ZE1aw1dXrJg       2011
0      j14WgRoU_-2ZE1aw1dXrJg       2012
0      j14WgRoU_-2ZE1aw1dXrJg       2013
...                       ...        ...
41681  W1xuqnkt5o82LJ9w4bnAlw         20
41681  W1xuqnkt5o82LJ9w4bnAlw       2021
41873  O-iXRW6_2el1G-AwE5t1FA       2021
41886  Y0TYtVCQ3P7ZV50mZiBJkA         20
41886  Y0TYtVCQ3P7ZV50mZiBJkA       2021

[34199 rows x 2 columns]


In [99]:
### friends
# Normalize 'friends' column
df_friends = df_user[['user_id', 'friends']].copy()

# Split the 'friends' column by comma and explode into multiple rows
df_friends = df_friends.assign(friends=df_friends['friends'].str.split(',')).explode('friends')

# Clean: Remove extra spaces, NaN, and empty strings
df_friends['friends'] = df_friends['friends'].str.strip()  # Strip spaces
df_friends = df_friends.dropna(subset=['friends'])  # Drop NaN
df_friends = df_friends[df_friends['friends'] != '']  # Drop empty strings
df_friends = df_friends[df_friends['friends'] != 'None']

# Rename for clarity
df_friends.rename(columns={'friends': 'friend_id'}, inplace=True)

print(df_friends)


                      user_id               friend_id
0      j14WgRoU_-2ZE1aw1dXrJg  ueRPE0CX75ePGMqOFVj6IQ
0      j14WgRoU_-2ZE1aw1dXrJg  52oH4DrRvzzl8wh5UXyU0A
0      j14WgRoU_-2ZE1aw1dXrJg  E_GAXhVA1_lVC2aFpMQElA
0      j14WgRoU_-2ZE1aw1dXrJg  HwlpkOpidkZWvyjrxFk6Ag
0      j14WgRoU_-2ZE1aw1dXrJg  kuDmRGcvJhFCHEXTNH1d4Q
...                       ...                     ...
41975  kWGuleSbqm7DYJ4x43iH7Q  5X1-nf3_moBHqfhQd-5D0A
41996  btQA-pLfeZqFF74xmc935Q  -pkh2f3NLZFgTxeQMvcMPA
42018  Thy26dloVdr1f2a93gZiRw  3ifGF5eKfDsvpqPl_igwPQ
42125  NWf_iRg33goMvr_QsmAkdw  424nulIpc-FTeH6kfTIABg
42333  rj1ugAxFKI--fCMjekwZow  Ygc0mm-LHseMX_TGKElx8Q

[3761912 rows x 2 columns]


### Normalized yelp_user

In [109]:
# Connect to the PostgreSQL database
conn = get_db_connection()
cursor = conn.cursor()

# Full SQL schema definition
schema_sql = """
DROP TABLE IF EXISTS friends CASCADE;
DROP TABLE IF EXISTS elite CASCADE;
DROP TABLE IF EXISTS nor_user CASCADE;

CREATE TABLE IF NOT EXISTS nor_user (
    user_id VARCHAR(22) PRIMARY KEY,
    name TEXT,
    review_count INTEGER,
    yelping_since DATE,
    useful INTEGER,
    funny INTEGER,
    cool INTEGER,
    fans INTEGER,
    average_stars NUMERIC CHECK (average_stars BETWEEN 0 AND 5),
    compliment_hot INTEGER,
    compliment_more INTEGER,
    compliment_profile INTEGER,
    compliment_cute INTEGER,
    compliment_list INTEGER,
    compliment_note INTEGER,
    compliment_plain INTEGER,
    compliment_cool INTEGER,
    compliment_funny INTEGER,
    compliment_writer INTEGER,
    compliment_photos INTEGER
);

CREATE TABLE IF NOT EXISTS friends (
    user_id VARCHAR(22) REFERENCES nor_user(user_id) ON DELETE CASCADE,
    friend_id VARCHAR(22)
);

CREATE TABLE IF NOT EXISTS elite (
    user_id VARCHAR(22) REFERENCES nor_user(user_id) ON DELETE CASCADE,
    elite_year INTEGER
);


"""

# Execute the schema to create the tables
cursor.execute(schema_sql)

# Commit and close
conn.commit()
cursor.close()
conn.close()
print("Tables created successfully.")

Tables created successfully.


In [110]:
## insert nor_user
df_user_nor = df_user[['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos']]

# Step 2: Connect to the PostgreSQL database
conn = get_db_connection()

# Step 3: Insert data into the table
bulk_insert(conn, 'nor_user', df_user_nor, conflict_column='user_id')

Successfully inserted 42593 rows into nor_user.


In [111]:
df_elite.columns

Index(['user_id', 'elite_year'], dtype='object')

In [112]:
## insert elite
conn = get_db_connection()
bulk_insert(conn, 'elite', df_elite)

Successfully inserted 34199 rows into elite.


In [114]:
## insert friends
conn = get_db_connection()
bulk_insert(conn, 'friends', df_friends)

Successfully inserted 3761912 rows into friends.


# Query Performance Comparison