In [1]:
import sqlite3
import config as cfg
import pandas as pd

In [2]:
# Notebook-flavoured tqdm progress bars.
from tqdm.notebook import tqdm

# Register the `progress_apply` family on pandas objects so that long-running
# `.apply` calls can display a progress bar.
tqdm.pandas()

# reviews data

In [3]:
# Load the raw reviews file (JSON Lines: one JSON object per line) into a frame.
# The encoding is pinned to UTF-8 instead of being left to the platform's locale
# default, so the free-text `text` field decodes the same way on every machine.
# NOTE(review): assumes cfg.PATHS.RAW_REVIEWS is a pathlib.Path (its .open()
# accepts `encoding`) — confirm against config.
with cfg.PATHS.RAW_REVIEWS.open(encoding="utf-8") as f:
    df = pd.read_json(f, lines=True)

df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [4]:
# Keep only the review fields used downstream, in the order they should appear.
review_columns = [
    "review_id", "user_id", "business_id",
    "stars", "text",
    "useful", "funny", "cool",
    "date",
]
df = df[review_columns]

In [5]:
# Replace the review timestamp with its age in whole days at collection time
# (an integer day count, not a timedelta).
# The subtraction is vectorised (Series arithmetic followed by `.dt.days`)
# rather than a per-row Python lambda; this is the same form the user
# `account_age` computation in this notebook uses.
review_dates = pd.to_datetime(df["date"])
df["review_age"] = (cfg.DATE_COLLECTED - review_dates).dt.days
# The raw timestamp is no longer needed once the age has been derived.
df = df.drop(columns=["date"])

  0%|          | 0/6990280 [00:00<?, ?it/s]

In [6]:
# Discard rows that are exact duplicates across every remaining column.
df = df.drop_duplicates()

In [7]:
# Move the review's own rating to a `review_stars` column (appended at the end)
# and remove the original `stars` column.
# Presumably this keeps it distinguishable from the business table's `stars`
# column once the tables are joined — confirm.
df = df.assign(review_stars=df["stars"]).drop(columns=["stars"])

In [8]:
# Persist the cleaned reviews to the `reviews` table, replacing any previous copy.
# A sqlite3 connection is not closed automatically, so it is closed explicitly
# once the write finishes, even if the write raises.
con = sqlite3.connect(cfg.PATHS.DB)
try:
    rows_written = df.to_sql("reviews", con=con, if_exists="replace", index=False)
finally:
    con.close()

# Show the number of rows written as the cell output.
rows_written

6990280

In [9]:
# Drop the reference to the reviews frame so its memory can be reclaimed
# before the next dataset is loaded.
del df

# business data

In [10]:
# Load the raw business file (JSON Lines: one JSON object per line) into a frame.
# The encoding is pinned to UTF-8 instead of being left to the platform's locale
# default, so business names and addresses decode the same way on every machine.
# NOTE(review): assumes cfg.PATHS.RAW_BUSINESS is a pathlib.Path (its .open()
# accepts `encoding`) — confirm against config.
with cfg.PATHS.RAW_BUSINESS.open(encoding="utf-8") as f:
    df = pd.read_json(f, lines=True)

df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [11]:
# Keep only the business fields used downstream.
business_columns = ["business_id", "city", "stars", "categories"]
df = df[business_columns]

In [12]:
# Store every `categories` value as a Python string under a new column name.
# NOTE(review): str() also turns a missing value into the literal text "None" —
# confirm that is intended.
stringified = df["categories"].progress_apply(str)
df["categories_list_string"] = stringified

  0%|          | 0/150346 [00:00<?, ?it/s]

In [13]:
# The original `categories` column is superseded by `categories_list_string`.
df = df.drop(columns=["categories"])

In [14]:
# Persist the cleaned businesses to the `business` table, replacing any previous copy.
# A sqlite3 connection is not closed automatically, so it is closed explicitly
# once the write finishes, even if the write raises.
con = sqlite3.connect(cfg.PATHS.DB)
try:
    rows_written = df.to_sql("business", con=con, if_exists="replace", index=False)
finally:
    con.close()

# Show the number of rows written as the cell output.
rows_written

150346

In [15]:
# Drop the reference to the business frame so its memory can be reclaimed
# before the next dataset is loaded.
del df

# user data

In [16]:
# Load the raw user file (JSON Lines: one JSON object per line) into a frame.
# The encoding is pinned to UTF-8 instead of being left to the platform's locale
# default, so user names decode the same way on every machine.
# NOTE(review): assumes cfg.PATHS.RAW_USER is a pathlib.Path (its .open()
# accepts `encoding`) — confirm against config.
with cfg.PATHS.RAW_USER.open(encoding="utf-8") as f:
    df = pd.read_json(f, lines=True)

df.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [17]:
# Keep only the user fields used downstream.
user_columns = ["user_id", "review_count", "yelping_since", "fans"]
df = df[user_columns]

In [18]:
# Account age: whole days between the `yelping_since` timestamp and the
# collection date.
signup_time = pd.to_datetime(df["yelping_since"])
elapsed = cfg.DATE_COLLECTED - signup_time
df["account_age"] = elapsed.dt.days

In [19]:
# The raw sign-up timestamp is superseded by `account_age`.
df = df.drop(columns=["yelping_since"])

In [20]:
# Persist the cleaned users to the `users` table, replacing any previous copy.
# A sqlite3 connection is not closed automatically, so it is closed explicitly
# once the write finishes, even if the write raises.
con = sqlite3.connect(cfg.PATHS.DB)
try:
    rows_written = df.to_sql("users", con=con, if_exists="replace", index=False)
finally:
    con.close()

# Show the number of rows written as the cell output.
rows_written

1987897

In [21]:
# Drop the reference to the users frame so its memory can be reclaimed
# before the joined data is read back.
del df

# join reviews to business and user data

In [22]:
# Join every review to its business and to its author via the stored tables.
# `SELECT *` returns the key columns (`business_id`, `user_id`) once per table,
# so the resulting frame carries those labels more than once.
join_query = """
    SELECT *
    FROM reviews
    JOIN business ON reviews.business_id = business.business_id
    JOIN users ON reviews.user_id = users.user_id
"""

# A sqlite3 connection is not closed automatically, so it is closed explicitly
# once the read finishes, even if the query raises.
con = sqlite3.connect(cfg.PATHS.DB)
try:
    df = pd.read_sql(join_query, con=con)
finally:
    con.close()

In [23]:
# Keep a reproducible random subset: a `cfg.proportion` fraction of the joined
# rows, drawn with a fixed seed.
sample_seed = 42
df = df.sample(frac=cfg.proportion, random_state=sample_seed)

In [24]:
# Display the sampling fraction used by the sampling step.
cfg.proportion

0.2

In [25]:
# Remove the identifier columns from the joined frame; every column carrying
# one of these labels is dropped.
id_columns = ["business_id", "user_id", "review_id"]
df = df.drop(columns=id_columns)

In [26]:
# Preview the first rows of the sampled, joined frame.
df.head()

Unnamed: 0,text,useful,funny,cool,review_age,review_stars,city,stars,categories_list_string,review_count,fans,account_age
5527377,A cheaper alternative to a full blown AYCE hot...,17,3,7,1163,4,Philadelphia,4.0,"Taiwanese, Hot Pot, Restaurants, Bubble Tea, Food",140,6,3602
389000,We liked this place so much I ate there again ...,1,0,1,1299,5,Tucson,4.0,"Restaurants, Salad, Pakistani, Indian, Cocktai...",31,0,4094
556758,"Pretty limited menu, but good authentic food. ...",2,1,1,1577,5,Philadelphia,4.0,"Halal, Restaurants, Mexican, Steakhouses",21,0,3902
4074689,I have a 2014 Ford Escape and a very strange t...,0,0,0,2480,4,Tampa,3.5,"Automotive, Transmission Repair, Car Dealers, ...",1,0,2480
3687671,I've been a member of CrossFit En Fuego for ne...,3,1,1,3015,5,Land O Lakes,5.0,"Interval Training Gyms, Trainers, Fitness & In...",145,7,3791


# output results as pickled dataframe

In [27]:
# Write the final frame to disk as a pickle at the configured path.
df.to_pickle(cfg.PATHS.CLEAN_DATA_PICKLE)