# 01 — Data Preparation

Load , filter to 2008–2012, build SQLite database with normalised schema.

In [1]:
import sys, json, sqlite3, time
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd()
if (PROJECT_ROOT / "src").exists():
    sys.path.insert(0, str(PROJECT_ROOT))
else:
    PROJECT_ROOT = PROJECT_ROOT.parent
    sys.path.insert(0, str(PROJECT_ROOT))

from src.utils import *
from src.data_processing import build_database, build_sample_database

## 1.1 Inspect Raw JSON

In [2]:
# Peek at first 3 records
with open(REVIEW_JSON_PATH) as f:
    for i, line in enumerate(f):
        if i >= 3:
            break
        r = json.loads(line)
        print(json.dumps(r, indent=2, ensure_ascii=False)[:600])
        print("---")

{
  "ratings": {
    "service": 5.0,
    "cleanliness": 5.0,
    "overall": 5.0,
    "value": 5.0,
    "location": 5.0,
    "sleep_quality": 5.0,
    "rooms": 5.0
  },
  "title": "“Truly is \"Jewel of the Upper Wets Side\"”",
  "text": "Stayed in a king suite for 11 nights and yes it cots us a bit but we were happy with the standard of room, the location and the friendliness of the staff. Our room was on the 20th floor overlooking Broadway and the madhouse of the Fairway Market. Room was quite with no noise evident from the hallway or adjoining rooms. It was great to be able to open windows wh
---
{
  "ratings": {
    "service": 5.0,
    "cleanliness": 5.0,
    "overall": 5.0,
    "value": 5.0,
    "location": 5.0,
    "sleep_quality": 5.0,
    "rooms": 5.0
  },
  "title": "“My home away from home!”",
  "text": "On every visit to NYC, the Hotel Beacon is the place we love to stay. So conveniently located to Central Park, Lincoln Center and great local restaurants. The rooms are lovely 

## 1.2 Build the Full Database

This reads  line-by-line, filters to years 2008–2012, and inserts into a normalised SQLite DB with three tables: , , .

In [4]:
# Build the database (this may take a few minutes for ~550K reviews)
t0 = time.time()
total = build_database()
print(f"Inserted {total:,} reviews in {time.time()-t0:.1f}s")

[data_processing] Reading /Users/adrian/Workshop/5126/Assignment1/review.json …
  … 10,000 reviews inserted (10,728 lines scanned)
  … 20,000 reviews inserted (22,185 lines scanned)
  … 30,000 reviews inserted (32,598 lines scanned)
  … 40,000 reviews inserted (44,540 lines scanned)
  … 50,000 reviews inserted (55,966 lines scanned)
  … 60,000 reviews inserted (68,136 lines scanned)
  … 70,000 reviews inserted (80,562 lines scanned)
  … 80,000 reviews inserted (93,199 lines scanned)
  … 90,000 reviews inserted (107,094 lines scanned)
  … 100,000 reviews inserted (119,268 lines scanned)
  … 110,000 reviews inserted (131,364 lines scanned)
  … 120,000 reviews inserted (142,586 lines scanned)
  … 130,000 reviews inserted (153,988 lines scanned)
  … 140,000 reviews inserted (165,670 lines scanned)
  … 150,000 reviews inserted (177,723 lines scanned)
  … 160,000 reviews inserted (190,385 lines scanned)
  … 170,000 reviews inserted (202,447 lines scanned)
  … 180,000 reviews inserted (213,42

## 1.3 Verify Database

In [8]:
conn = get_db_connection()
for table in ["hotels", "authors", "reviews"]:
    count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
    print(f"{table:>10s}: {count:,} rows")
conn.close()

    hotels: 3,888 rows
   authors: 522,156 rows
   reviews: 753,445 rows


## 1.4 Schema Information

In [9]:
conn = get_db_connection()
cursor = conn.execute("SELECT sql FROM sqlite_master WHERE type='table'")
for (ddl,) in cursor:
    print(ddl)
    print()
conn.close()

CREATE TABLE hotels (
    hotel_id   INTEGER PRIMARY KEY,   -- offering_id from the source data
    num_reviews INTEGER DEFAULT 0      -- denormalized review count
)

CREATE TABLE authors (
    author_id          TEXT PRIMARY KEY,   -- unique hash id
    username           TEXT,
    location           TEXT,
    num_cities         INTEGER,
    num_helpful_votes  INTEGER,
    num_reviews        INTEGER,
    num_type_reviews   INTEGER
)

CREATE TABLE reviews (
    review_id           INTEGER PRIMARY KEY,
    hotel_id            INTEGER NOT NULL,
    author_id           TEXT,
    title               TEXT,
    text                TEXT,
    date                TEXT,               -- original string, e.g. 'December 17, 2012'
    date_parsed         TEXT,               -- ISO format YYYY-MM-DD
    year                INTEGER,
    month               INTEGER,
    date_stayed         TEXT,
    rating_service      REAL,
    rating_cleanliness  REAL,
    rating_overall      REAL,
    rating_value 

## 1.5 Data Quality Checks

In [11]:
conn = get_db_connection()
df = pd.read_sql_query("SELECT * FROM reviews LIMIT 50000", conn)
print("Shape:", df.shape)
print("Null counts:")
print(df.isnull().sum())
print("Year distribution:")
print(df["year"].value_counts().sort_index())
conn.close()

Shape: (50000, 19)
Null counts:
review_id                   0
hotel_id                    0
author_id                   0
title                       0
text                        0
date                        0
date_parsed                 0
year                        0
month                       0
date_stayed              2348
rating_service           6593
rating_cleanliness       5026
rating_overall              0
rating_value             9358
rating_location          8956
rating_sleep_quality    50000
rating_rooms             4749
num_helpful_votes           0
via_mobile                  0
dtype: int64
Year distribution:
year
2008    50000
Name: count, dtype: int64


## 1.6 Build Sample Database for TAs

In [13]:
count = build_sample_database()
print(f"Sample DB created with {count:,} reviews")

[data_processing] Sample DB → 5,500 reviews → /Users/adrian/Workshop/5126/Assignment1/data/reviews_sample.db
Sample DB created with 5,500 reviews


## 1.7 Summary Statistics

In [14]:
conn = get_db_connection()
stats = pd.read_sql_query("""
    SELECT
        COUNT(*) AS total_reviews,
        COUNT(DISTINCT hotel_id) AS total_hotels,
        COUNT(DISTINCT author_id) AS total_authors,
        MIN(date_parsed) AS earliest_date,
        MAX(date_parsed) AS latest_date,
        ROUND(AVG(rating_overall), 2) AS avg_overall_rating
    FROM reviews
""", conn)
print(stats.T.to_string())
conn.close()

                             0
total_reviews           753445
total_hotels              3888
total_authors           522157
earliest_date       2008-01-01
latest_date         2012-12-20
avg_overall_rating        3.98
