# 2. Data Analysis


## 🛠️ Design Decisions & Data Cleaning

- **Database schema**: A single `movies` table stores core metadata (`movie_id`, `title`, `year`, `vote_average`, etc.).
  - List-type fields like `genre_ids` and `genres` are stored as JSON strings for flexibility.
  - `created_at` defaults to `CURRENT_TIMESTAMP` in the schema, recording when each row was inserted, for traceability.

- **Storage format**: Cleaned data is saved to a local **SQLite** database (`data/database.db`) for reproducibility and easy querying.

- **Cleaning steps**:
  - Loaded raw JSON into a `pandas` DataFrame.
  - Converted list columns (`genre_ids`, `genres`) to JSON strings to preserve structure.
  - Ensured the output directory exists and overwrote any existing `movies` table by calling `DataFrame.to_sql` with `if_exists='replace'`.

- **Validation**: A verification function checks total row count and displays sample entries to confirm successful storage.


In [3]:
import os
import json
import sqlite3
import pandas as pd

In [13]:
# Location of the SQLite database file written by save_to_sqlite().
# Relative paths are resolved against the notebook's working directory.
DB_PATH = "../data/database.db"
# Directory holding the raw JSON input; note the trailing slash.
RAW_DIR = "../data/raw/"

def create_database_schema(db_path):
    """Create the ``movies`` table in a SQLite database if it is missing.

    Args:
        db_path: Path of the SQLite database file. ``sqlite3.connect``
            creates the file if needed, but its parent directory must
            already exist.

    Raises:
        sqlite3.Error: If the database cannot be opened or the
            ``CREATE TABLE`` statement fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # List-valued fields (genre_ids, genres) are declared as TEXT and
        # hold JSON strings.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS movies (
                movie_id INTEGER PRIMARY KEY,
                title TEXT NOT NULL,
                year INTEGER NOT NULL,
                popularity REAL,
                vote_average REAL,
                vote_count INTEGER,
                genre_ids TEXT,  -- Store as JSON string
                genres TEXT,     -- Store as JSON string
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Close the connection even when the statement raises, so the
        # database file handle is never leaked.
        conn.close()
    print(f"✅ Database schema created at {db_path}")

def save_to_sqlite(df, db_path):
    """Write a movies DataFrame to the ``movies`` table of a SQLite database.

    Args:
        df: DataFrame with ``genre_ids`` and ``genres`` columns whose values
            are JSON-serializable. The caller's DataFrame is not modified.
        db_path: Path of the SQLite database file. Missing parent
            directories are created.

    Raises:
        KeyError: If ``df`` lacks a ``genre_ids`` or ``genres`` column.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Create database schema
    create_database_schema(db_path)

    # Serialize the list columns as JSON strings for SQLite storage,
    # working on a copy so the caller's DataFrame keeps its lists.
    df_copy = df.copy()
    for column in ('genre_ids', 'genres'):
        df_copy[column] = df_copy[column].apply(json.dumps)

    conn = sqlite3.connect(db_path)
    try:
        # NOTE(review): if_exists='replace' drops the existing table and
        # recreates it from the DataFrame's columns, so the constraints and
        # the created_at default declared by create_database_schema() do not
        # survive this call. Left unchanged because appending instead could
        # fail when df carries columns the schema lacks - confirm intent.
        df_copy.to_sql('movies', conn, if_exists='replace', index=False)
    finally:
        # Always release the connection, even if the write fails.
        conn.close()

    print(f"✅ Data saved to SQLite database: {db_path}")
    print(f"   Total records: {len(df)}")

# Load data from the JSON file (assumed to have been created by the data
# collection script). os.path.join avoids the doubled slash that
# f"{RAW_DIR}/..." produces when RAW_DIR already ends with a separator.
json_file_path = os.path.join(RAW_DIR, "movies_2020_2024.json")

# Read the JSON data into a DataFrame
df = pd.read_json(json_file_path)

# Save to SQLite database
save_to_sqlite(df, DB_PATH)

# Optional: Verify the data was saved correctly
def verify_database(db_path):
    """Print the row count and up to three sample rows of the ``movies`` table.

    Args:
        db_path: Path of the SQLite database file to inspect.

    Raises:
        sqlite3.Error: If a query fails, e.g. because the ``movies`` table
            or one of the selected columns does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM movies")
        count = cursor.fetchone()[0]

        # Select the printed columns by name. With SELECT * the positions
        # depend on the column order the table happened to be created with,
        # so positional indexing could silently print the wrong fields.
        cursor.execute("SELECT title, year, vote_average FROM movies LIMIT 3")
        sample_rows = cursor.fetchall()
    finally:
        # Close the connection even if a query raises.
        conn.close()

    print("\n📊 Database verification:")
    print(f"   Total movies in database: {count}")
    print("   Sample data (first 3 rows):")
    for i, (title, year, rating) in enumerate(sample_rows, 1):
        print(f"   {i}. {title} ({year}) - Rating: {rating}")


✅ Database schema created at ../data/database.db
✅ Data saved to SQLite database: ../data/database.db
   Total records: 375


In [14]:
import sqlite3

# SQLite database file read by query_with_sqlite3() below.
DB_PATH = "../data/database.db"  # update if needed

def query_with_sqlite3(db_path=None):
    """Print summary statistics of the ``movies`` table using raw SQL.

    Reports the total row count, the number of distinct ``movie_id`` and
    ``title`` values, and every movie tied for the lowest and for the
    highest ``vote_count``.

    Args:
        db_path: Path of the SQLite database file. Defaults to the
            module-level ``DB_PATH``, looked up at call time.

    Raises:
        sqlite3.Error: If a query fails, e.g. the ``movies`` table is missing.
    """
    if db_path is None:
        db_path = DB_PATH

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        cursor.execute("SELECT COUNT(*) FROM movies")
        total_movies = cursor.fetchone()[0]
        print(f"Total number of movies in the database: {total_movies}")

        cursor.execute("SELECT COUNT(DISTINCT movie_id) FROM movies")
        unique_movies = cursor.fetchone()[0]
        print(f"Total number of unique movies by movie_id: {unique_movies}")

        cursor.execute("SELECT COUNT(DISTINCT title) FROM movies")
        unique_titles = cursor.fetchone()[0]
        print(f"Total number of unique movies by title: {unique_titles}")

        # Movies with the lowest vote_count; the subquery keeps every
        # movie that ties for the minimum.
        cursor.execute("""
            SELECT title, vote_count
            FROM movies
            WHERE vote_count = (SELECT MIN(vote_count) FROM movies)
        """)
        lowest_vote_movies = cursor.fetchall()
        print("\nMovies with the lowest number of votes:")
        for title, votes in lowest_vote_movies:
            print(f"{title}: {votes}")

        # Movies with the highest vote_count; ties are possible here too,
        # so the heading is plural like the one above.
        cursor.execute("""
            SELECT title, vote_count
            FROM movies
            WHERE vote_count = (SELECT MAX(vote_count) FROM movies)
        """)
        highest_vote_movies = cursor.fetchall()
        print("\nMovies with the highest number of votes:")
        for title, votes in highest_vote_movies:
            print(f"{title}: {votes}")
    finally:
        # Close the connection even if one of the queries raises.
        conn.close()

# Run the summary queries and print their results.
query_with_sqlite3()


Total number of movies in the database: 375
Total number of unique movies by movie_id: 375
Total number of unique movies by title: 375

Movies with the lowest number of votes:
A Complete Unknown: 1141

Movies with the highest number of votes:
Spider-Man: No Way Home: 20939
