# 2. Data Analysis

In [8]:
import os
import json
import sqlite3
import pandas as pd

In [9]:
# Path of the SQLite database file used by the functions below
# (relative path, resolved against the current working directory).
DB_PATH = "../data/database.db"
# Directory from which the raw movies JSON file is loaded.
RAW_DIR = "../data/raw/"

def create_database_schema(db_path):
    """Create the ``movies`` table in the SQLite database at ``db_path``.

    The table is created with ``CREATE TABLE IF NOT EXISTS``, so calling this
    function again on an existing database leaves the table untouched.

    Args:
        db_path: Path of the SQLite database file to open.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Create movies table
        conn.execute('''
            CREATE TABLE IF NOT EXISTS movies (
                movie_id INTEGER PRIMARY KEY,
                title TEXT NOT NULL,
                year INTEGER NOT NULL,
                popularity REAL,
                vote_average REAL,
                vote_count INTEGER,
                genre_ids TEXT,  -- Store as JSON string
                genres TEXT,     -- Store as JSON string
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
    finally:
        # Always release the connection, even when the DDL statement fails;
        # otherwise the handle on the database file would be leaked.
        conn.close()
    print(f"✅ Database schema created at {db_path}")

def save_to_sqlite(df, db_path):
    """Save a movies DataFrame to the ``movies`` table of a SQLite database.

    The ``genre_ids`` and ``genres`` columns are serialized to JSON strings
    before writing. The caller's DataFrame is not modified; the conversion
    is done on a copy.

    Args:
        df: DataFrame containing at least ``genre_ids`` and ``genres`` columns.
        db_path: Path of the SQLite database file. Its parent directory is
            created when missing.
    """
    # Ensure the directory exists. os.path.dirname() returns '' for a bare
    # filename such as "database.db", and os.makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    # Create database schema
    create_database_schema(db_path)

    # Convert list columns to JSON strings for SQLite storage
    df_copy = df.copy()
    df_copy['genre_ids'] = df_copy['genre_ids'].apply(json.dumps)
    df_copy['genres'] = df_copy['genres'].apply(json.dumps)

    # Save to SQLite
    conn = sqlite3.connect(db_path)
    try:
        # NOTE(review): if_exists='replace' drops the existing "movies" table
        # and lets pandas recreate it from the DataFrame's columns, so the
        # constraints declared in create_database_schema() (PRIMARY KEY,
        # NOT NULL, created_at default) are not retained. Confirm this is
        # intended before relying on that schema.
        df_copy.to_sql('movies', conn, if_exists='replace', index=False)
    finally:
        # Close the connection even if the write fails.
        conn.close()

    print(f"✅ Data saved to SQLite database: {db_path}")
    print(f"   Total records: {len(df)}")

# Load data from JSON file (assuming it was created by your data collection script).
# os.path.join is used instead of string formatting because RAW_DIR already
# ends with a separator, which previously produced a doubled "//" in the path.
json_file_path = os.path.join(RAW_DIR, "movies_2020_2024.json")

# Read the JSON data into a DataFrame
df = pd.read_json(json_file_path)

# Save to SQLite database
save_to_sqlite(df, DB_PATH)

# Optional: Verify the data was saved correctly
def verify_database(db_path):
    """Print a row count and a three-row sample of the ``movies`` table.

    Args:
        db_path: Path of the SQLite database file to inspect.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get table info
        cursor.execute("SELECT COUNT(*) FROM movies")
        count = cursor.fetchone()[0]

        # Select the displayed columns by name. "SELECT *" with positional
        # indexes depends on the table's column order, which follows the
        # DataFrame (not the declared schema) once the table has been
        # rewritten by DataFrame.to_sql(..., if_exists='replace').
        cursor.execute("SELECT title, year, vote_average FROM movies LIMIT 3")
        sample_rows = cursor.fetchall()
    finally:
        # Close the connection even if one of the queries fails.
        conn.close()

    print(f"\n📊 Database verification:")
    print(f"   Total movies in database: {count}")
    print(f"   Sample data (first 3 rows):")
    for i, (title, year, rating) in enumerate(sample_rows, 1):
        print(f"   {i}. {title} ({year}) - Rating: {rating}")

# Uncomment to verify
# verify_database(DB_PATH)

✅ Database schema created at ../data/database.db
✅ Data saved to SQLite database: ../data/database.db
   Total records: 375


In [10]:
def query_with_sqlite3():
    """Run example raw SQL queries against the ``movies`` table and print results.

    Opens the database at the module-level ``DB_PATH`` and prints:
    the number of movies from 2024, the ten rows with the highest
    ``popularity``, the number of rows with ``vote_average >= 7.5``, and the
    ten rows with the highest ``vote_count``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()

        # Example 1: Get all movies from 2024
        cursor.execute("SELECT * FROM movies WHERE year = 2024")
        movies_2024 = cursor.fetchall()
        print(f"Found {len(movies_2024)} movies from 2024")

        # Example 2: Get top 10 most popular movies
        cursor.execute("""
            SELECT title, year, popularity, vote_average 
            FROM movies 
            ORDER BY popularity DESC 
            LIMIT 10
        """)
        top_movies = cursor.fetchall()
        print("\nTop 10 most popular movies:")
        for movie in top_movies:
            print(f"  {movie[0]} ({movie[1]}) - Popularity: {movie[2]:.1f}")

        # Example 3: Get movies with high ratings (>= 7.5)
        cursor.execute("""
            SELECT title, year, vote_average, vote_count
            FROM movies 
            WHERE vote_average >= 7.5 
            ORDER BY vote_average DESC
        """)
        high_rated = cursor.fetchall()
        print(f"\nFound {len(high_rated)} highly rated movies (>= 7.5)")

        # Example 4: Get top 10 most rated movies by vote count
        cursor.execute("""
            SELECT title, year, vote_count, vote_average
            FROM movies
            ORDER BY vote_count DESC
            LIMIT 10
        """)
        most_rated = cursor.fetchall()
        print("\nTop 10 most rated movies:")
        for movie in most_rated:
            print(f"  {movie[0]} ({movie[1]}) - Votes: {movie[2]}, Rating: {movie[3]:.1f}")
    finally:
        # Close the connection even if a query or the output formatting
        # raises, so the database handle is not leaked.
        conn.close()

# Run the example queries against the database at DB_PATH and print the results.
query_with_sqlite3()


Found 75 movies from 2024

Top 10 most popular movies:
  My Fault (2023) - Popularity: 63.4
  Moana 2 (2024) - Popularity: 59.8
  Sonic the Hedgehog 3 (2024) - Popularity: 44.5
  Deadpool & Wolverine (2024) - Popularity: 41.6
  Your Fault (2024) - Popularity: 40.7
  Venom: The Last Dance (2024) - Popularity: 38.7
  365 Days: This Day (2022) - Popularity: 38.6
  Despicable Me 4 (2024) - Popularity: 37.4
  Jurassic World Dominion (2022) - Popularity: 37.1
  Inside Out 2 (2024) - Popularity: 36.6

Found 88 highly rated movies (>= 7.5)

Top 10 most rated movies:
  Spider-Man: No Way Home (2021) - Votes: 20937, Rating: 7.9
  Dune (2021) - Votes: 13756, Rating: 7.8
  Avatar: The Way of Water (2022) - Votes: 12714, Rating: 7.6
  The Batman (2022) - Votes: 11085, Rating: 7.7
  Soul (2020) - Votes: 10859, Rating: 8.1
  Venom: Let There Be Carnage (2021) - Votes: 10749, Rating: 6.8
  Black Widow (2021) - Votes: 10675, Rating: 7.2
  Oppenheimer (2023) - Votes: 10502, Rating: 8.1
  Birds of Prey (