In [None]:
# Magic cell to establish database connection
import sqlite3

import polars as pl


# Open the SQLite metadata database. `conn` and `cursor` are left open on
# purpose: the DataFrame loads and the schema-listing cell below reuse them.
conn = sqlite3.connect(r"F:\OF.DL\__user_data__\sites\OnlyFans\jameswithlola\Metadata\user_data.db")
cursor = conn.cursor()
print("Database connected successfully!")

# Profile the `profiles` table before loading it: total row count, then how
# many rows actually carry a user_id / a username.
print("\nInvestigating profiles table:")
profile_count = cursor.execute("SELECT COUNT(*) FROM profiles").fetchone()[0]
print(f"Total profiles: {profile_count}")

profile_with_user_id = cursor.execute(
    "SELECT COUNT(*) FROM profiles WHERE user_id IS NOT NULL"
).fetchone()[0]
print(f"Profiles with user_id: {profile_with_user_id}")

profile_with_username = cursor.execute(
    "SELECT COUNT(*) FROM profiles WHERE username IS NOT NULL"
).fetchone()[0]
print(f"Profiles with username: {profile_with_username}")

# Peek at a handful of raw rows to see the column layout.
sample_profiles = cursor.execute("SELECT * FROM profiles LIMIT 5").fetchall()
print(f"Sample profiles: {sample_profiles}")

# Load each table into its own polars DataFrame. Every query names its
# columns explicitly; the SQL is kept in a variable next to the load that
# uses it.

# Messages: a NULL user_id is replaced by 0 and cast to INTEGER in SQL so the
# column can be joined against profiles.user_id later on.
messages_sql = """
    SELECT 
        id,
        post_id,
        CAST(COALESCE(user_id, 0) AS INTEGER) as user_id,
        text,
        price,
        paid,
        archived,
        created_at
    FROM messages
    """
df_messages = pl.read_database(connection=conn, query=messages_sql)

posts_sql = """
    SELECT 
        id,
        post_id,
        text,
        price,
        paid,
        archived,
        created_at
    FROM posts
    """
df_posts = pl.read_database(connection=conn, query=posts_sql)

stories_sql = """
    SELECT 
        id,
        post_id,
        text,
        price,
        paid,
        archived,
        created_at
    FROM stories
    """
df_stories = pl.read_database(connection=conn, query=stories_sql)

# Products carry the same columns as posts plus a title.
products_sql = """
    SELECT 
        id,
        post_id,
        text,
        price,
        paid,
        archived,
        created_at,
        title
    FROM products
    """
df_products = pl.read_database(connection=conn, query=products_sql)

# Profiles are read without any WHERE filter; NULL user_id / username are
# replaced in SQL by 0 / 'unknown' respectively.
profiles_sql = """
    SELECT 
        id,
        CAST(COALESCE(user_id, 0) AS INTEGER) as user_id,
        COALESCE(username, 'unknown') as username
    FROM profiles
    """
df_profiles = pl.read_database(connection=conn, query=profiles_sql)

medias_sql = """
    SELECT 
        id,
        media_id,
        post_id,
        link,
        directory,
        filename,
        size,
        api_type,
        media_type,
        preview,
        linked,
        downloaded,
        created_at
    FROM medias
    """
df_medias = pl.read_database(connection=conn, query=medias_sql)

# Show the inferred dtypes of the two frames that are joined on user_id
# further down, plus the size of the profiles frame.
print("\nMessages Schema:", df_messages.schema, sep="\n")
print("\nProfiles Schema:", df_profiles.schema, sep="\n")
print(f"Profiles DataFrame shape: {df_profiles.shape}")

# Example 1: pair every post with its media rows. A left join keeps posts
# that have no matching media row.
posts_with_media = df_posts.join(df_medias, on="post_id", how="left").select(
    "post_id",
    "text",
    "price",
    "created_at",
    "filename",
    "media_type",
    "downloaded",
)

print("\nExample 1: Posts with their media files:")
print(posts_with_media.head())

# Example 2: attach the sender's username to every message.
# Both branches end with the same six columns, so `messages_with_users` has
# one shape no matter whether any profiles were available.
# NOTE(review): NULL user_ids are coalesced to 0 in both the messages and the
# profiles query, so if several profiles lack a user_id, every message without
# one is repeated once per such profile by this join - confirm that is intended.
message_columns = ["id", "text", "username", "created_at", "price", "paid"]

if df_profiles.height > 0:
    messages_with_users = df_messages.join(
        df_profiles.select(["user_id", "username"]),
        on="user_id",
        how="left",
    ).select(message_columns)
    print("\nExample 2: Messages with user information:")
    print(messages_with_users.head())
else:
    print("\nExample 2: Skipping messages with user info - no profiles available")
    # No profiles to join against: label every message's sender as "unknown".
    messages_with_users = df_messages.with_columns(
        pl.lit("unknown").alias("username")
    ).select(message_columns)

# Example 3: per-media-type totals - number of non-null media ids, combined
# and average size, and the sum of the `downloaded` column.
media_stats = df_medias.group_by("media_type").agg(
    pl.col("media_id").count().alias("total_files"),
    pl.col("size").sum().alias("total_size"),
    pl.col("size").mean().alias("avg_size"),
    pl.col("downloaded").sum().alias("downloaded_count"),
)

print("\nExample 3: Media statistics by type:")
print(media_stats)

# Example 4 (diagnostics): report the shape of every content table before the
# timeline is assembled.
print(
    "\nDebugging content table shapes:",
    f"Posts: {df_posts.shape}",
    f"Stories: {df_stories.shape}",
    f"Messages: {df_messages.shape}",
    f"Products: {df_products.shape}",
    sep="\n",
)

# Show what the stories look like when there are any; otherwise say so.
if df_stories.height == 0:
    print("Stories table is empty!")
else:
    print(f"Stories schema: {df_stories.schema}")
    print(f"Stories sample:\n{df_stories.head()}")

def get_content_with_media_count(content_df, content_type, medias_df=None):
    """Summarise the media attached to each row of a content table.

    Args:
        content_df: Frame with at least ``post_id``, ``text`` and
            ``created_at`` columns.
        content_type: Label written to the ``content_type`` column of every
            returned row, e.g. ``"post"``.
        medias_df: Media frame to join against; it must provide ``post_id``,
            ``media_id`` and ``size``. Defaults to the module-level
            ``df_medias``, so existing two-argument calls behave as before.

    Returns:
        One row per distinct ``(post_id, text, created_at)`` with
        ``media_count`` (number of non-null ``media_id`` values),
        ``total_media_size`` (sum of ``size``) and ``content_type``.
        An empty ``content_df`` yields an empty frame with those six columns.
    """
    # Nothing to aggregate: hand back an empty, explicitly typed frame.
    if content_df.height == 0:
        print(f"Skipping empty {content_type} table")
        return pl.DataFrame(
            schema={
                "post_id": pl.Int64,
                "text": pl.String,
                "created_at": pl.String,
                "media_count": pl.UInt32,
                "total_media_size": pl.Int64,
                "content_type": pl.String,
            }
        )

    # Resolve the default at call time so the global may be (re)defined later.
    if medias_df is None:
        medias_df = df_medias

    # NOTE(review): media rows are matched on post_id alone; if ids can repeat
    # across different kinds of content, counts would mix them - confirm.
    return (
        content_df
        .join(medias_df, on="post_id", how="left")
        .group_by(["post_id", "text", "created_at"])
        .agg(
            pl.count("media_id").alias("media_count"),
            pl.sum("size").alias("total_media_size"),
        )
        .with_columns(pl.lit(content_type).alias("content_type"))
    )

# Assemble the timeline from whichever content tables actually contain rows,
# in the order posts, stories, messages, products.
timeline_parts = [
    get_content_with_media_count(frame, label)
    for frame, label in (
        (df_posts, "post"),
        (df_stories, "story"),
        (df_messages, "message"),
        (df_products, "product"),
    )
    if frame.shape[0] > 0
]

if not timeline_parts:
    print("\nExample 4: No content available for timeline")
else:
    # Newest content first.
    timeline = pl.concat(timeline_parts).sort("created_at", descending=True)
    print("\nExample 4: Content timeline with media counts:")
    print(timeline.head())

In [None]:
# Look up one media file by name. `literal=True` makes this a plain substring
# test; by default the pattern is treated as a regular expression, in which
# the "." before the extension would match any character.
posts_with_media.filter(
    pl.col("filename").str.contains(
        "OnlyFans - jameswithlola - 2023-02-18 - 520622078 - 0hb797ocpfel32t0z87dl_source.mp4",
        literal=True,
    )
)

In [None]:
# List every table in the database, then dump each table's column layout.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("\nAvailable tables in the database:")
for table in tables:
    print(table[0])

print("\nDatabase Schema:")
print("-" * 50)
for table in tables:
    table_name = table[0]
    # Quote the identifier (doubling any embedded double quotes) so that table
    # names containing spaces, punctuation or SQL keywords are still accepted.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"PRAGMA table_info({quoted_name})")
    columns = cursor.fetchall()

    header = f"Table: {table_name}"
    print(f"\n{header}")
    print("-" * len(header))
    for col in columns:
        # col contains: (id, name, type, notnull, default_value, pk)
        print(
            f"Column: {col[1]:<20} Type: {col[2]:<10} {'Primary Key' if col[5] else ''}"
        )
    print()

In [None]:
# Follow-up analysis on the DataFrames built in the first cell.

# Size of every loaded frame.
print(
    "\nDataFrame shapes:",
    f"Messages: {df_messages.shape}",
    f"Posts: {df_posts.shape}",
    f"Stories: {df_stories.shape}",
    f"Products: {df_products.shape}",
    f"Profiles: {df_profiles.shape}",
    f"Medias: {df_medias.shape}",
    sep="\n",
)

# How many distinct user_ids appear on each side of the messages/profiles join.
print(f"\nUnique user_ids in messages: {df_messages['user_id'].n_unique()}")
print(f"Unique user_ids in profiles: {df_profiles['user_id'].n_unique()}")

# Row count per content type.
print(
    "\nContent breakdown:",
    f"Posts: {df_posts.shape[0]}",
    f"Stories: {df_stories.shape[0]}",
    f"Messages: {df_messages.shape[0]}",
    f"Products: {df_products.shape[0]}",
    sep="\n",
)

# Media rows in total, and those whose `downloaded` column equals 1.
print(
    "\nMedia files:",
    f"Total media files: {df_medias.shape[0]}",
    f"Downloaded files: {df_medias.filter(pl.col('downloaded') == 1).height}",
    sep="\n",
)

# The three most recent posts and messages by created_at.
print("\nMost recent posts:")
recent_posts = df_posts.sort("created_at", descending=True).head(3)
print(recent_posts.select("post_id", "text", "created_at"))

print("\nMost recent messages:")
recent_messages = df_messages.sort("created_at", descending=True).head(3)
print(recent_messages.select("id", "text", "created_at", "price"))
