# Introduction

### Project initialization and setup

Importing all of the libraries that will be used. In the project.

In [3]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Display options (make this clearer)

In [4]:

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

Explain what this is

In [5]:
DB_PATH = "viewer_interactions.db"

try:
    conn = sqlite3.connect(DB_PATH)
    print("Connected successfully!")
except sqlite3.Error as e:
    print("Connection failed:", e)

Connected successfully!


Listing all the tables

In [6]:
tables_query = """
               SELECT name
               FROM sqlite_master
               WHERE type='table'
               ORDER BY name; \
               """

tables_df = pd.read_sql_query(tables_query, conn)
print("Tables in the database:")
display(tables_df)

Tables in the database:


Unnamed: 0,name
0,data_dictionary
1,movie_statistics
2,movies
3,sqlite_stat1
4,sqlite_stat4
5,user_statistics
6,viewer_ratings


In [7]:
table_names = tables_df["name"].tolist()

schemas = {}

for table in table_names:
    pragma_query = f"PRAGMA table_info({table});"
    schema_df = pd.read_sql_query(pragma_query, conn)
    schemas[table] = schema_df
    print(f"\nSchema for table '{table}':")
    display(schema_df)


Schema for table 'data_dictionary':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,table_name,TEXT,0,,0
1,1,field_name,TEXT,0,,0
2,2,data_type,TEXT,0,,0
3,3,description,TEXT,0,,0



Schema for table 'movie_statistics':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,total_ratings,REAL,0,,0
2,2,avg_rating,REAL,0,,0
3,3,std_rating,REAL,0,,0
4,4,min_rating,REAL,0,,0
5,5,max_rating,REAL,0,,0
6,6,unique_users,REAL,0,,0
7,7,first_rating_date,TEXT,0,,0
8,8,last_rating_date,TEXT,0,,0
9,9,year_of_release,REAL,0,,0



Schema for table 'movies':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,year_of_release,REAL,0,,0
2,2,title,TEXT,0,,0



Schema for table 'sqlite_stat1':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,tbl,,0,,0
1,1,idx,,0,,0
2,2,stat,,0,,0



Schema for table 'sqlite_stat4':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,tbl,,0,,0
1,1,idx,,0,,0
2,2,neq,,0,,0
3,3,nlt,,0,,0
4,4,ndlt,,0,,0
5,5,sample,,0,,0



Schema for table 'user_statistics':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,customer_id,INTEGER,0,,0
1,1,total_ratings,REAL,0,,0
2,2,avg_rating,REAL,0,,0
3,3,std_rating,REAL,0,,0
4,4,min_rating,REAL,0,,0
5,5,max_rating,REAL,0,,0
6,6,unique_movies,REAL,0,,0
7,7,first_rating_date,TEXT,0,,0
8,8,last_rating_date,TEXT,0,,0
9,9,activity_days,REAL,0,,0



Schema for table 'viewer_ratings':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,customer_id,INTEGER,0,,0
2,2,rating,REAL,0,,0
3,3,date,TEXT,0,,0
4,4,anomalous_date,INTEGER,0,,0


Creating a dictionary of type table_name -> DataFrame

In [None]:
# dfs = {
#    "interactions": DataFrame with columns [user_id, movie_id, rating, timestamp, ...],
#    "movies":       DataFrame with columns [movie_id, title, genres, year, ...],
#    "users":        DataFrame with columns [user_id, age, country, ...]
# }

Data frame shape where shape is the number of rows and the second number is the number of columns. We are specifically grabbing the names of the sets of tables

In [9]:
dfs = {}

for t in table_names:
    df = pd.read_sql_query(f"SELECT * FROM {t};", conn)
    dfs[t] = df
    print(f"\nLoaded table '{t}' with shape {df.shape}")


Loaded table 'data_dictionary' with shape (31, 4)

Loaded table 'movie_statistics' with shape (16015, 11)

Loaded table 'movies' with shape (18008, 3)

Loaded table 'sqlite_stat1' with shape (5, 3)

Loaded table 'sqlite_stat4' with shape (0, 6)

Loaded table 'user_statistics' with shape (438780, 10)

Loaded table 'viewer_ratings' with shape (4025000, 5)


Example of using the dfs dictionary

In [25]:
movies = dfs['movies']
movie = movies[movies['year_of_release'] == 2002]
print(movie)


       movie_id  year_of_release                                       title
20           21           2002.0                           Strange Relations
27           28           2002.0                             Lilo and Stitch
41           42           2002.0                      Searching for Paradise
50           51           2002.0  Jonah: A VeggieTales Movie: Bonus Material
51           52           2002.0                     The Weather Underground
...         ...              ...                                         ...
17865     18477           2002.0                                 First Woman
17916     20084           2002.0                               Silent Master
17941     20713           2002.0                                  My Warrior
17950     19615           2002.0                                   Dead Life
18001     19738           2002.0                                First Island

[1318 rows x 3 columns]


Giacomo thing

In [None]:
    tables = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';",
        conn
    )['name'].tolist()

    print("=== DATA DICTIONARY ===\n")

    for table in tables:
        print(f"Table: {table}")
        print("-" * (7 + len(table)))

        # Get actual column info from PRAGMA but filter to nice output
        schema = pd.read_sql(f"PRAGMA table_info('{table}')", conn)

        # Keep only real schema fields you want (remove cid, default, pk if desired)
        clean_schema = schema[['name', 'type', ]]

        print(clean_schema.to_string(index=False))
        print("\n")
    

=== DATA DICTIONARY ===

Table: viewer_ratings
---------------------
          name    type
      movie_id INTEGER
   customer_id INTEGER
        rating    REAL
          date    TEXT
anomalous_date INTEGER


Table: movies
-------------
           name    type
       movie_id INTEGER
year_of_release    REAL
          title    TEXT


Table: user_statistics
----------------------
             name    type
      customer_id INTEGER
    total_ratings    REAL
       avg_rating    REAL
       std_rating    REAL
       min_rating    REAL
       max_rating    REAL
    unique_movies    REAL
first_rating_date    TEXT
 last_rating_date    TEXT
    activity_days    REAL


Table: movie_statistics
-----------------------
             name    type
         movie_id INTEGER
    total_ratings    REAL
       avg_rating    REAL
       std_rating    REAL
       min_rating    REAL
       max_rating    REAL
     unique_users    REAL
first_rating_date    TEXT
 last_rating_date    TEXT
  year_of_release    RE