# Introduction

### Project initialization and setup

Importing all of the libraries that will be used. In the project.

In [34]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Display options (make this clearer)

In [35]:

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

Explain what this is

In [36]:
DB_PATH = "viewer_interactions.db"

try:
    conn = sqlite3.connect(DB_PATH)
    print("Connected successfully!")
except sqlite3.Error as e:
    print("Connection failed:", e)

Connected successfully!


Listing all the tables

In [37]:
tables_query = """
               SELECT name
               FROM sqlite_master
               WHERE type='table'
               ORDER BY name; \
               """

tables_df = pd.read_sql_query(tables_query, conn)
print("Tables in the database:")
display(tables_df)

Tables in the database:


Unnamed: 0,name
0,data_dictionary
1,movie_statistics
2,movies
3,user_statistics
4,viewer_ratings


In [38]:
table_names = tables_df["name"].tolist()

schemas = {}

for table in table_names:
    pragma_query = f"PRAGMA table_info({table});"
    schema_df = pd.read_sql_query(pragma_query, conn)
    schemas[table] = schema_df
    print(f"\nSchema for table '{table}':")
    display(schema_df)


Schema for table 'data_dictionary':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,table_name,TEXT,0,,0
1,1,field_name,TEXT,0,,0
2,2,data_type,TEXT,0,,0
3,3,description,TEXT,0,,0



Schema for table 'movie_statistics':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,total_ratings,REAL,0,,0
2,2,avg_rating,REAL,0,,0
3,3,std_rating,REAL,0,,0
4,4,min_rating,REAL,0,,0
5,5,max_rating,REAL,0,,0
6,6,unique_users,REAL,0,,0
7,7,first_rating_date,TEXT,0,,0
8,8,last_rating_date,TEXT,0,,0
9,9,year_of_release,REAL,0,,0



Schema for table 'movies':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,year_of_release,REAL,0,,0
2,2,title,TEXT,0,,0



Schema for table 'user_statistics':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,customer_id,INTEGER,0,,0
1,1,total_ratings,REAL,0,,0
2,2,avg_rating,REAL,0,,0
3,3,std_rating,REAL,0,,0
4,4,min_rating,REAL,0,,0
5,5,max_rating,REAL,0,,0
6,6,unique_movies,REAL,0,,0
7,7,first_rating_date,TEXT,0,,0
8,8,last_rating_date,TEXT,0,,0
9,9,activity_days,REAL,0,,0



Schema for table 'viewer_ratings':


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,movie_id,INTEGER,0,,0
1,1,customer_id,INTEGER,0,,0
2,2,rating,REAL,0,,0
3,3,date,TEXT,0,,0
4,4,anomalous_date,INTEGER,0,,0


Creating a dictionary of type table_name -> DataFrame

In [39]:
# dfs = {
#    "interactions": DataFrame with columns [user_id, movie_id, rating, timestamp, ...],
#    "movies":       DataFrame with columns [movie_id, title, genres, year, ...],
#    "users":        DataFrame with columns [user_id, age, country, ...]
# }

Data frame shape where shape is the number of rows and the second number is the number of columns. We are specifically grabbing the names of the sets of tables

In [40]:
dfs = {}

for t in table_names:
    df = pd.read_sql_query(f"SELECT * FROM {t};", conn)
    dfs[t] = df
    print(f"\nLoaded table '{t}' with shape {df.shape}")


Loaded table 'data_dictionary' with shape (31, 4)

Loaded table 'movie_statistics' with shape (16015, 11)

Loaded table 'movies' with shape (18008, 3)

Loaded table 'user_statistics' with shape (438780, 10)

Loaded table 'viewer_ratings' with shape (4025000, 5)


Example of using the dfs dictionary

In [41]:
movies = dfs['movies']
movie = movies[movies['year_of_release'] == 2002]
print(movie)


       movie_id  year_of_release                                       title
20           21           2002.0                           Strange Relations
27           28           2002.0                             Lilo and Stitch
41           42           2002.0                      Searching for Paradise
50           51           2002.0  Jonah: A VeggieTales Movie: Bonus Material
51           52           2002.0                     The Weather Underground
...         ...              ...                                         ...
17865     18477           2002.0                                 First Woman
17916     20084           2002.0                               Silent Master
17941     20713           2002.0                                  My Warrior
17950     19615           2002.0                                   Dead Life
18001     19738           2002.0                                First Island

[1318 rows x 3 columns]


Giacomo thing

In [42]:
    tables = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';",
        conn
    )['name'].tolist()

    print("=== DATA DICTIONARY ===\n")

    for table in tables:
        print(f"Table: {table}")
        print("-" * (7 + len(table)))

        # Get actual column info from PRAGMA but filter to nice output
        schema = pd.read_sql(f"PRAGMA table_info('{table}')", conn)

        # Keep only real schema fields you want (remove cid, default, pk if desired)
        clean_schema = schema[['name', 'type', ]]

        print(clean_schema.to_string(index=False))
        print("\n")

=== DATA DICTIONARY ===

Table: viewer_ratings
---------------------
          name    type
      movie_id INTEGER
   customer_id INTEGER
        rating    REAL
          date    TEXT
anomalous_date INTEGER


Table: movies
-------------
           name    type
       movie_id INTEGER
year_of_release    REAL
          title    TEXT


Table: user_statistics
----------------------
             name    type
      customer_id INTEGER
    total_ratings    REAL
       avg_rating    REAL
       std_rating    REAL
       min_rating    REAL
       max_rating    REAL
    unique_movies    REAL
first_rating_date    TEXT
 last_rating_date    TEXT
    activity_days    REAL


Table: movie_statistics
-----------------------
             name    type
         movie_id INTEGER
    total_ratings    REAL
       avg_rating    REAL
       std_rating    REAL
       min_rating    REAL
       max_rating    REAL
     unique_users    REAL
first_rating_date    TEXT
 last_rating_date    TEXT
  year_of_release    RE

In [43]:
for name, df in dfs.items():
    print(f"\n{name} missing values (%):")
    missing_pct = df.isna().mean() * 100
    display(missing_pct.to_frame("missing_%"))


data_dictionary missing values (%):


Unnamed: 0,missing_%
table_name,0.0
field_name,0.0
data_type,0.0
description,0.0



movie_statistics missing values (%):


Unnamed: 0,missing_%
movie_id,0.0
total_ratings,4.995317
avg_rating,4.995317
std_rating,57.246332
min_rating,4.995317
max_rating,4.995317
unique_users,4.995317
first_rating_date,0.0
last_rating_date,0.0
year_of_release,28.167343



movies missing values (%):


Unnamed: 0,missing_%
movie_id,0.0
year_of_release,0.038872
title,0.0



user_statistics missing values (%):


Unnamed: 0,missing_%
customer_id,0.0
total_ratings,5.0
avg_rating,5.0
std_rating,24.215324
min_rating,5.0
max_rating,5.0
unique_movies,5.0
first_rating_date,0.0
last_rating_date,0.0
activity_days,5.0



viewer_ratings missing values (%):


Unnamed: 0,missing_%
movie_id,0.0
customer_id,0.0
rating,10.0
date,0.0
anomalous_date,99.969963


Counting all missing values, diagnostics purposes only

A function to calculate missing std. ratings of films

In [44]:
def compute_film_std(df):
    film_stats = (
        df.groupby('movie_id')['rating']
        .apply(list)
        .reset_index(name='ratings')
    )

    def manual_std(ratings):
        ratings = np.array(ratings)
        n = len(ratings)
        if n <= 1:
            return 0.0
        mean = ratings.mean()
        return np.sqrt(((ratings - mean) ** 2).mean())

    film_stats['std_rating'] = film_stats['ratings'].apply(manual_std)

    return film_stats[['movie_id', 'std_rating']]

# Compute std for all films
viewer_ratings = dfs['viewer_ratings']
film_std = compute_film_std(viewer_ratings)

# Load movies_statistics
movies_stats = dfs["movie_statistics"]

# Compute the old percentage before merging
old_null_pct = dfs["movie_statistics"]["std_rating"].isna().mean() * 100

# Merge new std values
movies_stats = movies_stats.merge(
    film_std,
    on="movie_id",
    how="left",
    suffixes=("", "_new")
)

# Replace old std_rating with the new one
movies_stats["std_rating"] = movies_stats["std_rating_new"]
movies_stats.drop(columns=["std_rating_new"], inplace=True)

# Save updated table
dfs["movie_statistics"] = movies_stats

# Compute new percentage ---
new_null_pct = movies_stats["std_rating"].isna().mean() * 100

# absolute improvement (percentage points)
improvement_abs = old_null_pct - new_null_pct

# relative improvement (how many percent of the original NaNs we removed)
improvement_rel = (improvement_abs / old_null_pct) * 100 if old_null_pct > 0 else 0

print(f"Missing values reduced from {old_null_pct:.2f}% to {new_null_pct:.2f}%.")
print(f"Absolute improvement: {improvement_abs:.2f}%")
print(f"Relative improvement: {improvement_rel:.2f}% better than before.")

Missing values reduced from 57.25% to 13.62%.
Absolute improvement: 43.63%
Relative improvement: 76.21% better than before.


In [45]:
#Figure out how to drop na values in general
movie_stats = dfs['movie_statistics']
#print(movie_stats[movie_stats['std_rating'] == pd.isnull(movie_stats['std_rating'])])
#movie_stats = movie_stats[movie_stats['std_rating'].notna()]
#print(movie_stats)

print(f"Before cleaning: {len(movie_stats)} movies")
movie_stats = movie_stats.dropna(subset=['std_rating'])
dfs['movie_statistics'] = movie_stats
print(f"After removing single-rating movies: {len(movie_stats)} movies")

Before cleaning: 16015 movies
After removing single-rating movies: 13834 movies


Marco's

In [None]:
#MAPK here

Merging the Movies and movie statis filling in missing values on either dataset and converting all of the dates to type DateTime.
This is in order to clean our movie data before merging it with our user data.

- Converts the date parameter in viewer_ratings to datetime.
- Merges viewer_ratings, movies, movie_statistics and user_statistics into one dataset as merged_data.

In [46]:
rating_data = pd.read_sql("SELECT * FROM viewer_ratings", conn)
movie_data = pd.read_sql("SELECT * FROM movies", conn)
user_data = pd.read_sql("SELECT * FROM user_statistics", conn)
movie_statistics = pd.read_sql("SELECT * FROM movie_statistics", conn)

rating_data['date'] = pd.to_datetime(rating_data['date'], errors = 'coerce')
rating_data.dtypes
movie_statistics['first_rating_date'] = 

merged_data = rating_data.merge(movie_data, on = 'movie_id', how = 'left')
merged_data = merged_data.merge(user_data, on = 'customer_id', how = 'left')
merged_data = merged_data.merge(movie_statistics, on = 'movie_id', how = 'left')
# avg rating standard rating mean rating
'''
want to merge
- title
- year of release
want to keep independent
- avg rating
- std rating
- min rating
- max rating
- first rating date
- last rating date
'''

print("Total columns:", len(merged_data.columns))
list(merged_data.columns)

SyntaxError: invalid syntax (3309295080.py, line 8)