# Introduction

### Project initialization and setup

Import all of the libraries that will be used in the project.

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Configure the pandas display options: show up to 100 columns and use a display width of 120 characters.

In [None]:

# Raise the column-count and line-width limits pandas uses when rendering output.
for option_name, option_value in (
    ("display.max_columns", 100),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)

Open a connection to the SQLite database stored in `viewer_interactions.db`.

In [None]:
# Path of the SQLite database file used by this notebook.
DB_PATH = "viewer_interactions.db"

try:
    # Open through a URI with mode=rw so that a wrong path raises an error.
    # A plain sqlite3.connect(DB_PATH) would silently create a new, empty
    # database file and still report success.
    conn = sqlite3.connect(f"file:{DB_PATH}?mode=rw", uri=True)
    print("Connected successfully!")
except sqlite3.Error as e:
    print("Connection failed:", e)
    # Re-raise: the following cells all need `conn`, so carrying on would only
    # surface later as a NameError that hides the real cause.
    raise

Listing all the tables

In [None]:
# List the tables stored in the database. SQLite's internal tables (their names
# start with "sqlite_") are excluded so only the project's own tables are listed.
tables_query = """
    SELECT name
    FROM sqlite_master
    WHERE type = 'table'
      AND name NOT LIKE 'sqlite_%'
    ORDER BY name;
"""

tables_df = pd.read_sql_query(tables_query, conn)
print("Tables in the database:")
display(tables_df)

In [None]:
# Table names as a plain list, in the order returned by the listing query.
table_names = tables_df["name"].tolist()

# Maps table name -> DataFrame returned by PRAGMA table_info for that table.
schemas = {}

for table in table_names:
    # Double-quote the identifier (doubling any embedded quote) so that names
    # containing spaces, punctuation or SQL keywords do not break the statement.
    quoted_table = '"' + table.replace('"', '""') + '"'
    pragma_query = f"PRAGMA table_info({quoted_table});"
    schema_df = pd.read_sql_query(pragma_query, conn)
    schemas[table] = schema_df
    print(f"\nSchema for table '{table}':")
    display(schema_df)

Creating a dictionary of type table_name -> DataFrame

In [None]:
# dfs = {
#    "interactions": DataFrame with columns [user_id, movie_id, rating, timestamp, ...],
#    "movies":       DataFrame with columns [movie_id, title, genres, year, ...],
#    "users":        DataFrame with columns [user_id, age, country, ...]
# }

Load every table into a DataFrame and store it in `dfs`, keyed by table name. The printed shape is `(rows, columns)`: the first number is the number of rows and the second is the number of columns.

In [None]:
# Maps table name -> DataFrame holding all rows of that table.
dfs = {}

for t in table_names:
    # Double-quote the identifier (doubling any embedded quote) so that names
    # containing spaces, punctuation or SQL keywords do not break the query.
    quoted_table = '"' + t.replace('"', '""') + '"'
    df = pd.read_sql_query(f"SELECT * FROM {quoted_table};", conn)
    dfs[t] = df
    print(f"\nLoaded table '{t}' with shape {df.shape}")

Example of using the dfs dictionary

In [None]:
def search_by_parameter(table_name, key, value, frames=None):
    """Return the rows of a loaded table whose column ``key`` matches ``value``.

    Parameters
    ----------
    table_name : str
        Key of the table in the frames mapping.
    key : str
        Name of the column to filter on.
    value : object
        Value to look for. ``None`` or a scalar missing value (``NaN``,
        ``pd.NA``, ``NaT``) selects the rows where the column is missing,
        because an ``==`` comparison never matches a missing value.
    frames : dict, optional
        Mapping of table name -> DataFrame to search in. When omitted, the
        notebook-level ``dfs`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows, selected by boolean mask (row labels unchanged).

    Raises
    ------
    KeyError
        If ``table_name`` is not in the mapping or ``key`` is not a column.
    """
    source = dfs if frames is None else frames
    df = source[table_name]

    # is_scalar guards pd.isna, which returns an array for list-like input.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return df[df[key].isna()]

    return df[df[key] == value]

Data dictionary (Giacomo): print the name and type of every column in each table.

In [None]:
    # NOTE(review): this cell is indented one level although it is not inside
    # any block — presumably a paste artefact; confirm and dedent.

    # Names of all tables except those whose name matches 'sqlite_%'.
    table_listing = pd.read_sql(
        "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';",
        conn
    )
    tables = table_listing['name'].tolist()

    print("=== DATA DICTIONARY ===\n")

    for table in tables:
        heading = f"Table: {table}"
        print(heading)
        # Underline as long as the heading ("Table: " is 7 characters).
        print("-" * len(heading))

        # PRAGMA table_info returns one row per column of the table.
        schema = pd.read_sql(f"PRAGMA table_info('{table}')", conn)

        # Only the column name and declared type are shown.
        wanted_fields = ['name', 'type']
        clean_schema = schema[wanted_fields]

        print(clean_schema.to_string(index=False))
        print("\n")

In [None]:
# For every loaded table, show the share of missing values per column (percent).
for name, df in dfs.items():
    missing_pct = df.isna().mean().mul(100)
    print(f"\n{name} missing values (%):")
    display(missing_pct.to_frame(name="missing_%"))

Percentage of missing values in every column of each table, for diagnostic purposes only.

A function to calculate missing std. ratings of films

In [None]:
def compute_film_std(df):
    """Compute the population standard deviation of the ratings of every film.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with at least the columns ``movie_id`` and ``rating``.
        ``rating`` must be numeric.

    Returns
    -------
    pandas.DataFrame
        One row per ``movie_id`` with the columns ``movie_id`` and
        ``std_rating``. The deviation divides by n (``ddof=0``), so a film
        with a single rating gets 0.0. Missing ratings are skipped instead of
        turning the whole film's result into NaN; a film whose ratings are all
        missing gets NaN.
    """
    return (
        df.groupby('movie_id')['rating']
        .std(ddof=0)
        .reset_index(name='std_rating')
    )

# Compute std for all films
viewer_ratings = dfs['viewer_ratings']
film_std = compute_film_std(viewer_ratings)

# Share of missing std_rating values before the update (percent)
old_null_pct = dfs["movie_statistics"]["std_rating"].isna().mean() * 100

# Put the recomputed values next to the stored ones as "std_rating_new"
movies_stats = dfs["movie_statistics"].merge(
    film_std,
    on="movie_id",
    how="left",
    suffixes=("", "_new")
)

# Prefer the recomputed value, but keep the stored one when nothing could be
# recomputed (the left merge yields NaN for films without rows in
# viewer_ratings). A plain overwrite would turn such valid values into NaN.
# NOTE(review): stored values may have been computed with a different ddof than
# compute_film_std — confirm they are comparable.
movies_stats["std_rating"] = movies_stats["std_rating_new"].fillna(movies_stats["std_rating"])
movies_stats = movies_stats.drop(columns=["std_rating_new"])

# Save updated table
dfs["movie_statistics"] = movies_stats

# Share of missing std_rating values after the update (percent)
new_null_pct = movies_stats["std_rating"].isna().mean() * 100

# absolute improvement (percentage points)
improvement_abs = old_null_pct - new_null_pct

# relative improvement (how many percent of the original NaNs we removed);
# guarded so that a table without missing values does not divide by zero
improvement_rel = (improvement_abs / old_null_pct) * 100 if old_null_pct > 0 else 0

print(f"Missing values reduced from {old_null_pct:.2f}% to {new_null_pct:.2f}%.")
print(f"Absolute improvement: {improvement_abs:.2f}%")
print(f"Relative improvement: {improvement_rel:.2f}% better than before.")

In [None]:
# TODO: figure out a general strategy for dropping NA values.
# For now, drop the movies whose std_rating is still missing.
movie_stats = dfs['movie_statistics']

print(f"Before cleaning: {len(movie_stats)} movies")
movie_stats = movie_stats.dropna(subset=['std_rating'])
dfs['movie_statistics'] = movie_stats
# The rows removed are those with a missing std_rating; films with a single
# rating have a std_rating of 0.0 and are therefore kept.
print(f"After removing movies without std_rating: {len(movie_stats)} movies")

Calculating the missing total_ratings for movies

In [None]:
# Work on a copy so the frame stored in `dfs` is replaced once, at the end,
# instead of being modified in place.
movie_stats = dfs["movie_statistics"].copy()
viewer_ratings = dfs["viewer_ratings"]

# Number of rows in viewer_ratings for every movie_id.
rating_counts = viewer_ratings["movie_id"].value_counts()

# Fill only the missing total_ratings; a movie without any row in
# viewer_ratings gets a count of 0.
needs_count = movie_stats["total_ratings"].isna()
movie_stats.loc[needs_count, "total_ratings"] = (
    movie_stats.loc[needs_count, "movie_id"].map(rating_counts).fillna(0)
)

dfs["movie_statistics"] = movie_stats

In [None]:
# Work on a copy so the frame stored in `dfs` is replaced once, at the end,
# instead of being modified in place.
movie_stats = dfs["movie_statistics"].copy()
viewer_ratings = dfs["viewer_ratings"]

# Movies that lack min_rating, max_rating, or both.
has_gap = movie_stats["min_rating"].isna() | movie_stats["max_rating"].isna()
missing_ids = set(movie_stats.loc[has_gap, "movie_id"])

# Only the ratings of those movies are needed.
relevant_ratings = viewer_ratings[viewer_ratings["movie_id"].isin(missing_ids)]

# Lowest and highest rating per movie; missing ratings are skipped.
rating_range = relevant_ratings.groupby("movie_id")["rating"].agg(["min", "max"])

# Fill only the gaps: existing values are left untouched, and movies without
# any rating in viewer_ratings stay missing.
for column, stat in (("min_rating", "min"), ("max_rating", "max")):
    computed = movie_stats["movie_id"].map(rating_range[stat])
    movie_stats[column] = movie_stats[column].fillna(computed)

dfs["movie_statistics"] = movie_stats

Calculating missing min and max ratings for films


Merging the movies and movie statistics, filling in missing values on either dataset, and converting all of the dates to type DateTime.
This is in order to clean our movie data before merging it with our user data.

- Converts the date parameter in viewer_ratings to datetime.
- Merges viewer_ratings, movies, movie_statistics and user_statistics into one dataset as merged_data.

In [None]:
# NOTE(review): the tables are read from the database again here, so values
# changed in `dfs` by earlier cells are not part of this merge — confirm that
# this is intended.
rating_data = pd.read_sql("SELECT * FROM viewer_ratings", conn)
movie_data = pd.read_sql("SELECT * FROM movies", conn)
user_data = pd.read_sql("SELECT * FROM user_statistics", conn)
movie_statistics = pd.read_sql("SELECT * FROM movie_statistics", conn)

# Parse the date columns; values that cannot be parsed become NaT.
rating_data['date'] = pd.to_datetime(rating_data['date'], errors = 'coerce')

# The first_rating_date assignment previously had no right-hand side, which is
# a SyntaxError. last_rating_date is handled the same way when present.
# NOTE(review): assumes both columns hold date strings — confirm.
for date_column in ('first_rating_date', 'last_rating_date'):
    if date_column in movie_statistics.columns:
        movie_statistics[date_column] = pd.to_datetime(
            movie_statistics[date_column], errors = 'coerce'
        )

# Left joins keep every rating even when the movie or customer has no match.
merged_data = rating_data.merge(movie_data, on = 'movie_id', how = 'left')
merged_data = merged_data.merge(user_data, on = 'customer_id', how = 'left')
merged_data = merged_data.merge(movie_statistics, on = 'movie_id', how = 'left')

# avg rating standard rating mean rating
# want to merge
# - title
# - year of release
# want to keep independent
# - avg rating
# - std rating
# - min rating
# - max rating
# - first rating date
# - last rating date

print("Total columns:", len(merged_data.columns))
list(merged_data.columns)