In [1]:
# Setup imports and sys.path
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from src.utils import load_data

In [2]:
movies, ratings, tags, links = load_data('../data')

# Display basic info
print("Movies shape:", movies.shape)
print("Ratings shape:", ratings.shape)
print("Tags shape:", tags.shape)
print("Links shape:", links.shape)

display(movies.head())
display(ratings.head())
display(tags.head())
display(links.head())
# Display data types
print("Movies dtypes:\n", movies.dtypes)
print("Ratings dtypes:\n", ratings.dtypes)
print("Tags dtypes:\n", tags.dtypes)
print("Links dtypes:\n", links.dtypes)


Movies shape: (9742, 3)
Ratings shape: (100836, 4)
Tags shape: (3683, 4)
Links shape: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Movies dtypes:
 movieId     int64
title      object
genres     object
dtype: object
Ratings dtypes:
 userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object
Tags dtypes:
 userId        int64
movieId       int64
tag          object
timestamp     int64
dtype: object
Links dtypes:
 movieId      int64
imdbId       int64
tmdbId     float64
dtype: object


In [3]:
# Unique counts
n_movies = movies['movieId'].nunique()
n_users = ratings['userId'].nunique()
n_ratings = len(ratings)
n_genres = len(set(g for genres in movies['genres'].str.split(' ') for g in genres if g))
n_tags = tags['tag'].nunique() if not tags.empty else 0

print(f"Unique movies: {n_movies}")
print(f"Unique users: {n_users}")
print(f"Total ratings: {n_ratings}")
print(f"Unique genres: {n_genres}")
print(f"Unique tags: {n_tags}")

# Rating statistics
print("\nRating statistics:")
print(ratings['rating'].describe())
print(ratings['rating'].value_counts())



Unique movies: 9742
Unique users: 610
Total ratings: 100836
Unique genres: 22
Unique tags: 1589

Rating statistics:
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64
rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64


In [4]:
# Missing values
print("Missing values in movies:")
print(movies.isnull().sum())
print("\nMissing values in ratings:")
print(ratings.isnull().sum())
print("\nMissing values in tags:")
print(tags.isnull().sum() if not tags.empty else "No tags data")

# Duplicates
print("\nDuplicate ratings (userId, movieId):")
print(ratings.duplicated(['userId', 'movieId']).sum())
print("Duplicate tags (userId, movieId, tag):")
print(tags.duplicated(['userId', 'movieId', 'tag']).sum() if not tags.empty else "No tags data")

Missing values in movies:
movieId    0
title      0
genres     0
dtype: int64

Missing values in ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Missing values in tags:
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

Duplicate ratings (userId, movieId):
0
Duplicate tags (userId, movieId, tag):
0


In [5]:
# Rating Distribution
fig_ratings = px.histogram(
    ratings,
    x='rating',
    nbins=10,
    title="Distribution of Ratings",
    labels={'rating': 'Rating', 'count': 'Frequency'},
    color_discrete_sequence=['#636EFA']
)
fig_ratings.show()

In [6]:
# Genre Frequency
genre_counts = pd.Series([
    genre for genres in movies['genres'].str.split(' ') for genre in genres if genre
]).value_counts()
fig_genres = px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    title="Genre Frequency",
    labels={'x': 'Genre', 'y': 'Count'},
    color=genre_counts.index,
    color_discrete_sequence=px.colors.qualitative.Bold
)
fig_genres.update_layout(xaxis_tickangle=45)
fig_genres.show()