## Step 1: Import all Dependencies

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Step 2: Load in Data Sets

In [44]:
ratings = pd.read_csv('./data/ratings.csv')

In [45]:
movies = pd.read_csv('./data/movies.csv')

In [46]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [47]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Step 3: Exploratory Data Analysis

In [48]:
n_ratings = len(ratings)
n_movies = len(movies)
n_users = ratings['userId'].nunique()

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average number of ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average number of ratings per movie: {round(n_ratings/n_movies, 2)}")

Number of ratings: 100836
Number of unique movieId's: 9742
Number of unique users: 610
Average number of ratings per user: 165.3
Average number of ratings per movie: 10.35


This data is not entierly realisitic because the data is usually more sparce. 165.3 ratings per user is very dense.

In [49]:
mean_global_rating = round(ratings['rating'].mean(), 2)
print(f"Mean global rating:", mean_global_rating)

Mean global rating: 3.5


In [50]:
mean_rating_per_user = ratings.groupby('userId')['rating'].mean()
print(f"Mean rating per user:", round(mean_rating_per_user.mean(), 2))
#average of the average rating given by each user.

Mean rating per user: 3.66


### Which movies are most frequently rated?




In [51]:
ratings['movieId'].value_counts()

movieId
356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: count, Length: 9724, dtype: int64

Not that informative since we dont know what move '359' corresponds to. Hence we can merge the ratings and movie data frames.

In [52]:
movie_ratings = ratings.merge(movies, on='movieId')


In [53]:
movie_ratings['title'].value_counts()[0:10]

title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
Name: count, dtype: int64

In [54]:
mean_ratings = ratings.groupby('movieId')[['rating']].mean()
mean_ratings

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.920930
2,3.431818
3,3.259615
4,2.357143
5,3.071429
...,...
193581,4.000000
193583,3.500000
193585,3.500000
193587,3.500000


In [63]:
#highest rated movie
mean_ratings_per_movie = movie_ratings.groupby('movieId', as_index=False)['rating'].mean()
mean_ratings_per_movie

Unnamed: 0,movieId,rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [None]:
highest_rated = mean_ratings_per_movie['rating'].idxmax()
movies[movies['movieId'] == highest_rated]

Unnamed: 0,movieId,title,genres
44,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance


In [67]:
lowest_rated = mean_ratings_per_movie['rating'].idxmin()
movies[movies['movieId'] == lowest_rated]

Unnamed: 0,movieId,title,genres
