# Movie ratings analysis with a small dataset (100K ratings)

### First we will explore this dataset with pandas, then we will compare the computation performance of pandas and pyarrow

In [1]:
import time
from functools import wraps
from timeit import timeit

import pandas as pd
import pyarrow as pa
from pyarrow import csv, compute

In [17]:
# Locations of the two CSV inputs, given relative to the working directory.
ratings_file = "./small_dataset/ratings.csv"
movies_file = "./small_dataset/movies.csv"

In [18]:
# Load both CSV files into DataFrames using pandas' default parsing options.
ratings_df = pd.read_csv(ratings_file)
movies_df = pd.read_csv(movies_file)

### Print dataframes to see the data structure

In [19]:
# Bare expression on the last line of the cell: the notebook renders the frame as the cell output.
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [20]:
# Bare expression on the last line of the cell: the notebook renders the frame as the cell output.
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


## Exploration with pandas

### Question 1: How many adventure movies are there?

In [27]:
# Boolean mask: True for every row whose "genres" string contains "Adventure".
is_adventure = movies_df["genres"].str.contains("Adventure")
# Counting the selected rows gives the number of adventure movies.
adventure_movies_nb = len(movies_df[is_adventure])
print(f"There are {adventure_movies_nb} movies in category Adventure")

There are 1263 movies in category Adventure


### Question 2: How many movies are there in each category?

In [58]:
# Count how many movies carry each genre label. A movie tagged with several
# genres ("A|B|C") contributes one count to each of them.
# The dict keeps genres in order of first appearance in the file.
movies_by_category = {}
# Iterate over the "genres" column directly: iterrows() would build a full
# Series for every row only to read this single field.
for genres in movies_df["genres"]:
    for category in genres.split("|"):
        movies_by_category[category] = movies_by_category.get(category, 0) + 1

for category, count in movies_by_category.items():
    print(f"{category}: {count}")

Adventure: 1263
Animation: 611
Children: 664
Comedy: 3756
Fantasy: 779
Romance: 1596
Drama: 4361
Action: 1828
Crime: 1199
Thriller: 1894
Horror: 978
Mystery: 573
Sci-Fi: 980
War: 382
Musical: 334
Documentary: 440
IMAX: 158
Western: 167
Film-Noir: 87
(no genres listed): 34


### Question 3: Calculate the mean rating per user (plus standard deviation and variance)

In [70]:
# Group the ratings by user, keeping only the "rating" column (as a one-column
# frame), then average within each group: one row per userId.
ratings_by_user = ratings_df.groupby("userId")[["rating"]]
user_ratings_mean = ratings_by_user.mean()
user_ratings_mean

Unnamed: 0_level_0,rating
userId,Unnamed: 1_level_1
1,4.366379
2,3.948276
3,2.435897
4,3.555556
5,3.636364
...,...
606,3.657399
607,3.786096
608,3.134176
609,3.270270


In [71]:
# Per-user mean, standard deviation and variance of the ratings, computed in a
# single aggregation pass (one result column per statistic).
user_ratings_stats = (
    ratings_df
    .groupby("userId")[["rating"]]
    .agg(["mean", "std", "var"])
)
user_ratings_stats

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,mean,std,var
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,4.366379,0.800048,0.640077
2,3.948276,0.805615,0.649015
3,2.435897,2.090642,4.370783
4,3.555556,1.314204,1.727132
5,3.636364,0.990441,0.980973
...,...,...,...
606,3.657399,0.724121,0.524351
607,3.786096,0.965657,0.932494
608,3.134176,1.079262,1.164807
609,3.270270,0.450225,0.202703


### Question 4: Calculate the score of each movie

In [8]:
def timing(f):
    """Decorator that prints how long each call to ``f`` takes, in milliseconds.

    The wrapped function is called with the original arguments and its return
    value is passed through unchanged. After each successful call one line of
    the form ``<name> function took <elapsed> ms`` is printed; if ``f`` raises,
    the exception propagates and nothing is printed.

    Args:
        f: the callable to time.

    Returns:
        A wrapper that accepts the same arguments as ``f`` and keeps ``f``'s
        ``__name__`` and docstring.
    """
    @wraps(f)  # keep f's name/docstring on the wrapper instead of "wrap"
    def wrap(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # short durations; time.time() can jump when the system clock is adjusted.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print('{:s} function took {:.3f} ms'.format(f.__name__, elapsed_ms))

        return ret
    return wrap