In [None]:
import io
import os
import pathlib
import requests
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Downloading the data

In [None]:
ROOT_DIR = pathlib.Path("..")

DATASETS_URL = "https://github.com/KAUST-Academy/python-for-data-analysis/raw/november-2022/datasets.zip"

# Upper bound (seconds) on waiting for the server, so a stalled connection
# cannot hang the notebook indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 60

response = requests.get(DATASETS_URL, timeout=DOWNLOAD_TIMEOUT_SECONDS)
# Surface HTTP errors (404, 5xx, ...) here; otherwise the error page body would
# be handed to ZipFile and fail with a much less helpful BadZipFile.
response.raise_for_status()

# The archive is unpacked straight from memory into ROOT_DIR; the context
# manager closes it once extraction is done.
with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
    archive.extractall(ROOT_DIR)


In [None]:
# Locations of the dataset folders, expressed relative to ROOT_DIR.
DATASETS_DIR = ROOT_DIR.joinpath("datasets")
DATASET_DIR = DATASETS_DIR.joinpath("movielens")

## Loading the data

In [None]:
!cat $DATASET_DIR/users.dat | head

In [None]:
# IPython help syntax: show the docstring of pd.read_table.
pd.read_table?

In [None]:
# The three files are read with the same options: no header row and "::" as
# the field separator. Separators longer than one character require the
# Python parsing engine.
dat_format = {"sep": "::", "header": None, "engine": "python"}

# Column names for each file, supplied here because the files have no header.
unames = ["user_id", "gender", "age", "occupation", "zip"]
rnames = ["user_id", "movie_id", "rating", "timestamp"]
mnames = ["movie_id", "title", "genres"]

users = pd.read_table(DATASET_DIR / "users.dat", names=unames, **dat_format)
ratings = pd.read_table(DATASET_DIR / "ratings.dat", names=rnames, **dat_format)
movies = pd.read_table(DATASET_DIR / "movies.dat", names=mnames, **dat_format)

In [None]:
# Column dtypes and non-null counts of the users table.
users.info()

In [None]:
# First five users (five is the default row count for head).
users.head()

In [None]:
# Column dtypes and non-null counts of the ratings table.
ratings.info()

In [None]:
# First five ratings (five is the default row count for head).
ratings.head()

In [None]:
# Column dtypes and non-null counts of the movies table.
movies.info()

In [None]:
# First five movies (five is the default row count for head).
movies.head()


## Combining the data

In [None]:
# IPython help syntax: show the docstring of pd.merge.
pd.merge?

In [None]:
# Attach user attributes and movie attributes to every rating.
# The join keys are spelled out instead of being inferred from whichever
# column names the frames happen to share, and `validate` raises if a rating
# could match more than one user or more than one movie (which would silently
# duplicate ratings).
data = pd.merge(
    pd.merge(ratings, users, on="user_id", validate="many_to_one"),
    movies,
    on="movie_id",
    validate="many_to_one",
)

In [None]:
# Column dtypes and non-null counts of the combined table.
data.info()

In [None]:
# First rows of the combined ratings/users/movies table.
data.head()

## Exploring the data

### Average ratings by gender

In [None]:
# One row per title, one column per gender value; each cell is the mean rating.
mean_ratings = data.pivot_table(values="rating", index="title",
                                columns="gender", aggfunc="mean")

In [None]:
# First five titles of the per-gender mean ratings.
mean_ratings.head()

### Considering only movies with sufficient ratings

In [None]:
# Number of rows in `data` per title, i.e. how many ratings each title has.
ratings_by_title = data.groupby("title").size()

In [None]:
# First few per-title rating counts.
ratings_by_title.head()

In [None]:
# Minimum number of ratings a title needs in order to be kept in the
# per-title comparisons; named so the threshold is easy to find and tune.
MIN_RATINGS = 250

# Index of the titles that meet the threshold.
active_titles = ratings_by_title.index[ratings_by_title >= MIN_RATINGS]

In [None]:
# Keep only the rows for titles in `active_titles`. The name is rebound, so
# from here on `mean_ratings` refers to the filtered table.
mean_ratings = mean_ratings.loc[active_titles]

In [None]:
# First rows of the filtered per-gender mean ratings.
mean_ratings.head()

### Top rated movies by gender

In [None]:
# Titles ordered from highest to lowest mean rating in the "F" column.
top_female_ratings = mean_ratings.sort_values(by="F", ascending=False)

In [None]:
# Titles with the highest mean rating in the "F" column.
top_female_ratings.head()

In [None]:
# Titles ordered from highest to lowest mean rating in the "M" column.
top_male_ratings = mean_ratings.sort_values(by="M", ascending=False)

In [None]:
# Titles with the highest mean rating in the "M" column.
top_male_ratings.head()

### Movies with the largest gender difference in ratings

In [None]:
# "diff" is the M mean minus the F mean, so positive values are titles with a
# higher mean in the "M" column. The column is added to mean_ratings in place.
mean_ratings["diff"] = mean_ratings["M"].sub(mean_ratings["F"])

In [None]:
# Ascending sort: the smallest "diff" values come first.
sorted_by_diff = mean_ratings.sort_values(by="diff")
sorted_by_diff.head(5)

In [None]:
# The sort is ascending, so the largest "diff" values sit at the end.
sorted_by_diff.tail()

### Most divisive movies

In [None]:
# Standard deviation of the ratings for each title, restricted to the titles
# listed in `active_titles`.
rating_std_by_title = (
    data.groupby("title")["rating"]
        .std()
        .loc[active_titles]
)

In [None]:
# First few per-title rating standard deviations.
rating_std_by_title.head()

In [None]:
# Ten titles with the highest rating standard deviation.
rating_std_by_title.sort_values(ascending=False).iloc[:10]

### Movie ratings by genre and age

In [None]:
# Raw "genres" strings for the first few movies.
movies["genres"].head()

In [None]:
# Preview of splitting the "genres" strings on "|" into lists.
movies["genres"].str.split("|").head()

In [None]:
# Replace the "genres" string column with a "genre" column holding the list
# obtained by splitting each string on "|". `pop` removes "genres" from
# `movies` in place, so without the guard a second run of this cell would
# raise KeyError; with it, re-running leaves the converted frame untouched.
if "genres" in movies.columns:
    movies["genre"] = movies.pop("genres").str.split("|")

In [None]:
# First rows of `movies`, which now carries the list-valued "genre" column.
movies.head()

In [None]:
# Expand each list in "genre" into one row per element; the other columns are
# repeated for every element of the list.
movies_exploded = movies.explode("genre")

In [None]:
# First ten rows of the exploded table.
movies_exploded.head(n=10)

In [None]:
# Combine the exploded movies with their ratings and the rating users.
# Join keys are spelled out instead of being inferred from shared column
# names. The first join is left unvalidated because a movie_id can repeat on
# both sides (once per genre in movies_exploded, once per rating in ratings);
# the second asserts that each row matches at most one user.
ratings_with_genre = pd.merge(
    pd.merge(movies_exploded, ratings, on="movie_id"),
    users,
    on="user_id",
    validate="many_to_one",
)

In [None]:
# Inspect the first row of the merged table as a Series (one entry per column).
ratings_with_genre.iloc[0]

In [None]:
# Mean rating for every (genre, age) pair, then pivoted so that the age values
# become columns. Selecting [["rating"]] keeps the result a DataFrame.
genre_ratings = (
    ratings_with_genre
    .groupby(["genre", "age"])[["rating"]]
    .mean()
    .unstack("age")
)

In [None]:
# First ten genres of the genre-by-age mean rating table.
genre_ratings.head(n=10)