# MovieLens Dataset example
https://grouplens.org/datasets/movielens/

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import sys
sys.path.append("/home/panderson/rankability_toolbox")

In [5]:
import pandas as pd
import numpy as np
import dill

In [290]:
import pyrankability

In [30]:
DATA_DIR='ml-latest-small'

## Read and preprocess the data

In [14]:
links = pd.read_csv("%s/links.csv"%DATA_DIR)
display(links.head())
links["tmdbId"] = links["tmdbId"].fillna(-1).astype(int)
display(links.head())
display(links.dtypes)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


movieId    int64
imdbId     int64
tmdbId     int64
dtype: object

In [209]:
movies = pd.read_csv("%s/movies.csv"%DATA_DIR)
display(movies.head())
movies["genres"] = movies["genres"].str.split("|")
display(movies.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [210]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

movies = movies.join(pd.DataFrame(mlb.fit_transform(movies['genres']),
                          columns=mlb.classes_,
                          index=movies.index))


In [211]:
movies

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat (1995),"[Action, Crime, Thriller]",0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina (1995),"[Comedy, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck (1995),"[Adventure, Children]",0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death (1995),[Action],0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]",0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
ratings = pd.read_csv("%s/ratings.csv"%DATA_DIR)
display(ratings.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Exploratory

**Number of unique movies along with their counts**

In [29]:
ratings.set_index("movieId").join(movies.set_index("movieId"))["title"].value_counts()

Forrest Gump (1994)                                                               329
Shawshank Redemption, The (1994)                                                  317
Pulp Fiction (1994)                                                               307
Silence of the Lambs, The (1991)                                                  279
Matrix, The (1999)                                                                278
Star Wars: Episode IV - A New Hope (1977)                                         251
Jurassic Park (1993)                                                              238
Braveheart (1995)                                                                 237
Terminator 2: Judgment Day (1991)                                                 224
Schindler's List (1993)                                                           220
Fight Club (1999)                                                                 218
Toy Story (1995)                                      

**What if we say 50 people in total had to rank a movie to even start considering it?**

In [31]:
counts = ratings.set_index("movieId").join(movies.set_index("movieId"))["title"].value_counts()
counts[counts > 50]

Forrest Gump (1994)                                                               329
Shawshank Redemption, The (1994)                                                  317
Pulp Fiction (1994)                                                               307
Silence of the Lambs, The (1991)                                                  279
Matrix, The (1999)                                                                278
Star Wars: Episode IV - A New Hope (1977)                                         251
Jurassic Park (1993)                                                              238
Braveheart (1995)                                                                 237
Terminator 2: Judgment Day (1991)                                                 224
Schindler's List (1993)                                                           220
Fight Club (1999)                                                                 218
Toy Story (1995)                                      

**This gets us to 437 movies, so that's a decent D matrix I think.**

For a start, let's fill in values in D by finding users who ranked two movies and then storing the difference in rating.

In [74]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [None]:
from itertools import combinations
flatten = lambda l: [item for sublist in l for item in sublist]
df1 = ratings.groupby("userId").apply(lambda df: 
                                     pd.DataFrame([flatten(tup) for tup in list(combinations(df.values,2))],columns=[v+"_i"for v in df.columns]+[v+"_j"for v in df.columns],dtype=int).
                                     set_index(["movieId_i","movieId_j"]))

df2 = df1.apply(lambda x: x["rating_i"]-x["rating_j"],axis=1).unstack()
display(df2)
#fix negatives when square
#inxs = np.where(df2 < 0)
#print("Fix negative numbers",len(inxs[0]))
#df2.values[inxs[1],inxs[0]] = -df2.values[inxs[0],inxs[1]]
#df2.values[inxs[0],inxs[1]] = 0
#display(df2.groupby().apply(lambda x: x.unstack()))
#inxs = np.where(df2 < 0)
#print("After fix negative numbers",len(inxs[0]))



In [None]:
means = df2.stack().groupby(["movieId_i","movieId_j"]).mean()

In [None]:
counts = df2.stack().groupby(["movieId_i","movieId_j"]).count()

**A quick glance at our counts and means**

In [189]:
display(counts.head())

movieId_i  movieId_j
1          2            68
           3            32
           4             2
           5            32
           6            58
dtype: int64

In [191]:
display(counts.head())

movieId_i  movieId_j
1          2            68
           3            32
           4             2
           5            32
           6            58
dtype: int64

## Sparse format to D
We now have a parse format, but we need to turn this into a D matrix. Let's only use a movie if there exists a paired count greater than 10.

In [197]:
count_mask = counts > 10
print(len(np.unique(list(counts[count_mask].unstack().index) + list(counts[count_mask].unstack().columns))))

2021


This means we would have a D matrix that is size 2021 by 2021. A little larger than our target of 500 x 500. But this is still small enough that we can construct the total matrix.

In [246]:
D_counts = counts[count_mask].unstack()
D_means = means[count_mask].unstack()

In [247]:
D_counts.stack()

movieId_i  movieId_j
1          2             68.0
           3             32.0
           5             32.0
           6             58.0
           7             32.0
           9             12.0
           10            69.0
           11            42.0
           12            15.0
           14            12.0
           16            41.0
           17            44.0
           18            12.0
           19            55.0
           21            47.0
           22            22.0
           23            11.0
           24            18.0
           25            42.0
           29            19.0
           31            17.0
           32           104.0
           34            84.0
           36            37.0
           39            59.0
           44            28.0
           45            17.0
           47            99.0
           48            42.0
           50            96.0
                        ...  
134130     152077        13.0
134853     138036  

We need to label the data so we can search for specific genres

In [248]:
D_counts_labelled = D_counts.join(movies.set_index("movieId")).transpose().join(movies.set_index("movieId"))
D_means_labelled = D_means.join(movies.set_index("movieId")).transpose().join(movies.set_index("movieId"))

**Break the data into genres**

In [249]:
D_counts_labelled

Unnamed: 0,1,2,3,5,6,7,9,10,11,12,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
2,68,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,32,26,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,32,22,19,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,58,36,25,24,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,32,18,19,23,22,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,12,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,69,56,23,20,49,22,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11,42,29,13,16,26,24,,36,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12,15,,,,,,,,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,12,,,,14,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [340]:
genres = ["Comedy","Romance"]
genre = genres[1]

In [341]:
from scipy.sparse import csr_matrix

i_mask = D_counts_labelled.loc[genre] == 1
j_mask = D_counts_labelled.loc[:,genre] == 1
D_means_genre = D_means.loc[i_mask,j_mask]
D_counts_genre = D_counts.loc[i_mask,j_mask]
inxs = np.where(~D_means_genre.isna())
n = max([max(inxs[0]),max(inxs[1])])+1
row = np.ix_(inxs[0],inxs[1])[0].reshape(1,-1)[0]
col = np.ix_(inxs[0],inxs[1])[1].reshape(1,-1)[0]
D = csr_matrix((D_means_genre.values[inxs].flatten(), (row,col)), shape=(n, n)).todense()
# Now fix negative numbers
inxs = np.where(D < 0)
print("Fix negative numbers",len(inxs[0]))
D[inxs[1],inxs[0]] = -D[inxs[0],inxs[1]]
D[inxs[0],inxs[1]] = 0
print("Just a subset of the matrix")
display(D[:10,:10])
inxs = np.where(D < 0)
print("After fix negative numbers",len(inxs[0]))
print("Summary of number of nonzero")
print(D.shape,(D>0).sum())

Fix negative numbers 5838
Just a subset of the matrix


matrix([[0.        , 0.21052632, 0.        , 0.        , 0.        ,
         0.        , 0.28571429, 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.15384615, 0.54166667, 0.        , 0.        , 0.08333333,
         0.36842105, 0.30769231, 0.07692308, 0.        , 0.        ],
        [0.23076923, 0.56      , 0.30769231, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.25      , 0.45454545, 0.        , 0.        , 0.44827586,
         0.44444444, 0.5       , 0.38461538, 0.15384615, 0.        ],
        [0.25      , 0.45454545, 0.        , 0.        , 0.        ,
         0.17241379, 0.        , 0.47058824, 0.0625    , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.37037037, 0.        , 0.        , 0.        ],
        [0.        , 0.6   

After fix negative numbers 0
Summary of number of nonzero
(372, 372) 11122


In [None]:
k,details = pyrankability.hillside.count_lp(D)
print(k,details["P"])

In [None]:
print(k,details["P"]) 

In [334]:
pyrankability.hillside.objective_count_exhaustive(D[:8,:8])

(53,
 {'P': [(6, 2, 1, 4, 5, 7, 3, 0),
   (6, 2, 1, 4, 7, 3, 5, 0),
   (6, 2, 1, 4, 7, 5, 3, 0),
   (6, 2, 1, 5, 4, 7, 3, 0)]})

In [None]:
1e20

In [None]:
print('test')