In [19]:
import pandas as pd
import numpy as np

# Load the movie lookup table, reading only the id and title columns.
movieInfo = pd.read_csv(
    "movies.csv",
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

In [20]:
# Preview the first five rows of the movie table.
print(movieInfo.head(n=5))

   movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)


In [22]:
# Load the user ratings, reading only the three columns used below
# and storing them with 32-bit dtypes.
movieRatings = pd.read_csv(
    "ratings.csv",
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [12]:
# Preview the first five rows of the ratings table.
print(movieRatings.head(n=5))

   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [40]:
# Attach each rating to its movie title with an inner join on movieId.
onMovIdMerged = movieRatings.merge(movieInfo, how='inner', on='movieId')

In [18]:
# Preview the first five rows of the merged ratings/titles frame.
print(onMovIdMerged.head(n=5))

   movieId             title  userId  rating
0        1  Toy Story (1995)       1     4.0
1        1  Toy Story (1995)       5     4.0
2        1  Toy Story (1995)       7     4.5
3        1  Toy Story (1995)      15     2.5
4        1  Toy Story (1995)      17     4.5


In [44]:
# Discard any merged rows whose title is missing.
cleanedDataFrame = onMovIdMerged.dropna(subset=['title'], axis='index')

In [45]:
# Preview the first five rows after dropping untitled entries.
print(cleanedDataFrame.head(n=5))

   userId  movieId  rating             title
0       1        1     4.0  Toy Story (1995)
1       5        1     4.0  Toy Story (1995)
2       7        1     4.5  Toy Story (1995)
3      15        1     2.5  Toy Story (1995)
4      17        1     4.5  Toy Story (1995)


In [46]:
# Count how many ratings each title received (missing ratings are not
# counted) and expose the result as two columns: title, totalRatingCount.
ratingCount = (
    cleanedDataFrame
    .groupby(['title'])['rating']
    .count()
    .rename('totalRatingCount')
    .reset_index()
)
print(ratingCount.head(n=5))

                                     title  totalRatingCount
0                               '71 (2014)                 1
1  'Hellboy': The Seeds of Creation (2004)                 1
2                   'Round Midnight (1986)                 2
3                      'Salem's Lot (2004)                 1
4                'Til There Was You (1997)                 2


In [47]:
# Left-join the per-title counts back onto every individual rating row,
# so each rating carries the total number of ratings of its movie.
totalRatingCount = cleanedDataFrame.merge(ratingCount, how='left', on='title')
print(totalRatingCount.head(n=5))

   userId  movieId  rating             title  totalRatingCount
0       1        1     4.0  Toy Story (1995)               215
1       5        1     4.0  Toy Story (1995)               215
2       7        1     4.5  Toy Story (1995)               215
3      15        1     2.5  Toy Story (1995)               215
4      17        1     4.5  Toy Story (1995)               215


In [48]:
# Global pandas display option: render floats with three decimals.
# This affects every float printed by later cells as well.
pd.set_option(
    'display.float_format',
    lambda value: '%.3f' % value,
)
# Summary statistics of the number of ratings per title.
print(ratingCount.loc[:, 'totalRatingCount'].describe())

count   9719.000
mean      10.375
std       22.406
min        1.000
25%        1.000
50%        3.000
75%        9.000
max      329.000
Name: totalRatingCount, dtype: float64


In [50]:
# Minimum number of ratings a movie needs to be kept.
threshold = 40
# Keep only the rating rows whose movie meets the threshold.
ratingPopularMovie = totalRatingCount.loc[
    totalRatingCount['totalRatingCount'] >= threshold
]
print(ratingPopularMovie.head(n=5))

   userId  movieId  rating             title  totalRatingCount
0       1        1   4.000  Toy Story (1995)               215
1       5        1   4.000  Toy Story (1995)               215
2       7        1   4.500  Toy Story (1995)               215
3      15        1   2.500  Toy Story (1995)               215
4      17        1   4.500  Toy Story (1995)               215


In [54]:
# Show the (rows, columns) shape of the filtered ratings frame.
# No trailing semicolon: in Jupyter/IPython it would suppress the
# displayed value and make this cell a no-op.
ratingPopularMovie.shape

In [55]:
# Print the (rows, columns) shape of the filtered ratings frame.
print(ratingPopularMovie.shape)

(49630, 5)


In [61]:
# Reshape to a title-by-userId matrix of ratings; any (title, user)
# pair without a rating is filled with 0.
pivotMatrix = (
    ratingPopularMovie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
pivotMatrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,3.500
10 Things I Hate About You (1999),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,3.000,0.000,5.000,0.000,0.000,0.000,0.000,0.000
101 Dalmatians (1996),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,4.000,0.000,3.000,0.000,0.000,0.000,0.000,0.000
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
12 Angry Men (1957),0.000,0.000,0.000,5.000,0.000,0.000,0.000,0.000,0.000,0.000,...,5.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X2: X-Men United (2003),0.000,0.000,0.000,0.000,0.000,0.000,4.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,4.000,0.000,4.000
You've Got Mail (1998),0.000,0.000,0.500,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,2.000,0.000,0.000,3.500,0.000,0.000,0.000,0.000
Young Frankenstein (1974),5.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,5.000,0.000,0.000,3.500,0.000,0.000,0.000,0.000
Zombieland (2009),0.000,3.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,3.500


In [57]:
from scipy.sparse import csr_matrix

In [58]:
# Convert the dense pivot values into a compressed sparse row (CSR)
# matrix, which stores only the non-zero entries.
pivotArray = csr_matrix(pivotMatrix.to_numpy())

In [60]:
# Print the stored entries of the sparse matrix as (row, column) value.
print(pivotArray)

  (0, 14)	4.0
  (0, 17)	4.0
  (0, 21)	0.5
  (0, 40)	3.5
  (0, 60)	4.5
  (0, 66)	4.0
  (0, 68)	4.0
  (0, 71)	4.0
  (0, 87)	2.5
  (0, 90)	4.0
  (0, 103)	5.0
  (0, 123)	1.0
  (0, 139)	3.0
  (0, 141)	5.0
  (0, 146)	3.5
  (0, 151)	3.5
  (0, 157)	5.0
  (0, 174)	3.5
  (0, 210)	4.5
  (0, 229)	5.0
  (0, 246)	4.0
  (0, 277)	4.0
  (0, 295)	2.0
  (0, 314)	3.0
  (0, 315)	4.0
  :	:
  (637, 378)	2.0
  (637, 384)	3.0
  (637, 410)	0.5
  (637, 411)	4.0
  (637, 416)	4.0
  (637, 417)	4.0
  (637, 421)	3.0
  (637, 423)	2.0
  (637, 435)	2.5
  (637, 445)	3.0
  (637, 477)	4.5
  (637, 480)	5.0
  (637, 481)	4.5
  (637, 486)	1.5
  (637, 492)	5.0
  (637, 522)	4.0
  (637, 531)	4.0
  (637, 539)	1.5
  (637, 557)	4.0
  (637, 558)	3.0
  (637, 570)	5.0
  (637, 587)	2.5
  (637, 597)	4.5
  (637, 605)	3.0
  (637, 607)	4.0
