In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import csr_matrix, save_npz, load_npz, find

In [2]:
!ls -litah dataset/

total 5.1G
16516401 drwxr-xr-x 5 mayukhpay mayukhpay 4.0K Dec 30 14:44 ..
16516646 -rw-r--r-- 1 mayukhpay mayukhpay 105M Dec 30 13:40 test_25000_3000.csv
16516621 drwxr-xr-x 2 mayukhpay mayukhpay 4.0K Dec 30 13:39 .
16516645 -rw-r--r-- 1 mayukhpay mayukhpay  15M Dec 30 08:38 test_10000_1000.csv
16516644 -rw-r--r-- 1 mayukhpay mayukhpay  44M Dec 30 07:43 sample_sparse_test_matrix.npz
16516643 -rw-r--r-- 1 mayukhpay mayukhpay 556K Dec 30 07:39 sample_sparse_test_matrix_25000_3000.npz
16516635 -rw-r--r-- 1 mayukhpay mayukhpay  92K Dec 30 07:38 sample_sparse_test_matrix_10000_1000.npz
16516636 -rw-r--r-- 1 mayukhpay mayukhpay  92M Dec 30 04:26 train_25000_3000.csv
16516641 -rw-r--r-- 1 mayukhpay mayukhpay 1.6M Dec 29 14:07 sample_sparse_matrix_25000_3000.npz
16516614 -rw-r--r-- 1 mayukhpay mayukhpay  14M Dec 29 13:23 train_10000_1000.csv
16516634 -rw-r--r-- 1 mayukhpay mayukhpay 160M Dec 28 08:40 sample_sparse_matrix.npz
16516633 -rw-r--r-- 1 mayukhpay mayukhpay  18M Dec 28 08:

In [3]:
!head ./dataset/combined_data_1.txt

1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248029,3,2004-04-22
1842128,4,2004-05-09


### Processing the data into CSV

In [35]:
# Total runtime 2m 56s

with open('reviews.csv', 'w') as csvFile:
    for i in tqdm(range(1, 5), total=4):
        with open('./dataset/combined_data_' + str(i) + '.txt', 'r') as fin:
            for line in fin:
                line = line.strip()
                if line[-1] == ':':
                    movie_id = line.split(':')[0]
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    csvFile.write(','.join(row))
                    csvFile.write('\n')

100%|██████████| 4/4 [02:56<00:00, 44.07s/it]


In [4]:
reviews = pd.read_csv('reviews.csv', names=['movie_id', 'user_id', 'rating', 'time'])

In [5]:
reviews.head()

Unnamed: 0,movie_id,user_id,rating,time
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [6]:
reviews.shape

(100480507, 4)

In [7]:
reviews.rating.describecribe()

count    1.004805e+08
mean     3.604290e+00
std      1.085219e+00
min      1.000000e+00
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [8]:
reviews.isna().any()

movie_id    False
user_id     False
rating      False
time        False
dtype: bool

In [9]:
reviews.isnull().any()

movie_id    False
user_id     False
rating      False
time        False
dtype: bool

In [10]:
duplicates = reviews.duplicated(['movie_id', 'user_id', 'rating'])
print(sum(duplicates))

0


In [12]:
print('Number of reviews by users: {}'.format(reviews.shape[0]))
print('Number of unique users: {}'.format(reviews.user_id.unique().shape[0]))
print('Number of unique movies: {}'.format(reviews.movie_id.unique().shape[0]))

Number of reviews by users: 100480507
Number of unique users: 480189
Number of unique movies: 17770


### Temporal Splitting of data in train-test

In [13]:
reviews.sort_values('time', ascending=True, inplace=True)

In [14]:
reviews.head()

Unnamed: 0,movie_id,user_id,rating,time
56431994,10341,510180,4,1999-11-11
9056171,1798,510180,5,1999-11-11
58698779,10774,510180,3,1999-11-11
48101611,8651,510180,2,1999-11-11
81893208,14660,510180,2,1999-11-11


In [15]:
reviews_train = reviews.iloc[:int(reviews.shape[0] * 0.80)]
reviews_test = reviews.iloc[int(reviews.shape[0]*0.80):]

In [16]:
reviews_train.shape

(80384405, 4)

In [17]:
reviews_test.shape

(20096102, 4)

In [18]:
reviews_train.to_csv('./dataset/train.csv', index=False)
reviews_test.to_csv('./dataset/test.csv', index=False)

In [21]:
reviews_train.drop('time', inplace=True, axis=1)
reviews_test.drop('time', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [29]:
reviews_train_sparse = csr_matrix((reviews_train.rating.values, (reviews_train.user_id.values, 
                                                                 reviews_train.movie_id.values)),)
reviews_test_sparse = csr_matrix((reviews_test.rating.values, (reviews_test.user_id.values, 
                                                                 reviews_test.movie_id.values)),)

In [32]:
save_npz('./dataset/reviews_train_sparse.npz', reviews_train_sparse)
save_npz('./dataset/reviews_test_sparse.npz', reviews_test_sparse)

In [30]:
us, mv = reviews_train_sparse.shape
elem = reviews_train_sparse.count_nonzero()
print("Sparsity Of Test matrix : {} % ".format((1-(elem/(us*mv))) * 100))

Sparsity Of Test matrix : 99.8292709259195 % 


In [31]:
us, mv = reviews_test_sparse.shape
elem = reviews_test_sparse.count_nonzero()
print("Sparsity Of Test matrix : {} % ".format((1-(elem/(us*mv))) * 100))

Sparsity Of Test matrix : 99.95731772988694 % 


### Loading the dataset

In [6]:
reviews_train_sparse = load_npz('./dataset/reviews_train_sparse.npz')
reviews_test_sparse = load_npz('./dataset/reviews_test_sparse.npz')

In [7]:
reviews_train_sparse.get_shape()

(2649430, 17771)

In [8]:
reviews_test_sparse.get_shape()

(2649430, 17771)

### HandCrafted Features

In [4]:
# Code Courtesy : AAIC

def get_average_ratings(sparse_matrix, of_users):
    # average ratings of user/axes
    ax = 1 if of_users else 0 # 1 - User axes,0 - Movie axes
    # ".A1" is for converting Column_Matrix to 1-D numpy array 
    sum_of_ratings = sparse_matrix.sum(axis=ax).A1
    # Boolean matrix of ratings ( whether a user rated that movie or not)
    is_rated = sparse_matrix!=0
    # no of ratings that each user OR movie..
    no_of_ratings = is_rated.sum(axis=ax).A1
    # max_user  and max_movie ids in sparse matrix 
    u,m = sparse_matrix.shape
    # creae a dictonary of users and their average ratigns..
    average_ratings = { i : sum_of_ratings[i]/no_of_ratings[i]
                                 for i in range(u if of_users else m) 
                                    if no_of_ratings[i] !=0}

    # return that dictionary of average ratings
    return average_ratings

In [5]:
# Code Courtesy : Patially taken from AAIC & modified for better felxibility

def get_sample_sparse_matrix(sparse_matrix, nusers=-1, nmovies=-1, 
                             verbose=True, save=True, **kwargs):
    """
        This would create a sample of the `sparse_matrix` consisting of
        `nmovies` and `nusers` and save it to disk if `save` is True. If `save`
        is kept as default and file name is not mentioned, it stores the file with
        a default name and number of user and the movies appened with it.
        
        :param sparse_matrix: The sparse matrix that is passed <scipy.sparse.matrix>
        
        :param nusers: (default = -1) Number of users to consider. If
                        kept default, all the users are considered <int>

        :param nmovies: (default = - 1) Number of movies to consider. If
                        kept default, all the movies are considered. <int>

        :param verbose: (default = True) Verbose output <bool>
        
        :param save: (default = True) Save to the disk <bool>
    """

    row_index, col_index, ratings = find(sparse_matrix)
    users = np.unique(row_index)
    movies = np.unique(col_index)
    
    if nmovies == -1 and nusers == -1:
        print('No sampling is being performed')
    
    if verbose:
        print("Original Matrix : (users, movies) -- ({} {})".format(len(users), len(movies)))
        print("Original Matrix : Ratings -- {}\n".format(len(ratings)))

    np.random.seed(15)
    if nusers != -1:
        sample_users = np.random.choice(users, nusers, replace=False)
    else:
        sample_users = users

    if nmovies != -1:
        sample_movies = np.random.choice(movies, nmovies, replace=False)
    else:
        sample_movies = movies

    mask = np.logical_and( np.isin(row_index, sample_users),
                      np.isin(col_index, sample_movies) )
    
    sample_sparse_matrix = csr_matrix((ratings[mask], (row_index[mask], col_index[mask])),
                                             shape=(max(sample_users)+1, max(sample_movies)+1))

    if verbose:
        print("Sampled Matrix : (users, movies) -- ({} {})".format(len(sample_users), len(sample_movies)))
        print("Sampled Matrix : Ratings --", format(ratings[mask].shape[0]))
    
    if save:
        if verbose:
            print('Saving it into disk for further usage..')
        path = kwargs.get('path', './dataset/')
        file = kwargs.get('file', 'sample_sparse_matrix')
        u = '_' + str(nusers)
        if nusers == -1:
            u = ''
        m = '_' + str(nmovies)
        if nmovies == -1:
            m = ''
        
        file += u + m
        save_npz(path + file, sample_sparse_matrix)
    else:
        return sample_sparse_matrix

In [46]:
%time get_sample_sparse_matrix(reviews_train_sparse, nusers=10000, nmovies=1000)

Original Matrix : (users, movies) -- (405041 17424)
Original Matrix : Ratings -- 80384405

Sampled Matrix : (users, movies) -- (10000 1000)
Sampled Matrix : Ratings -- 129286
Saving it into disk for further usage..
CPU times: user 1min 19s, sys: 8.24 s, total: 1min 27s
Wall time: 1min 27s


In [10]:
%%time 
get_sample_sparse_matrix(reviews_test_sparse, nusers=10000, nmovies=1000, 
                         file='sample_sparse_test_matrix')

Original Matrix : (users, movies) -- (349312 17757)
Original Matrix : Ratings -- 20096102

Sampled Matrix : (users, movies) -- (10000 1000)
Sampled Matrix : Ratings -- 36017
Saving it into disk for further usage..
CPU times: user 14.6 s, sys: 1.86 s, total: 16.4 s
Wall time: 16.5 s


In [48]:
!ls ./dataset/sample_sparse_matrix_10000_1000.npz  # Dataset of 10k users and 1k movies

./dataset/sample_sparse_matrix_10000_1000.npz


In [11]:
!ls ./dataset/sample_sparse_test_matrix_10000_1000.npz

./dataset/sample_sparse_test_matrix_10000_1000.npz


In [144]:
%time get_sample_sparse_matrix(reviews_train_sparse, nusers=25000, nmovies=3000)

Original Matrix : (users, movies) -- (405041 17424)
Original Matrix : Ratings -- 80384405

Sampled Matrix : (users, movies) -- (25000 3000)
Sampled Matrix : Ratings -- 856986
Saving it into disk for further usage..
CPU times: user 1min 16s, sys: 7.86 s, total: 1min 24s
Wall time: 1min 24s


In [12]:
%%time
get_sample_sparse_matrix(reviews_test_sparse, nusers=25000, nmovies=3000,
                              file='sample_sparse_test_matrix')

Original Matrix : (users, movies) -- (349312 17757)
Original Matrix : Ratings -- 20096102

Sampled Matrix : (users, movies) -- (25000 3000)
Sampled Matrix : Ratings -- 261693
Saving it into disk for further usage..
CPU times: user 15 s, sys: 1.78 s, total: 16.8 s
Wall time: 16.8 s


In [145]:
!ls ./dataset/sample_sparse_matrix_25000_3000.npz # Dataset of 25k users and 3k movies

./dataset/sample_sparse_matrix_25000_3000.npz


In [13]:
!ls ./dataset/sample_sparse_test_matrix_25000_3000.npz

./dataset/sample_sparse_test_matrix_25000_3000.npz


In [51]:
%time get_sample_sparse_matrix(reviews_train_sparse)

No sampling is being performed
Original Matrix : (users, movies) -- (405041 17424)
Original Matrix : Ratings -- 80384405

Sampled Matrix : (users, movies) -- (405041 17424)
Sampled Matrix : Ratings -- 80384405
Saving it into disk for further usage..
CPU times: user 2min 21s, sys: 11.7 s, total: 2min 32s
Wall time: 2min 38s


In [53]:
!ls ./dataset/sample_sparse_matrix.npz # The complete dataset

./dataset/sample_sparse_matrix.npz


In [15]:
%%time 
get_sample_sparse_matrix(reviews_test_sparse, file='sample_sparse_test_matrix')

No sampling is being performed
Original Matrix : (users, movies) -- (349312 17757)
Original Matrix : Ratings -- 20096102

Sampled Matrix : (users, movies) -- (349312 17757)
Sampled Matrix : Ratings -- 20096102
Saving it into disk for further usage..
CPU times: user 33.9 s, sys: 2.81 s, total: 36.7 s
Wall time: 37 s


In [16]:
!ls ./dataset/sample_sparse_test_matrix.npz

./dataset/sample_sparse_test_matrix.npz


### Create the dataset on different samples

- Add Global Average                                 **1 dimension**
- Add the User Average                               **1 dimension**
- Add the movie Average                              **1 dimension**
- Add the top 10 similar movies                      **10 dimension**
- Add the top 10 similar users                       **10 dimension**
- Add other columns like movie id, user id, rating   **3/4 dimension**

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
import csv
from tqdm import tqdm
from multiprocessing import Manager, Process

#### On the training dataset of dimension (10000 x 1000)

In [28]:
sample_train_sparse_matrix = load_npz('./dataset/sample_sparse_matrix_10000_1000.npz')

In [29]:
sample_train_sparse_matrix.shape

(2649405, 17724)

In [111]:
sample_train_users, sample_train_movies, sample_train_ratings = find(sample_train_sparse_matrix)

In [112]:
sample_train_users.shape

(129286,)

In [30]:
userAvg = get_average_ratings(sample_train_sparse_matrix, of_users=True)
userAvg[704512]

4.0

In [31]:
movieAvg = get_average_ratings(sample_train_sparse_matrix, of_users=False)
movieAvg[33]

4.092436974789916

In [32]:
GAvg = sample_train_sparse_matrix.sum()/sample_train_sparse_matrix.count_nonzero()

In [46]:
len(sample_train_ratings)

129286

In [76]:
18 * 7183

129294

In [88]:
# U = User Vector
# M = Movie Vector
# R = Rating Vector
# with open('./dataset/train_10000_1000.csv', 'w') as csvFile:
#     csvwriter = csv.writer(csvFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

# Code Runtime :- Almost 1 hours
def get_train_data(worker, all_users, all_movies, all_ratings, trainsets):

    for user, movie, rating in zip(all_users, 
                                    all_movies, 
                                    all_ratings):
        
#         print(user, movie, rating)
        ## Get the ratings from top 10 similar users who rated the same movie ##
        # Movie constant, users changing
        row = []
        U = sample_train_sparse_matrix[user] # movie vector the user
        similar_users = cosine_similarity(U, sample_train_sparse_matrix).ravel()
        # These are top users for the `user`
        top_users_sorted = np.argsort(-similar_users)[1::]
        # Now get the ratings for these top users for the `movie`
        R_user = sample_train_sparse_matrix[top_users_sorted, movie].toarray().ravel()
        R_user = R_user[R_user != 0][:10]
        R_user = R_user.tolist()
        R_user.extend([movieAvg[movie]] * (10 - len(R_user)))
        ## Get the top ratings of the user who rated similar movies ##
        # User constant, movies changing

        M = sample_train_sparse_matrix[:, movie]
        similar_movies = cosine_similarity(M.T, sample_train_sparse_matrix.T).ravel()
        top_movies_sorted = np.argsort(-similar_movies)[1::]
        R_movie = sample_train_sparse_matrix[user, top_movies_sorted].toarray().ravel()
        R_movie = R_movie[R_movie != 0][:10]
        R_movie = R_movie.tolist()
        R_movie.extend([userAvg[user]] * (10 - len(R_movie)))


        row.extend([user, movie])
        row.extend(R_user)
        row.extend(R_movie)
        row.extend([movieAvg[movie], userAvg[user], GAvg])
        row.extend([rating])

        trainsets.append(row)
    print('Worker = {} is completed'.format(worker))

if __name__ == '__main__':
    manager = Manager()
    trainsets = manager.list()
    jobs = []
    for i in range(18):
        p = Process(target=get_train_data, args=(i, 
                                                 sample_train_users[i*7183:(i+1)*7183],
                                                 sample_train_movies[i*7183:(i+1)*7183],
                                                 sample_train_ratings[i*7183:(i+1)*7183],
                                                 trainsets,))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

Worker = 3 is completed
Worker = 10 is completed
Worker = 6 is completed
Worker = 8 is completed
Worker = 14 is completed
Worker = 16 is completed
Worker = 13 is completed
Worker = 12 is completed
Worker = 9 is completed
Worker = 0 is completed
Worker = 2 is completed
Worker = 17 is completed
Worker = 7 is completed
Worker = 5 is completed
Worker = 11 is completed
Worker = 15 is completed
Worker = 4 is completed
Worker = 1 is completed


In [89]:
assert(len(trainsets) == len(sample_train_users))

In [None]:
with open('./dataset/train_10000_1000.csv', 'w') as csvFile:
    csvwriter = csv.writer(csvFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for each in tqdm(list(trainsets), total=len(sample_train_users)):
        csvwriter.writerow(each)

In [93]:
!wc -l ./dataset/train_10000_1000.csv

129286 ./dataset/train_10000_1000.csv


#### On the testing dataset of dimension (10000 x 1000)

In [36]:
sample_test_sparse_matrix = load_npz('./dataset/sample_sparse_test_matrix_10000_1000.npz')

In [37]:
sample_test_sparse_matrix.shape

(2649405, 17760)

In [38]:
sample_test_users, sample_test_movies, sample_test_ratings = find(sample_test_sparse_matrix)

In [39]:
len(sample_test_users)

36017

In [51]:
18 * 2001

36018

In [53]:
%%time
def get_test_data(worker, all_users, all_movies, all_ratings, testsets):

    for user, movie, rating in zip(all_users, 
                                    all_movies, 
                                    all_ratings):
        
        ## Get the ratings from top 10 similar users who rated the same movie ##
        # Movie constant, users changing
        row = []
        try:
            U = sample_train_sparse_matrix[user] # movie vector the user
            similar_users = cosine_similarity(U, sample_train_sparse_matrix).ravel()
            # These are top users for the `user`
            top_users_sorted = np.argsort(-similar_users)[1::]
            # Now get the ratings for these top users for the `movie`
            R_user = sample_train_sparse_matrix[top_users_sorted, movie].toarray().ravel()
            R_user = R_user[R_user != 0][:10]
            R_user = R_user.tolist()
            R_user.extend([movieAvg[movie]] * (10 - len(R_user)))
        except (IndexError, KeyError):
            R_user.extend([GAvg] * (10 - len(R_user)))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
        ## Get the top ratings of the user who rated similar movies ##
        # User constant, movies changing
        
        try:
            M = sample_train_sparse_matrix[:, movie]
            similar_movies = cosine_similarity(M.T, sample_train_sparse_matrix.T).ravel()
            top_movies_sorted = np.argsort(-similar_movies)[1::]
            R_movie = sample_train_sparse_matrix[user, top_movies_sorted].toarray().ravel()
            R_movie = R_movie[R_movie != 0][:10]
            R_movie = R_movie.tolist()
            R_movie.extend([userAvg[user]] * (10 - len(R_movie)))
        except (IndexError, KeyError):
            R_movie.extend([GAvg] * (10 - len(R_movie)))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

        row.extend([user, movie])
        row.extend(R_user)
        row.extend(R_movie)
        try:
            mavg = movieAvg[movie]
        except KeyError:
            mavg = GAvg
        try:
            uavg = userAvg[user]
        except KeyError:
            uavg = GAvg
        
        row.extend([mavg, uavg, GAvg])
        row.extend([rating])

        testsets.append(row)
    print('Worker = {} is completed'.format(worker))

if __name__ == '__main__':
    manager = Manager()
    testsets = manager.list()
    jobs = []
    for i in range(18):
        p = Process(target=get_test_data, args=(i, 
                                                 sample_test_users[i*2001:(i+1)*2001],
                                                 sample_test_movies[i*2001:(i+1)*2001],
                                                 sample_test_ratings[i*2001:(i+1)*2001],
                                                 testsets,))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

Worker = 1 is completed
Worker = 13 is completed
Worker = 12 is completed
Worker = 3 is completed
Worker = 5 is completed
Worker = 2 is completed
Worker = 10 is completed
Worker = 8 is completed
Worker = 14 is completed
Worker = 7 is completed
Worker = 4 is completed
Worker = 0 is completed
Worker = 15 is completed
Worker = 9 is completed
Worker = 11 is completed
Worker = 17 is completed
Worker = 16 is completed
Worker = 6 is completed
CPU times: user 100 ms, sys: 368 ms, total: 468 ms
Wall time: 15min 47s


In [54]:
assert(len(testsets) == len(sample_test_users))

In [55]:
with open('./dataset/test_10000_1000.csv', 'w') as csvFile:
    csvwriter = csv.writer(csvFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for each in tqdm(list(testsets), total=len(sample_test_users)):
        csvwriter.writerow(each)

100%|██████████| 36017/36017 [00:00<00:00, 36265.54it/s]


#### On the training dataset of dimension (25000 x 3000)

In [60]:
sample_train_sparse_matrix = load_npz('./dataset/sample_sparse_matrix_25000_3000.npz')

In [62]:
sample_train_users, sample_train_movies, sample_train_ratings = find(sample_train_sparse_matrix)

In [63]:
sample_train_users.shape

(856986,)

In [64]:
userAvg = get_average_ratings(sample_train_sparse_matrix, of_users=True)

In [65]:
userAvg[917504]

4.5

In [66]:
movieAvg = get_average_ratings(sample_train_sparse_matrix, of_users=False)

In [67]:
movieAvg[16389]

2.2017543859649122

In [68]:
GAvg = sample_train_sparse_matrix.sum()/sample_train_sparse_matrix.count_nonzero()

In [69]:
GAvg

3.5875813607223455

In [70]:
len(sample_train_ratings)

856986

In [178]:
18 * 47615

857070

In [None]:
%%time
# Overnight Computation: Approx time - 10 hours approx.

def get_train_data(worker, all_users, all_movies, all_ratings, trainsets):

    for user, movie, rating in zip(all_users, 
                                   all_movies, 
                                   all_ratings):
        
#         print(user, movie, rating)
        ## Get the ratings from top 10 similar users who rated the same movie ##
        # Movie constant, users changing
        row = []
        U = sample_train_sparse_matrix[user] # movie vector the user
        similar_users = cosine_similarity(U, sample_train_sparse_matrix).ravel()
        # These are top users for the `user`
        top_users_sorted = np.argsort(-similar_users)[1::]
        # Now get the ratings for these top users for the `movie`
        R_user = sample_train_sparse_matrix[top_users_sorted, movie].toarray().ravel()
        R_user = R_user[R_user != 0][:10]
        R_user = R_user.tolist()
        R_user.extend([movieAvg[movie]] * (10 - len(R_user)))
        ## Get the top ratings of the user who rated similar movies ##
        # User constant, movies changing

        M = sample_train_sparse_matrix[:, movie]
        similar_movies = cosine_similarity(M.T, sample_train_sparse_matrix.T).ravel()
        top_movies_sorted = np.argsort(-similar_movies)[1::]
        R_movie = sample_train_sparse_matrix[user, top_movies_sorted].toarray().ravel()
        R_movie = R_movie[R_movie != 0][:10]
        R_movie = R_movie.tolist()
        R_movie.extend([userAvg[user]] * (10 - len(R_movie)))


        row.extend([user, movie])
        row.extend(R_user)
        row.extend(R_movie)
        row.extend([movieAvg[movie], userAvg[user], GAvg])
        row.extend([rating])

        trainsets.append(row)
    print('Worker = {} is completed'.format(worker))

if __name__ == '__main__':
    manager = Manager()
    trainsets = manager.list()
    jobs = []
    for i in range(18):
        p = Process(target=get_train_data, args=(i, 
                                                 sample_train_users[i*47615:(i+1)*47615],
                                                 sample_train_movies[i*47615:(i+1)*47615],
                                                 sample_train_ratings[i*47615:(i+1)*47615],
                                                 trainsets,))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

In [180]:
assert(len(trainsets) == len(sample_train_users))

In [None]:
with open('./dataset/train_25000_3000.csv', 'w') as csvFile:
    csvwriter = csv.writer(csvFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for each in tqdm(list(trainsets), total=len(sample_train_users)):
        csvwriter.writerow(each)

In [183]:
!wc -l ./dataset/train_25000_3000.csv

856986 ./dataset/train_25000_3000.csv


#### On the testing dataset of dimension (25000 x 3000)

In [56]:
sample_test_sparse_matrix = load_npz('./dataset/sample_sparse_test_matrix_25000_3000.npz')

In [57]:
sample_test_sparse_matrix.shape

(2649405, 17761)

In [58]:
sample_test_users, sample_test_movies, sample_test_ratings = find(sample_test_sparse_matrix)

In [71]:
len(sample_test_users)

261693

In [75]:
18 * 14539 

261702

In [76]:
%%time
def get_test_data(worker, all_users, all_movies, all_ratings, testsets):

    for user, movie, rating in zip(all_users, 
                                    all_movies, 
                                    all_ratings):
        
        ## Get the ratings from top 10 similar users who rated the same movie ##
        # Movie constant, users changing
        row = []
        try:
            U = sample_train_sparse_matrix[user] # movie vector the user
            similar_users = cosine_similarity(U, sample_train_sparse_matrix).ravel()
            # These are top users for the `user`
            top_users_sorted = np.argsort(-similar_users)[1::]
            # Now get the ratings for these top users for the `movie`
            R_user = sample_train_sparse_matrix[top_users_sorted, movie].toarray().ravel()
            R_user = R_user[R_user != 0][:10]
            R_user = R_user.tolist()
            R_user.extend([movieAvg[movie]] * (10 - len(R_user)))
        except (IndexError, KeyError):
            R_user.extend([GAvg] * (10 - len(R_user)))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
        ## Get the top ratings of the user who rated similar movies ##
        # User constant, movies changing
        
        try:
            M = sample_train_sparse_matrix[:, movie]
            similar_movies = cosine_similarity(M.T, sample_train_sparse_matrix.T).ravel()
            top_movies_sorted = np.argsort(-similar_movies)[1::]
            R_movie = sample_train_sparse_matrix[user, top_movies_sorted].toarray().ravel()
            R_movie = R_movie[R_movie != 0][:10]
            R_movie = R_movie.tolist()
            R_movie.extend([userAvg[user]] * (10 - len(R_movie)))
        except (IndexError, KeyError):
            R_movie.extend([GAvg] * (10 - len(R_movie)))
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #

        row.extend([user, movie])
        row.extend(R_user)
        row.extend(R_movie)
        try:
            mavg = movieAvg[movie]
        except KeyError:
            mavg = GAvg
        try:
            uavg = userAvg[user]
        except KeyError:
            uavg = GAvg
        
        row.extend([mavg, uavg, GAvg])
        row.extend([rating])

        testsets.append(row)
    print('Worker = {} is completed'.format(worker))

if __name__ == '__main__':
    manager = Manager()
    testsets = manager.list()
    jobs = []
    for i in range(18):
        p = Process(target=get_test_data, args=(i, 
                                                 sample_test_users[i*14539:(i+1)*14539],
                                                 sample_test_movies[i*14539:(i+1)*14539],
                                                 sample_test_ratings[i*14539:(i+1)*14539],
                                                 testsets,))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()

Worker = 15 is completed
Worker = 16 is completed
Worker = 17 is completed
Worker = 8 is completed
Worker = 14 is completed
Worker = 13 is completed
Worker = 10 is completed
Worker = 11 is completed
Worker = 9 is completed
Worker = 2 is completed
Worker = 12 is completed
Worker = 5 is completed
Worker = 3 is completed
Worker = 6 is completed
Worker = 0 is completed
Worker = 1 is completed
Worker = 4 is completed
Worker = 7 is completed
CPU times: user 500 ms, sys: 472 ms, total: 972 ms
Wall time: 2h 43min 17s


In [77]:
assert(len(testsets) == len(sample_test_users))

In [78]:
with open('./dataset/test_25000_3000.csv', 'w') as csvFile:
    csvwriter = csv.writer(csvFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for each in tqdm(list(testsets), total=len(sample_test_users)):
        csvwriter.writerow(each)

100%|██████████| 261693/261693 [00:07<00:00, 34412.85it/s]


In [3]:
reg_train_df = pd.read_csv('./dataset/train_25000_3000.csv', 
                           names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)

In [4]:
reg_train_df.head()

Unnamed: 0,user,movie,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,...,smr5,smr6,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,rating
0,174683,10,5.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,...,2.0,4.0,3.0,4.0,2.0,5.0,3.611111,3.882353,3.587581,5
1,1555957,1266,3.0,5.0,5.0,4.0,2.0,3.0,3.0,3.0,...,5.0,4.0,3.0,5.0,4.0,3.0,3.163265,3.714286,3.587581,3
2,659804,2779,4.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,...,5.0,4.0,3.0,4.0,4.0,4.0,2.774065,2.945946,3.587581,2
3,1948839,3368,2.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,...,5.0,3.0,4.0,4.0,3.0,4.0,3.5152,3.850467,3.587581,4
4,1939698,5317,4.0,4.0,4.0,4.0,3.0,2.0,4.0,4.0,...,3.0,4.0,3.0,4.0,4.0,5.0,3.386404,3.666667,3.587581,2


In [5]:
reg_train_df.shape

(856986, 26)

In [6]:
reg_test_df = pd.read_csv('./dataset/test_25000_3000.csv', 
                           names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)

In [7]:
reg_test_df.head()

Unnamed: 0,user,movie,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,...,smr5,smr6,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,rating
0,1129620,2,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,...,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3
1,2407458,5582,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,...,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,4
2,668855,6677,2.0,5.0,1.0,4.0,3.0,3.0,3.0,4.0,...,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.376,3.587581,3.587581,3
3,1704929,3756,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,...,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,4
4,2587366,8915,5.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,...,3.587581,3.587581,3.587581,3.587581,3.587581,3.587581,3.979495,3.587581,3.587581,4
