# **Using NNMF to predict missing ratings**


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dataset_full = pd.read_csv('/content/rating.csv')

In [3]:
min_value_user = dataset_full['userId'].min()
max_value_user = dataset_full['userId'].max()
min_value_movie = dataset_full['movieId'].min()
max_value_movie = dataset_full['movieId'].max()
print(min_value_user,max_value_user,min_value_movie,max_value_movie)

1 4038 1 130642


In [4]:
dataset_full.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602740 entries, 0 to 602739
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     602740 non-null  int64  
 1   movieId    602740 non-null  int64  
 2   rating     602740 non-null  float64
 3   timestamp  602740 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 18.4+ MB


In [5]:
#Getting only the first 1000 rows of the movie lense dataset
global data_count
data_count = 10000 
dataset = dataset_full.head(data_count)

In [6]:
dataset.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     10000 non-null  int64  
 1   movieId    10000 non-null  int64  
 2   rating     10000 non-null  float64
 3   timestamp  10000 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 312.6+ KB


In [7]:
#Removing the Time stamp column as it is not needed
dataset.drop('timestamp', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
dataset['movieId'] = dataset['movieId'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['movieId'] = dataset['movieId'].astype(int)


In [9]:
min_value_user = dataset['userId'].min()
max_value_user = dataset['userId'].max()
min_value_movie = dataset['movieId'].min()
max_value_movie = dataset['movieId'].max()
print(min_value_user,max_value_user,min_value_movie,max_value_movie)

1 91 1 125916


In [10]:
#Creating a newdataframe to add the corresponding ratings.
new_df = pd.DataFrame(columns=range(1,max_value_movie+1), index=range(1,max_value_user+1))

In [11]:
new_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,125907,125908,125909,125910,125911,125912,125913,125914,125915,125916
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [12]:
new_df.shape

(91, 125916)

In [13]:
dataset.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [14]:
dataset.shape

(10000, 3)

In [15]:
#Iterating through all rows of the dataset and adding the corresponding rating to the new dataset
for i in range(dataset.shape[0]):
  x = dataset.iloc[i, 0]
  y = dataset.iloc[i, 1]
  z = dataset.iloc[i, 2]
  new_df.iloc[x-1, y-1] = z

In [16]:
new_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,125907,125908,125909,125910,125911,125912,125913,125914,125915,125916
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [17]:
#Since there are 31,696 movies, I'm going to reduce the number of movies to 100 in order to make the predictions more accurate
final_df = new_df.iloc[:,0:100]

In [18]:
final_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [19]:
#total missing values in each column
missing_values = final_df.isnull().sum()
missing_values

1      63
2      83
3      81
4      89
5      85
       ..
96     91
97     90
98     91
99     91
100    89
Length: 100, dtype: int64

In [20]:
#Filling missing values with 0s
final_df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [21]:
final_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0


In [22]:
R = np.array(final_df.values)

In [23]:
R.shape

(91, 100)

In [24]:
import time

#Matrix factorizing function using NNMF
def matrix_factorization(R, P, Q, K, steps=101, alpha=0.0002, beta=0.02):
    Q = Q.T
    start_time = time.time()
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = np.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if R[i][j] > 0:

                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))

        
        if(step >0 and step%((steps - 1)/10) == 0):
          loss = e/data_count
          loss = round(loss, 2)
          print(f'Epoch = {step}/{steps-1} Loss = {loss}')

        # 0.001: local minimum
        if e < 0.001:

            break

    end_time = time.time()
    elapsed_time = end_time - start_time
    round_time = round(elapsed_time, 2)
    print(f'Elapsed time = {round_time}s')
    return P, Q.T

In [25]:
# N: num of User
N = len(R)
# M: num of Movie
M = len(R[0])
# Num of Features
K = 10

 
#P = np.random.rand(N,K)
#Q = np.random.rand(M,K)
P = np.random.uniform(low=0.0, high=5.0, size=(N, K))
Q = np.random.uniform(low=0.0, high=5.0, size=(M, K))

 

nP, nQ = matrix_factorization(R, P, Q, K)

nR = np.dot(nP, nQ.T)

Epoch = 10/100 Loss = 4.02
Epoch = 20/100 Loss = 1.53
Epoch = 30/100 Loss = 0.84
Epoch = 40/100 Loss = 0.55
Epoch = 50/100 Loss = 0.4
Epoch = 60/100 Loss = 0.31
Epoch = 70/100 Loss = 0.26
Epoch = 80/100 Loss = 0.22
Epoch = 90/100 Loss = 0.19
Epoch = 100/100 Loss = 0.17
Elapsed time = 5.98s


In [26]:
rounded_matrix = np.round(nR, decimals=1)
final_preds = np.clip(rounded_matrix, a_min=None, a_max=5)

#Final matrix with predicted values for null values
final_preds

array([[ 3.1,  4. , -3. , ...,  5. ,  5. ,  5. ],
       [ 2.1,  5. ,  4.9, ...,  5. ,  5. ,  5. ],
       [ 4. ,  5. ,  4.6, ...,  5. ,  5. ,  5. ],
       ...,
       [ 5. ,  5. ,  5. , ...,  5. ,  5. ,  5. ],
       [ 3. ,  5. ,  5. , ...,  5. ,  5. ,  5. ],
       [ 1.4,  2. ,  4.5, ...,  5. ,  5. ,  5. ]])

In [27]:
final_preds[final_preds<0] = 0

In [28]:
n_zeros = np.count_nonzero(final_preds==0)
print(n_zeros)

257


In [29]:
#Total rating cells
Total_cells = final_preds.shape[0]*final_preds.shape[1]
print(Total_cells)

9100


In [30]:
#Ratio of cells with zeros compared to total cells
Ratio = n_zeros/Total_cells
rounded_ratio = np.round(Ratio, decimals=3)
print(rounded_ratio)

0.028


# **Using SVD to find recomendations for each movies**

In [31]:
#Normalizing the matrix
normalised_mat = final_preds - np.asarray([(np.mean(final_preds, 1))]).T

In [32]:
#Compute SVD
A = normalised_mat.T / np.sqrt(final_preds.shape[0] - 1)
U, S, V = np.linalg.svd(A)

In [33]:
#Getting the first 100 movies from the movies dataset
movie_dataset = pd.read_csv('/content/movie.csv')
movie_names = movie_dataset.head(100)

In [34]:
#Function to calculate the cosin distances between the movie inputted to it and other movies
def top_cosine_similarity(data, movie_id, top_n):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]


In [35]:
#Function to print the predicted recommended movies to the inputted movie
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('\nRecommendations for {0}: \n'.format(
    movie_names[movie_names.movieId == movie_id].title.values[0]))

    for id in top_indexes + 1:
        print(movie_names[movie_names.movieId == id].title.values[0])

In [36]:
#Function to interact with the user
def Recomend_Movie():
  Id = int(input("Index of the movie that you want to see recommendations(1 -90): "))
  top = int(input("How many recommendations would you like to see: "))
  k = 50
  movieId = Id# Grab an id from movies.dat
  top_n = top

  sliced = V.T[:, :k] # representative data
  indexes = top_cosine_similarity(sliced, movieId, top_n)

  print_similar_movies(movie_names, movieId, indexes)

In [37]:
import warnings
warnings.simplefilter('ignore')

# **Movie Names & Indexes:**

1 - Toy Story (1995)<br>
2 - Jumanji (1995)<br>
3 - Grumpier Old Men (1995)<br>
4 - Waiting to Exhale (1995)<br>
5 - Father of the Bride Part II (1995)<br>
6 - Heat (1995)<br>
7 - Sabrina (1995)<br>
8 - Tom and Huck (1995)<br>
9 - Sudden Death (1995)<br>
10 - GoldenEye (1995)<br>
11 - American President, The (1995)<br>
12 - Dracula: Dead and Loving It (1995)<br>
13 - Balto (1995)<br>
14 - Nixon (1995)<br>
15 - Cutthroat Island (1995)<br>
16 - Casino (1995)<br>
17 - Sense and Sensibility (1995)<br>
18 - Four Rooms (1995)<br>
19 - Ace Ventura: When Nature Calls (1995)<br>
20 - Money Train (1995)<br>
21 - Get Shorty (1995)<br>
22 - Copycat (1995)<br>
23 - Assassins (1995)<br>
24 - Powder (1995)<br>
25 - Leaving Las Vegas (1995)<br>
26 - Othello (1995)<br>
27 - Now and Then (1995)<br>
28 - Persuasion (1995)<br>
29 - City of Lost Children, The (CitÃ© des enfants perdus, La) (1995)<br>
30 - Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)<br>
31 - Dangerous Minds (1995)<br>
32 - Twelve Monkeys (a.k.a. 12 Monkeys) (1995)<br>
33 - Wings of Courage (1995)<br>
34 - Babe (1995)<br>
35 - Carrington (1995)<br>
36 - Dead Man Walking (1995)<br>
37 - Across the Sea of Time (1995)<br>
38 - It Takes Two (1995)<br>
39 - Clueless (1995)<br>
40 - Cry, the Beloved Country (1995)<br>
41 - Richard III (1995)<br>
42 - Dead Presidents (1995)<br>
43 - Restoration (1995)<br>
44 - Mortal Kombat (1995)<br>
45 - To Die For (1995)<br>
46 - How to Make an American Quilt (1995)<br>
47 - Seven (a.k.a. Se7en) (1995)<br>
48 - Pocahontas (1995)<br>
49 - When Night Is Falling (1995)<br>
50 - Usual Suspects, The (1995)<br>
51 - Guardian Angel (1994)<br>
52 - Mighty Aphrodite (1995)<br>
53 - Lamerica (1994)<br>
54 - Big Green, The (1995)<br>
55 - Georgia (1995)<br>
56 - Kids of the Round Table (1995)<br>
57 - Home for the Holidays (1995)<br>
58 - Postman, The (Postino, Il) (1994)<br>
59 - Confessional, The (Confessionnal, Le) (1995)<br>
60 - Indian in the Cupboard, The (1995)<br>
61 - Eye for an Eye (1996)<br>
62 - Mr. Holland's Opus (1995)<br>
63 - Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)<br>
64 - Two if by Sea (1996)<br>
65 - Bio-Dome (1996)<br>
66 - Lawnmower Man 2: Beyond Cyberspace (1996)<br>
67 - Two Bits (1995)<br>
68 - French Twist (Gazon maudit) (1995)<br>
69 - Friday (1995)<br>
70 - From Dusk Till Dawn (1996)<br>
71 - Fair Game (1995)<br>
72 - Kicking and Screaming (1995)<br>
73 - MisÃ©rables, Les (1995)<br>
74 - Bed of Roses (1996)<br>
75 - Big Bully (1996)<br>
76 - Screamers (1995)<br>
77 - Nico Icon (1995)<br>
78 - Crossing Guard, The (1995)<br>
79 - Juror, The (1996)<br>
80 - White Balloon, The (Badkonake sefid) (1995)<br>
81 - Things to Do in Denver When You're Dead (1995)<br>
82- Antonia's Line (Antonia) (1995)<br>
83 - Once Upon a Time... When We Were Colored (1995)<br>
84 - Last Summer in the Hamptons (1995)<br>
85 - Angels and Insects (1995)<br>
86 - White Squall (1996)<br>
87 - Dunston Checks In (1996)<br>
88 - Black Sheep (1996)<br>
89 - Nick of Time (1995)<br>
90 - Journey of August King, The (1995)<br>


In [39]:
Recomend_Movie()

Index of the movie that you want to see recommendations(1 -90): 1
How many recommendations would you like to see: 10

Recommendations for Toy Story (1995): 

Toy Story (1995)
Nixon (1995)
White Balloon, The (Badkonake sefid) (1995)
Indian in the Cupboard, The (1995)
Journey of August King, The (1995)
Dead Presidents (1995)
Wings of Courage (1995)
When Night Is Falling (1995)
Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
Heat (1995)
