In [68]:
# Imports
import pandas as pd
import chardet
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [28]:
movie_titles = pd.read_csv('movie_titles.txt', 
    encoding="ISO-8859-1", 
    sep = ',',
    usecols=[0, 1, 2])
display(movie_titles)

Unnamed: 0,Movie_ID,Year,Title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [5]:
netflix_ratings = pd.read_csv('netflix_ratings.txt', 
    encoding="ascii", 
    sep = ' ', 
    header=None)
movie_count = netflix_ratings.iloc[:,0].nunique()
print("Number of movies:", movie_count)

Number of movies: 17770


In [7]:
# Used to select only a subset of the entire dataset of ranked movies
netflix_ratings = netflix_ratings[netflix_ratings.iloc[:, 0].between(1, 500)]
netflix_ratings.columns = ["M_ID", "M_Year", "U_ID", "U_Rating", "U_Date"]
display(netflix_ratings)

Unnamed: 0,M_ID,M_Year,U_ID,U_Rating,U_Date
0,1,2003.0,1488844,3,2005-09-06
1,1,2003.0,822109,5,2005-05-13
2,1,2003.0,885013,4,2005-10-19
3,1,2003.0,30878,4,2005-12-26
4,1,2003.0,823519,3,2004-05-03
...,...,...,...,...,...
2798699,500,2002.0,651950,4,2005-06-28
2798700,500,2002.0,924510,3,2005-07-12
2798701,500,2002.0,965381,3,2005-08-19
2798702,500,2002.0,822391,1,2004-11-04


In [29]:
def get_usr_ratings(id):
    if netflix_ratings['U_ID'].isin([id]).any():
        temp_df = netflix_ratings[netflix_ratings['U_ID'] == id]
        merged_df = pd.merge(temp_df, movie_titles, left_on='M_ID', right_on='Movie_ID')
        merged_df.set_index('M_ID', inplace=True)
        merged_df["M_Year"] = merged_df["M_Year"].astype(int)
        merged_df["Year"] = merged_df["Year"].astype(int)
        display(merged_df)
    else:
        print("User with id", id, "does not exist. Try another one.") 

In [30]:
print(netflix_ratings.columns)
print(movie_titles.columns)

Index(['M_ID', 'M_Year', 'U_ID', 'U_Rating', 'U_Date'], dtype='object')
Index(['Movie_ID', 'Year', 'Title'], dtype='object')


## Create dataframe for PCA

In [32]:
#merging netflic ratings and movie titles (actually not useful at all)
merged_df = pd.merge(netflix_ratings, movie_titles, left_on='M_ID', right_on='Movie_ID')

#making data frame where all movie ratings are given in rows with all user ratings as cloumns
PCA_df = pd.pivot_table(merged_df, values='U_Rating', index='M_ID', columns='U_ID')

#droping all columns where all values are Nan. These columns are not useful to us.
#PCA_df = PCA_df.dropna(axis=1, how='all')

#Ended up dropping all columns where there are less then a fourt of the column with actual values, 
#dont know if i should actually do this, but it is easy to correct.
PCA_df = PCA_df.dropna(axis=1, thresh=len(PCA_df) / 4)


#replacing NaN with 0, to be able to calculate mean
PCA_df = PCA_df.fillna(0)

display(PCA_df)

U_ID,16272,57633,303948,305344,322009,387418,491531,504620,507603,525356,...,2147527,2237185,2238060,2291306,2297136,2439493,2457095,2537543,2606799,2625420
M_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,1.0,3.0,1.0,5.0,2.0,0.0,2.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0
3,4.0,4.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,4.0,0.0,2.0,0.0,1.0,1.0,0.0,2.0,0.0
4,2.0,0.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0,2.0
497,0.0,4.0,0.0,1.0,0.0,3.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
498,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0
499,3.0,1.0,3.0,3.0,3.0,2.0,4.0,1.0,1.0,0.0,...,0.0,3.0,5.0,1.0,0.0,1.0,3.0,4.0,3.0,2.0


## Make it column wise zero empirical mean (normalizing)

In [43]:
#Ensuring column wise zero empirical mean by subtracting the mean of each column from each value in the column.
normalized_PCA_df = PCA_df - PCA_df.mean(axis=0)
display(normalized_PCA_df)

U_ID,16272,57633,303948,305344,322009,387418,491531,504620,507603,525356,...,2147527,2237185,2238060,2291306,2297136,2439493,2457095,2537543,2606799,2625420
M_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.076,-0.878,-1.414,-0.8,1.958,-0.744,3.874,1.428,-0.482,1.222,...,-0.68,-0.976,-1.368,0.43,-0.452,-0.124,-0.806,4.28,-1.332,-0.822
2,-0.924,-0.878,-1.414,-0.8,-1.042,-0.744,-1.126,-0.572,0.518,-0.778,...,0.32,-0.976,-1.368,-0.57,-0.452,-0.124,-0.806,-0.72,-0.332,1.178
3,3.076,3.122,-1.414,0.2,-1.042,0.256,-1.126,-0.572,-0.482,-0.778,...,0.32,3.024,-1.368,1.43,-0.452,-0.124,0.194,-0.72,0.668,-0.822
4,1.076,-0.878,0.586,-0.8,-1.042,0.256,-1.126,-0.572,-0.482,-0.778,...,-0.68,-0.976,-1.368,0.43,-0.452,-0.124,-0.806,-0.72,-1.332,-0.822
5,-0.924,-0.878,-1.414,-0.8,-1.042,-0.744,-0.126,0.428,0.518,-0.778,...,0.32,-0.976,-1.368,-0.57,-0.452,-0.124,-0.806,-0.72,-0.332,-0.822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,-0.924,-0.878,-1.414,-0.8,-1.042,-0.744,-1.126,-0.572,0.518,-0.778,...,0.32,-0.976,-1.368,-0.57,-0.452,-0.124,-0.806,0.28,1.668,1.178
497,-0.924,3.122,-1.414,-0.8,-1.042,1.256,-1.126,-0.572,0.518,-0.778,...,-0.68,-0.976,-1.368,-0.57,-0.452,-0.124,-0.806,-0.72,-1.332,1.178
498,-0.924,-0.878,-1.414,1.2,-1.042,1.256,-1.126,-0.572,-0.482,-0.778,...,-0.68,-0.976,-1.368,-0.57,-0.452,-0.124,3.194,-0.72,-1.332,-0.822
499,2.076,0.122,1.586,1.2,1.958,0.256,2.874,0.428,0.518,-0.778,...,-0.68,2.024,3.632,0.43,-0.452,-0.124,2.194,3.28,1.668,1.178


## PCA

In [None]:
pca = PCA(n_components = 10)
display(pd.DataFrame(pca.fit_transform(normalized_PCA_df)))

In [85]:
# Normalisation 
normalized_PCA_df = PCA_df - PCA_df.mean(axis=0)

# compute the covariance matrix of the standardized data
covariance_matrix = np.cov(normalized_PCA_df.T)

# compute the eigenvectors and eigenvalues of the covariance matrix
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)

# sort the eigenvalues in descending order
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# select the principal components to keep
total_variance = np.sum(sorted_eigenvalues)
variance_explained = sorted_eigenvalues / total_variance
cumulative_variance = np.cumsum(variance_explained)

# keep the principal components that explain 95% of the variance
#num_components = np.argmax(cumulative_variance >= 0.95) + 1
num_components = normalized_PCA_df.shape[1]
principal_components = sorted_eigenvectors[:, :num_components]

# project the data onto the selected principal components
transformed_data = np.dot(normalized_PCA_df, principal_components)

X = pd.DataFrame(transformed_data)
X.columns = normalized_PCA_df.columns
X.index += 1
display(X)

U_ID,16272,57633,303948,305344,322009,387418,491531,504620,507603,525356,...,2147527,2237185,2238060,2291306,2297136,2439493,2457095,2537543,2606799,2625420
1,0.661080,-1.190004,-3.173268,0.133291,1.174563,1.642050,-3.879779,-0.266048,-0.968725,5.590568,...,-0.056815,1.269757,-0.133524,0.209999,-0.468660,0.043347,0.639017,0.694079,0.553885,0.564841
2,-5.204676,1.471293,-1.383176,1.781137,-0.610600,-0.968898,-0.030702,-0.488112,0.521751,0.418734,...,0.194638,0.265372,0.237253,-0.368028,0.613349,-0.258188,0.050294,0.402809,-0.276737,-0.342761
3,1.642289,-6.143146,3.914321,-0.082383,-3.401981,-1.567582,-0.243515,1.718900,-1.089269,-0.970254,...,-0.654558,1.354584,0.902368,0.758740,0.035609,0.295847,0.130501,0.112355,0.603643,-0.317888
4,-6.104408,-0.048418,-0.476455,0.055741,0.776943,0.278967,0.268459,0.344925,-0.508780,-0.567231,...,-0.054681,0.621964,-0.242790,-0.053701,-0.523780,-0.605598,-0.130538,0.337140,0.302481,-0.052176
5,-3.797008,2.105373,-0.669235,-0.065920,-2.209283,0.392270,-1.086059,-1.963380,-1.538563,1.453653,...,0.653580,-0.098608,-0.474414,0.313007,-0.480957,0.108077,0.031625,0.556284,0.064374,0.557985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,-5.726406,1.703361,-0.849463,0.799656,1.230859,-0.702149,0.060258,0.757908,1.725790,-0.163327,...,0.321442,-0.017357,0.734278,0.233627,0.677498,-0.194720,0.128211,0.611326,-0.312336,0.777739
497,-1.936991,-4.546374,0.126931,-0.316129,1.181818,1.563495,0.485917,-1.055289,3.795496,-0.827178,...,0.211769,-1.422662,-1.305801,-0.296030,0.514083,0.590660,0.034730,0.152745,0.150093,0.207745
498,-6.277217,-0.599357,0.422103,-2.540633,3.682342,-1.134339,-1.160738,1.339314,0.685377,-0.846438,...,-0.468199,-0.983821,-0.858950,-1.134350,1.596851,0.202609,0.322568,-1.013532,1.062520,0.584658
499,6.997943,0.464158,-0.636984,-5.526613,0.549820,1.604642,0.803876,-0.284566,-0.236875,1.451385,...,-0.694460,0.371704,0.380359,-0.247091,0.991853,-0.216694,-0.253642,0.479685,-0.251681,0.284037


In [94]:
from scipy.spatial.distance import pdist, squareform

# compute pairwise Euclidean distance between all rows of X
dist_matrix = squareform(pdist(X, metric='euclidean'))

# print the distance matrix
print(dist_matrix)


[[ 0.         14.59451952 18.02775638 ... 16.37070554 15.45962483
  15.68438714]
 [14.59451952  0.         15.49193338 ... 10.90871211 17.94435844
  11.44552314]
 [18.02775638 15.49193338  0.         ... 17.11724277 16.85229955
  18.13835715]
 ...
 [16.37070554 10.90871211 17.11724277 ...  0.         17.4642492
  14.28285686]
 [15.45962483 17.94435844 16.85229955 ... 17.4642492   0.
  18.08314132]
 [15.68438714 11.44552314 18.13835715 ... 14.28285686 18.08314132
   0.        ]]
