# Setup

## Imports

In [10]:
import pandas as pd
import chardet
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [24]:
def import_titles():
    """Load the movie title catalogue from ``movie_titles.txt``.

    Only the first three columns of the file are read, and the text is
    decoded as ISO-8859-1. Column labels are whatever pandas infers from
    the file (default header handling).

    Returns
    -------
    pandas.DataFrame
        The three selected columns, one row per line of the file.
    """
    read_options = {
        "encoding": "ISO-8859-1",
        "usecols": [0, 1, 2],  # ignore anything beyond the third column
    }
    return pd.read_csv('movie_titles.txt', **read_options)
    
def import_ratings(n):
    """Load ratings from ``netflix_ratings.txt`` for movie IDs 1 through ``n``.

    The file is read as space-separated ASCII text without a header row;
    its five fields are labelled ``M_ID``, ``M_Year``, ``U_ID``,
    ``U_Rating`` and ``U_Date``.

    Parameters
    ----------
    n : number
        Largest ``M_ID`` to keep (the bound is inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows whose ``M_ID`` lies in ``[1, n]``, keeping their original
        row labels.
    """
    column_names = ["M_ID", "M_Year", "U_ID", "U_Rating", "U_Date"]
    all_ratings = pd.read_csv(
        'netflix_ratings.txt',
        sep=' ',
        header=None,
        names=column_names,
        encoding="ascii",
    )

    # Inclusive on both ends, i.e. 1 <= M_ID <= n.
    movie_ids = all_ratings["M_ID"]
    wanted = (movie_ids >= 1) & (movie_ids <= n)
    return all_ratings[wanted]

In [38]:
# Load the title catalogue and the ratings for movie IDs 1..500 (inclusive).
titles, ratings = import_titles(), import_ratings(500)

In [44]:
# Report the dimensions and column labels of both frames. The "\n" passed
# after the titles columns leaves a blank line between the two summaries.
for shape_label, frame, trailing in (
    ("Shape of titles:", titles, ("\n",)),
    ("Shape of ratings:", ratings, ()),
):
    print(shape_label, frame.shape)
    print("Columns:", frame.columns, *trailing)

Shape of titles: (17770, 3)
Columns: Index(['Movie_ID', 'Year', 'Title'], dtype='object') 

Shape of ratings: (2798704, 5)
Columns: Index(['M_ID', 'M_Year', 'U_ID', 'U_Rating', 'U_Date'], dtype='object')


## Prepare dataframe for PCA

In [60]:
def prepare_PCA_df(n, ratings_df=None):
    """Build the movie-by-user rating matrix used as PCA input.

    The long-format ratings are pivoted so that each row is a movie
    (``M_ID``), each column is a user (``U_ID``) and each cell holds that
    user's ``U_Rating``. Users who rated fewer than ``1/n`` of the movies
    in the matrix are dropped, and the remaining missing ratings are
    replaced by 0.

    Parameters
    ----------
    n : number
        Divisor defining the minimum coverage: a user column is kept only
        if it has at least ``len(matrix) / n`` non-missing ratings.
        Must be positive.
    ratings_df : pandas.DataFrame, optional
        Long-format ratings with ``M_ID``, ``U_ID`` and ``U_Rating``
        columns. When omitted, the notebook-level ``ratings`` frame is
        used, which keeps existing ``prepare_PCA_df(n)`` calls working.

    Returns
    -------
    pandas.DataFrame
        A new movie-by-user frame with no missing values. The input frame
        is not modified.

    Raises
    ------
    ValueError
        If ``n`` is not positive, or (from ``pivot``) if a
        (``M_ID``, ``U_ID``) pair occurs more than once.
    """
    if n <= 0:
        raise ValueError(f"n must be positive, got {n!r}")

    if ratings_df is None:
        # Fall back to the frame loaded earlier in the notebook.
        ratings_df = ratings

    movie_by_user = ratings_df.pivot(index='M_ID', columns='U_ID', values='U_Rating')

    # dropna documents `thresh` as an int. Counts are whole numbers, so the
    # ceiling keeps exactly the same columns as the fractional threshold.
    min_ratings = int(np.ceil(len(movie_by_user) / n))

    return movie_by_user.dropna(axis=1, thresh=min_ratings).fillna(0)

In [61]:
# Keep only the users (columns) who rated at least a quarter of the movies;
# their remaining missing ratings are filled with 0.
pre_PCA = prepare_PCA_df(n=4)

## PCA

In [63]:
def do_PCA(datadrame, variance_threshold=0.95):
    """Project the rows of ``datadrame`` onto its leading principal components.

    The columns are mean-centred (they are not rescaled), the covariance
    matrix of the columns is eigendecomposed, and the smallest number of
    leading components whose cumulative share of the total variance
    reaches ``variance_threshold`` is kept.

    Parameters
    ----------
    datadrame : pandas.DataFrame or 2-D array-like
        Numeric data with one observation per row and one feature per
        column. Needs at least two rows and one column. (The parameter
        name is kept as-is for backward compatibility.)
    variance_threshold : float, default 0.95
        Fraction of the total variance the kept components must explain;
        must satisfy ``0 < variance_threshold <= 1``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (with a fresh default integer index) and one
        column per kept component, ordered by decreasing explained
        variance. The sign of each component is arbitrary.

    Raises
    ------
    ValueError
        If ``variance_threshold`` is out of range or the input is not a
        2-D table with at least two rows and one column.
    """
    if not 0 < variance_threshold <= 1:
        raise ValueError(
            f"variance_threshold must be in (0, 1], got {variance_threshold!r}"
        )

    values = np.asarray(datadrame, dtype=float)
    if values.ndim != 2 or values.shape[0] < 2 or values.shape[1] < 1:
        raise ValueError(
            "do_PCA needs a 2-D input with at least two rows and one column, "
            f"got shape {values.shape}"
        )

    # Centre each column on its mean (no scaling to unit variance).
    centered = values - values.mean(axis=0)

    # np.cov collapses a single feature to a 0-d array; atleast_2d keeps it
    # a (1, 1) matrix so the decomposition below still works.
    covariance_matrix = np.atleast_2d(np.cov(centered, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues/eigenvectors, whereas the general-purpose eig can return
    # complex pairs from round-off when the matrix is rank-deficient.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # Order components by decreasing variance; round-off can make
    # theoretically-zero eigenvalues slightly negative, so clip them.
    order = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = np.clip(eigenvalues[order], 0.0, None)
    sorted_eigenvectors = eigenvectors[:, order]

    total_variance = sorted_eigenvalues.sum()
    if total_variance > 0:
        cumulative_variance = np.cumsum(sorted_eigenvalues) / total_variance
        # Index of the first component at which the cumulative share
        # reaches the threshold; capped in case round-off leaves the final
        # cumulative value a hair below the threshold.
        first_reaching = int(np.searchsorted(cumulative_variance, variance_threshold))
        num_components = min(first_reaching + 1, sorted_eigenvalues.size)
    else:
        # Constant data has no variance to explain; keep a single component
        # instead of dividing by zero.
        num_components = 1

    principal_components = sorted_eigenvectors[:, :num_components]

    # Coordinates of every observation in the kept component basis.
    transformed_data = centered @ principal_components

    return pd.DataFrame(transformed_data)

## Create PCA dataframes

In [67]:
# Split the user columns of pre_PCA into a left and a right half
# (with an odd column count the right half gets the extra column).
midpoint = pre_PCA.shape[1] // 2
df1, df2 = pre_PCA.iloc[:, :midpoint], pre_PCA.iloc[:, midpoint:]

# Run the PCA independently on each half.
pca_1, pca_2 = (do_PCA(half) for half in (df1, df2))

display(pca_1, pca_2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.483385,1.574123,-2.661245,-2.276674,3.283522,-2.590626,-0.858178,2.410833,2.775346,-0.372444,...,-1.382462,0.178120,-1.224044,-1.730874,-0.103345,-1.975272,1.879007,1.005769,0.211213,-1.171026
1,-4.766807,-0.271768,-0.044809,-0.421963,-0.433614,0.629026,-0.193027,0.368495,-0.167958,0.255886,...,0.060476,-0.278286,0.201610,0.600844,0.210900,-0.748572,-0.265460,-1.209955,0.379234,-0.584811
2,0.987577,4.546462,1.596311,4.837517,0.167838,0.947707,-1.705757,0.003598,0.056536,-0.342059,...,-1.163722,-1.630186,0.646309,-0.624860,0.549158,-0.668809,-2.224108,0.166021,-2.533516,-1.184933
3,-4.101608,-0.022484,-0.463794,-0.956680,-0.083218,0.672922,-0.262183,-0.534805,0.725591,-0.530189,...,-0.099348,0.053688,0.760439,0.431391,-0.213301,-1.507732,-0.121687,-0.130369,-1.094284,-0.365438
4,-3.383174,-1.955715,-0.098784,0.210003,0.759653,-0.845733,0.996715,1.768485,-1.464237,0.871122,...,-0.724186,0.203944,-0.679443,0.555562,0.397730,-0.087290,-0.688008,-0.993911,0.436749,-1.079240
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-4.685093,-0.478948,0.186786,-0.190737,-0.509599,0.050607,-0.415202,-0.808407,0.541815,-0.240584,...,-0.193691,-0.254096,-0.711565,-0.120715,-0.334747,-0.011851,-0.447030,-0.187222,0.018207,0.014806
496,-0.586096,4.468874,-2.412566,-0.687255,-1.934497,0.762174,1.868327,-0.285368,-1.369045,-0.167308,...,1.508769,-2.073976,-0.380266,-0.111786,0.702700,-0.241277,0.251670,-0.387638,-0.699422,0.187721
497,-3.880725,0.638265,-0.934222,-0.190972,1.404797,0.010748,-0.856560,-1.190363,0.781351,0.777897,...,0.515024,0.275578,0.262418,0.201255,0.621610,-0.236266,0.689997,-0.722418,2.926721,0.642054
498,3.620985,-1.233448,-3.456682,2.336982,2.071478,-0.347124,2.017690,-1.293540,2.028469,-2.140494,...,1.055089,0.064187,-0.915041,-0.471350,0.329504,0.503437,1.708079,-0.211700,0.423422,0.447887


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.526436,0.151696,2.581266,0.697627,-1.829819,1.108556,0.398293,-1.800158,1.185083,-2.414726,...,2.476722,0.664751,-1.296946,-0.332735,-3.413200,-0.244399,-1.279585,0.244998,1.331062,-0.374427
1,-2.413920,1.520597,2.082305,0.401899,-1.047858,-1.430418,-2.470724,-1.608359,-0.431285,1.030620,...,0.908515,1.109334,0.036547,0.350220,0.760363,-0.161180,0.421894,0.140224,-0.247003,0.550578
2,1.308753,-5.016001,-1.096371,-1.912581,0.177943,-2.217077,0.280501,1.845468,-1.010159,1.020367,...,-1.091067,1.709020,0.575070,0.382515,-1.094096,0.484200,0.256976,0.247772,-0.801977,-0.459427
3,-4.603384,0.094866,0.075104,-0.083286,0.195402,-0.486119,0.037044,-0.281090,-0.534581,-0.551974,...,0.318807,-0.089294,0.133963,-0.055948,0.016799,0.344821,-0.472453,-0.609603,-0.404495,0.379100
4,-1.912283,1.150634,1.153155,-2.803639,-0.673918,-0.938076,-1.089286,-0.590160,0.801714,-0.696162,...,-0.642465,0.488800,-0.060371,1.003050,1.648522,0.885549,-1.714744,0.014666,-0.106609,-0.873824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,-3.296766,1.705478,0.686569,2.482625,-0.670608,0.312074,-0.144591,-0.882616,-0.772805,0.530946,...,0.743808,0.040293,-0.583769,0.130097,0.356980,-0.270887,0.478319,0.863745,0.125546,-0.047179
496,-2.301476,-1.421701,-1.494928,1.703994,-0.448387,0.801405,0.168239,0.920500,0.293058,-3.025168,...,-1.100016,1.275937,1.027638,1.838218,-1.516327,-1.886152,-0.489241,-1.036407,0.437548,-1.411399
497,-5.139353,0.097094,-2.073024,3.084743,-0.631693,-0.325753,0.560490,0.864933,0.143933,-0.774864,...,-0.451982,-1.374591,0.975128,-1.071194,0.383309,1.688406,1.151992,0.728371,-0.950995,-0.209423
498,6.424219,0.195248,-2.945363,0.403376,1.092176,1.049340,1.091420,0.563229,-0.206420,-3.173488,...,1.034732,-0.227710,-0.635060,-1.805488,-1.694272,-0.248970,0.536519,-1.128054,1.687412,1.339133
