# Analyzing movie posters using the VGG16 model in Keras

In this notebook I use the VGG16 model as a feature extractor. These features, characterizing each poster, are used to measure similarity at a group level and at an individual level.

## Part 1 - Extracting features using the VGG16

### Load meta data and posters

In [None]:
# libraries
import pandas as pd
import numpy as np

In [None]:
# set wd
# NOTE(review): hard-coded absolute path to one user's machine. Every relative
# file name used later (the CSV, the .npz/.npy/.csv caches, the .png figures)
# resolves against this directory.
import os
os.chdir("/Users/Mads/Documents/Poster Analysis/")

In [None]:
# load full movie dataset
# Only the three listed columns are read; the file is decoded as ISO-8859-1.
movie_data = pd.read_csv("movie_dataset.csv", encoding="ISO-8859-1", usecols=["imdbId", "Title", "Genre"])
# bare expression: the frame is shown as the cell output
movie_data

In [None]:
# generate a list of poster paths for all posters in the folder containing downloaded posters
import os

# setting directory
directory = "/Users/Mads/Documents/AllPosters/"

# One path per directory entry, skipping .DS_Store (the metadata file macOS
# creates in folders). The filter tests the bare file name, so the exclusion
# works whether or not `directory` ends with a path separator; comparing
# against `directory + '.DS_Store'` only matched when it did.
poster_paths = [
    os.path.join(directory, f)
    for f in os.listdir(directory)
    if f != '.DS_Store'
]

len(poster_paths)

In [None]:
# subset for testing
import random
# fixed seed: the same subset is drawn on every run as long as poster_paths
# has the same content and order
random.seed(9)
# sample size; random.sample raises ValueError if poster_paths has fewer entries
n = 5000
r_poster_p = random.sample(poster_paths, n)

In [None]:
# generate a list of movie ids from the list of posters paths
# is used to print posters later

import os
import re

# Exactly one id per poster, taken from the file name only, so that
# poster_ids[i] always describes poster_paths[i]. Collecting every digit run
# in the full path and flattening the matches adds extra entries (digits in a
# folder name, several digit runs in one file name) or drops entries (no
# digits at all), which shifts every later index. A file name without digits
# gets None so that it still occupies its slot.
poster_ids = []

for path in poster_paths:
    id_match = re.search(r'\d+', os.path.basename(path))
    poster_ids.append(id_match.group() if id_match else None)

In [None]:
# position of id '113277' in poster_ids; list.index raises ValueError if the id is absent
poster_ids.index('113277')

In [None]:
# generate a list of movie ids from the list of posters paths
# is used to print posters later

import os
import re

# Exactly one id per sampled poster, taken from the file name only, so that
# sub_ids[i] always describes r_poster_p[i]. Collecting every digit run in the
# full path and flattening the matches can add or drop entries and shift every
# later index. A file name without digits gets None so that it still occupies
# its slot.
sub_ids = []

for path in r_poster_p:
    id_match = re.search(r'\d+', os.path.basename(path))
    sub_ids.append(id_match.group() if id_match else None)

In [None]:
# position of id '113277' in the sampled subset; raises ValueError if that poster was not sampled
sub_ids.index('113277')

### Preprocess all movie posters

Poster images are preprocessed using the built-in functions of the Keras library.

In [None]:
# preprocessing each movie poster
import numpy as np
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image as kimage

import os

pre_img_list = []
n = 0

for poster_path in r_poster_p:
    # load the poster resized to 224x224
    poster = kimage.load_img(poster_path, target_size=(224, 224))
    # image -> array, add a leading batch axis, then apply the VGG16 input preprocessing
    batch = np.expand_dims(kimage.img_to_array(poster), axis=0)
    pre_img_list.append(preprocess_input(batch))
    # running count of processed posters, printed as progress
    n += 1
    print(n)

### Extract a feature vector for all posters

Note that the top layers of the model are not included. By default, the network has three fully connected layers and an output layer with 1000 categories. In this case, we are interested in using the model to extract features from the movie posters, leaving the pretrained weights unchanged. Therefore, we exclude the last layers and only use the features represented by the last max pooling layer.

In [None]:
# loading model
from keras.applications import VGG16
# include_top=False leaves out the top (classifier) layers;
# weights='imagenet' requests the ImageNet-pretrained weights
model = VGG16(include_top = False, weights='imagenet')

# uncomment to inspect the architecture / output tensors
#model.summary()
#model.outputs

In [None]:
# prepare a prediction list and the output matrix
# 25088 columns: the length every flattened prediction must have to fit a row
n_posters = len(r_poster_p)
prediction = [0] * n_posters
feat_matrix = np.zeros([n_posters, 25088])

# get features for each poster
for i in range(n_posters):
    # flatten the model output for poster i into a single feature row
    features_i = model.predict(pre_img_list[i]).ravel()
    prediction[i] = features_i
    feat_matrix[i, :] = features_i
    print(i)

In [None]:
# convert to sparse matrix
from scipy import sparse
# NOTE(review): built from the same `feat_matrix` as `sub_feat_csr` in the
# SUBSET cell; which posters it covers depends on what the extraction loop
# was last run on.
feat_csr = sparse.csr_matrix(feat_matrix)

In [None]:
# save full sparse matrix to disk
from scipy import sparse
# relative file name: written to the current working directory
sparse.save_npz('feat_csr.npz', feat_csr)

In [None]:
# SUBSET convert to sparse matrix
from scipy import sparse
# NOTE(review): reads the same `feat_matrix` as the full-matrix cell; it is
# the subset only if the extraction loop was run on the sampled posters.
sub_feat_csr = sparse.csr_matrix(feat_matrix)

In [None]:
# SUBSET save sparse matrix to disk
from scipy import sparse
# relative file name: written to the current working directory
sparse.save_npz('sub_feat_csr.npz', sub_feat_csr)

In [None]:
# load sparse matrix
from scipy import sparse
# replaces any in-memory feat_csr with the copy stored on disk
feat_csr = sparse.load_npz('feat_csr.npz')
# bare expression: shows the matrix summary as the cell output
feat_csr

In [None]:
# SUBSET load sparse matrix
from scipy import sparse
# replaces any in-memory sub_feat_csr with the copy stored on disk
sub_feat_csr = sparse.load_npz('sub_feat_csr.npz')
# bare expression: shows the matrix summary as the cell output
sub_feat_csr

In [None]:
# convert sparse matrix back to dense
from scipy import sparse
# toarray() returns a plain numpy.ndarray. todense() returns a numpy.matrix,
# which recent scikit-learn releases refuse as estimator input and which
# changes the meaning of operators such as `*` and of indexing.
sub_feat = sub_feat_csr.toarray()

## Part 2 - Clustering: measuring similarity at group level

Similarities at a group level can now be measured using the k-means clustering algorithm. Since this is a case of unsupervised clustering with no ground truth, I will be using the inertia value and the Silhouette coefficient as performance measures.

In [None]:
# k-means clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics

# 10 clusters; every other KMeans setting is left at the library default
kmeans = KMeans(n_clusters=10)
norm = Normalizer()
# normalise each sample first, then cluster
pipeline = make_pipeline(norm,kmeans)
# fit on the sparse subset features and keep one cluster label per poster
norm_clusters = pipeline.fit_predict(sub_feat_csr)

In [None]:
# plotting balance of clusters
import matplotlib.pyplot as plt
# histogram of the cluster labels, i.e. how many posters fall in each cluster
# (default binning; no explicit bins are passed)
plt.hist(norm_clusters)
plt.show()

In [None]:
# k-means
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn import preprocessing

# normalise the dense subset features, then cluster the normalised rows
norm_feat = preprocessing.normalize(sub_feat)
kmeans = KMeans(n_clusters=10, init='k-means++').fit(norm_feat)
# one cluster label per row of norm_feat
labels = kmeans.labels_

In [None]:
# metrics
from sklearn import metrics
labels = kmeans.labels_
# Score the clustering in the space it was fitted in: k-means ran on the
# normalised rows (norm_feat), so the silhouette must be measured on those
# same rows. Scoring against the raw sub_feat pairs labels from one geometry
# with distances from another.
print(metrics.silhouette_score(norm_feat, labels, metric='euclidean'))
print(kmeans.inertia_)

In [None]:
# kmeans with a range of k's and saving inertia and silhouette score
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn import metrics


ks = range(5,26)
inertias = []
sils = []
norm = Normalizer()

# The rows exactly as k-means sees them inside the pipeline. The silhouette
# has to be measured on these, not on the raw sub_feat: the labels come from
# a clustering of the normalised rows. Computed once, since it does not
# depend on k.
norm_rows = norm.fit_transform(sub_feat)

for k in ks:
    kmeans = KMeans(n_clusters = k)
    pipeline = make_pipeline(norm,kmeans)
    pipeline.fit(sub_feat)
    inertia = pipeline.named_steps['kmeans'].inertia_
    inertias.append(inertia)
    labels = kmeans.labels_
    sils.append(metrics.silhouette_score(norm_rows, labels, metric='euclidean'))
    # progress: the k that just finished
    print(k)

In [None]:
# save inertia list
import numpy
# one value per line; written to the current working directory
numpy.savetxt("file_inertias.csv", inertias, delimiter=",")

In [None]:
# load inertia list
import numpy
# replaces the in-memory list with the values stored on disk (returned as a numpy array)
inertias = numpy.loadtxt("file_inertias.csv", delimiter=",")

In [None]:
# save sils list
import numpy
# one value per line; written to the current working directory
numpy.savetxt("file_sils.csv", sils, delimiter=",")

In [None]:
# load sils list
import numpy
# replaces the in-memory list with the values stored on disk (returned as a numpy array)
sils = numpy.loadtxt("file_sils.csv", delimiter=",")

In [None]:
# Plotting inertia for a range of ks
ks = range(5,26)
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards,
# so setting it after plotting never affected this figure.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks,inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
# Save before show(): once the figure has been shown, savefig() no longer
# sees it and writes an empty canvas.
plt.savefig('inertia.png')
plt.show()


In [None]:
# Plotting Silhouette Score for a range of ks
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks, sils, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('Silhouette Coefficient')
plt.xticks(ks)
# Save before show() so the file contains the plot, and under its own name:
# writing to 'inertia.png' overwrote the inertia figure.
plt.savefig('silhouette.png')
plt.show()

## Part 3 - Dimensionality reduction using PCA and NMF


### PCA

In an attempt to overcome the curse of dimensionality, PCA is implemented to reduce the dimensions of the data from 25088 to a smaller number of principal components.

- This results in an increase in computational efficiency, however, the Silhouette score remains close to 0, indicating poor clustering.

- Increasing the number of components until explained variance is near 100% does not improve clustering performance either.

In [None]:
# PCA for dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

norm = Normalizer()
# keep the first 50 principal components
pca = PCA(n_components = 50)
# normalise each sample, then project; `pca` is fitted as part of this call
pipeline = make_pipeline(norm, pca)
pca_comp = pipeline.fit_transform(sub_feat)

In [None]:
# k-means
from sklearn import preprocessing
from sklearn.cluster import KMeans

# cluster the PCA-reduced features into 10 groups
kmeans = KMeans(n_clusters=10, init='k-means++').fit(pca_comp)
# one cluster label per row of pca_comp
labels = kmeans.labels_

In [None]:
# metrics
from sklearn import metrics
# Score the clustering in the space it was fitted in: k-means ran on the PCA
# components (pca_comp), so the silhouette is measured on those rows rather
# than on the raw sub_feat. The score is computed once; the extra unprinted
# call was dropped because its result was discarded.
print(metrics.silhouette_score(pca_comp, labels, metric='euclidean'))
print(kmeans.inertia_)

In [None]:
# explained variance
# per-component share of the variance, as reported by the fitted `pca`
pca_variance = pca.explained_variance_ratio_
# total share of variance captured by all kept components
sum(pca_variance)

In [None]:
# plot of explained variance for the PCA components
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards,
# so setting it after show() never affected this figure.
plt.rcParams["figure.figsize"] = [10,5]

features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_)
plt.xlabel('PCA feature')
plt.ylabel('Explained variance')
plt.xticks(features)
plt.show()

In [None]:
# kmeans with a range of k's and saving inertia and silhouette score
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn import metrics

ks = range(5,26)
inertias_pca = []
sils_pca = []
norm = Normalizer()
pca = PCA(n_components = 50)

# Normalise and project once: neither step depends on k, so refitting PCA on
# the same data in every iteration only cost time. Every k now clusters the
# same projection, which keeps the inertias comparable.
pipeline = make_pipeline(norm,pca)
pca_rows = pipeline.fit_transform(sub_feat)

for k in ks:
    kmeans = KMeans(n_clusters = k,init='k-means++').fit(pca_rows)
    inertia = kmeans.inertia_
    inertias_pca.append(inertia)
    labels = kmeans.labels_
    # silhouette in the space k-means was fitted in (the PCA components),
    # not on the raw sub_feat
    sils_pca.append(metrics.silhouette_score(pca_rows, labels, metric='euclidean'))
    # progress: the k that just finished
    print(k)

In [None]:
# Plotting inertia over a range of k
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks,inertias_pca, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
# Save before show() so the file contains the plot, and under its own name
# so it does not overwrite the inertia figure of the unreduced features.
plt.savefig('inertia_pca.png')
plt.show()

In [None]:
# silhouette scores of the PCA sweep, one per k in ks
print(sils_pca)

In [None]:
# Plotting Silhouette Score over a range of k
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks, sils_pca, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('Silhouette Coefficient')
plt.xticks(ks)
# Save before show() so the file contains the plot, and under its own name:
# writing to 'inertia.png' overwrote the inertia figure.
plt.savefig('silhouette_pca.png')
plt.show()

### NMF
Since PCA did not improve clustering performance, NMF was implemented as a second attempt. Since the values of the matrix are all non-negative, this is an approach which, compared to PCA, preserves the interpretability of the components. For instance, NMF represents images as combinations of common patterns (Wilson, 2018). Hence, even though the raw image features are abstract, the NMF components can be seen as topics or clusters. 

Result: NMF did not improve clustering performance either.

In [None]:
# NMF
from sklearn.decomposition import NMF
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# 50 NMF components
nmf = NMF(n_components = 50)
norm = Normalizer()
# note the order: factorise first, then normalise the resulting component
# rows (the PCA cell normalises before reducing)
pipeline = make_pipeline(nmf, norm)
nmf_feat = pipeline.fit_transform(sub_feat)

In [None]:
# k-means
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import KMeans

# cluster the normalised NMF features into 10 groups
kmeans = KMeans(n_clusters=10, init='k-means++').fit(nmf_feat)
# one cluster label per row of nmf_feat
labels = kmeans.labels_

In [None]:
# metrics
# Score the clustering in the space it was fitted in: k-means ran on the NMF
# features (nmf_feat), so the silhouette is measured on those rows rather
# than on the raw sub_feat.
print(metrics.silhouette_score(nmf_feat, labels, metric='euclidean'))
print(kmeans.inertia_)

In [None]:
# kmeans with a range of k's and saving inertia and silhouette score
from sklearn.cluster import KMeans
from sklearn import metrics

ks = range(5,26)
inertias_nmf = []
sils_nmf = []

for k in ks:
    kmeans = KMeans(n_clusters = k).fit(nmf_feat)
    inertia = kmeans.inertia_
    inertias_nmf.append(inertia)
    labels = kmeans.labels_
    # silhouette in the space k-means was fitted in (the NMF features),
    # not on the raw sub_feat
    sils_nmf.append(metrics.silhouette_score(nmf_feat, labels, metric='euclidean'))
    # progress: the k that just finished
    print(k)

In [None]:
# Plotting inertia over a range of k
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks,inertias_nmf, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
# Save before show() so the file contains the plot, and under its own name
# so it does not overwrite the other inertia figures.
plt.savefig('inertia_nmf.png')
plt.show()

In [None]:
# silhouette scores of the NMF sweep, one per k in ks
print(sils_nmf)

In [None]:
# Plotting Silhouette Score over a range of k
import matplotlib.pyplot as plt
# Set the size first: rcParams only applies to figures created afterwards.
plt.rcParams["figure.figsize"] = [12,6]
plt.plot(ks, sils_nmf, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('Silhouette Coefficient')
plt.xticks(ks)
# Save before show() so the file contains the plot, and under its own name:
# writing to 'inertia.png' overwrote the inertia figure.
plt.savefig('silhouette_nmf.png')
plt.show()

## Part 4 - Poster to poster similarity

### Cosine similarity matrix


In [None]:
# function to calculate cosine similarity matrix
import numpy as np
def cosine_similarity(features):
    """Return the pairwise cosine similarity between the rows of `features`.

    Parameters
    ----------
    features : 2-D numpy array or scipy sparse matrix
        One sample per row. Anything supporting `.dot` and `.T` works; a
        non-ndarray product is expected to offer `.toarray()`.

    Returns
    -------
    Dense (n_rows, n_rows) array whose entry [i, j] is the cosine of the
    angle between row i and row j. Rows that are entirely zero have
    similarity 0 with every row, themselves included, instead of NaN.
    """
    # Gram matrix: entry [i, j] is the dot product of rows i and j
    sim = features.dot(features.T)
    if not isinstance(sim, np.ndarray):
        # sparse input gives a sparse product; densify it for the division
        sim = sim.toarray()
    # the diagonal holds the squared row norms
    norms = np.sqrt(np.diagonal(sim))
    # An all-zero row has norm 0 and its dot products are all 0; dividing by
    # 1 instead of 0 leaves those entries at 0 rather than producing 0/0 = NaN
    # across the whole row and column.
    norms[norms == 0] = 1.0
    row_scale = norms[np.newaxis, :]
    return (sim / row_scale / row_scale.T)

In [None]:
# calculate similarity matrix
# result is a dense square matrix with one row and one column per row of feat_csr
sim = cosine_similarity(feat_csr)

In [None]:
# save sim matrix to disk
import numpy as np
# np.save appends the .npy extension, so this writes 'sim_file.npy'
np.save('sim_file',sim)

In [None]:
# import sim matrix
import numpy as np
# replaces any in-memory `sim` with the copy stored on disk
sim = np.load('sim_file.npy')

In [None]:
# convert to dataframe
import pandas as pd
# default integer row/column labels, i.e. the positions in `sim`
sim_df = pd.DataFrame(sim)
sim_df.shape

In [None]:
# find a given movie using the imdb ID get the index in the ID list.
# list.index raises ValueError if the id is not in poster_ids
poster_ids.index('848228')

In [None]:
# Display posters with highest cosine similarity
# NOTE(review): 36709 is a hard-coded row label; presumably the value returned
# by the poster_ids.index(...) lookup cell, but confirm. It is only valid
# while the row order of `sim` matches the order the id list was built in.
poster = sim_df.loc[36709]
# six largest similarities in that row; presumably the query poster itself is
# among them (self-similarity), leaving five neighbours. TODO confirm
n_sim = poster.nlargest(n = 6)
n_sim_d = pd.DataFrame(n_sim)
n_sim_d

In [None]:
# generate a list of posters paths for the n most similar posters
# the index of n_sim_d holds row positions, used here to look up poster_paths
n_sim_poster = [poster_paths[row] for row in n_sim_d.index]

In [None]:
# print n most similar poster
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

for i in n_sim_poster:
    img = mpimg.imread(i)
    imgplot = plt.imshow(img)
    plt.show()