# Data Mining: Exercise 5.2 (All images analysis)
Michael Quinlan
SN: 6901824

## **Research Question**
Can we predict the popularity of a movie (i.e., the IMDB rating of a movie) solely on the basis of its poster?

In [6]:
# load all needed dependencies and libraries
!pip install wget
import io
import os
import cv2
import wget
import urllib
#import webcolors
import colorgram
import numpy as np
import pandas as pd
import matplotlib as mpl
import tensorflow
from tqdm.notebook import tqdm
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
from matplotlib.colors import to_hex
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
%config InlineBackend.figure_format='retina'



In [8]:
# change working directory and define path to movie poster images stored on local drive

#os.chdir(r"C:\DMTIV\Modules\Week5Images\output")
ds_path = r'MovieGenre.csv'

In [9]:
# define a function to extract a sample of 500 movie poster images from local drive and
# creates and returns a dataframe to store metadata on movie posters

"""
Input: file path to .csv file with stored images of movie posters
Output: dataframe containing metadata on images of movie posters
"""

def get_movie_poster_sample(csv_path, output_dir, sample_size=500):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    df = pd.read_csv(csv_path, sep=',', engine='python').dropna()
    sample = df.sample(sample_size, random_state=42)
    def_rows = []
    output_paths = []
    for row in tqdm(sample.itertuples(), total=sample_size):
        poster = row.Poster
        poster = poster[:poster.rfind('@')]
        poster_high_res = poster + '@._V1_FMjpg_UX500_.jpg'
        try:
            output_path = os.path.join(output_dir, str(row.imdbId) + '.jpg')
            if not os.path.isfile(output_path): # check to see if file exists
                urllib.request.urlretrieve(poster_high_res, output_path) # go to url to retrieve poster
        except:
            continue
        output_paths.append(output_path)
        def_rows.append(row)
    final_sample_df = pd.DataFrame(def_rows)
    final_sample_df['Path'] = output_paths
    final_sample_df = final_sample_df.rename(columns={'_4': 'IMDB_rating', '_2': 'Uri'})
    return final_sample_df 

In [10]:
# call the function to retrieve movie poster images from local drive and store in dataframe

movie_poster_df = get_movie_poster_sample(ds_path, r'MoviePosters')

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [11]:
# define a function that retrieves image from local drive

def load_image_from_path(image_path, target_size=None, color_mode='rgb'):
    pil_image = image.load_img(image_path, 
                               target_size=target_size,
                            color_mode=color_mode)
    return image.img_to_array(pil_image)

In [12]:
# extract movie titles from dataframe

"""
input: movie data frame, title column
output: list of movie names for each movie poster image
"""
movie_titles = []

for path in range(0, len(movie_poster_df)): # iterate over all rows in movie dataframe
    movie_title = movie_poster_df.Title[path] # extract movie titles
    movie_titles_split = movie_title.split("(") # split title from date
    movie_titles.append(movie_titles_split[0]) # store titles in list

In [13]:
len(movie_titles)

473

In [14]:
# retrieve face detection model fron github and store as 'face_model'

face_model_url = r"https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml"
face_model = wget.download(face_model_url) # store face model in variable

In [15]:
# define a function that uses face detection model to identify number of faces in each 
# movie poster image and storse the result in a list

"""
Input: list of file pathnames for each movie poster image stored on local drive
Output: list that contains the number of identified faces in each movie poster
"""

def multiProcessing(pathnames, model):
    img_path = [] # create empty list of image paths
    n_faces_list = [] # create empty list of faces
    
    face_classification = cv2.CascadeClassifier(face_model) # load the classifier (models only need to be loaded once)
    
    for path in tqdm(pathnames):              
        pre_image = load_image_from_path(path, color_mode='grayscale')
        gray_image = np.squeeze(pre_image).astype('uint8')
        faces = face_classification.detectMultiScale(gray_image, 1.3, 5) # detect the faces 
        n_faces = len(faces) # get the number of faces       
        n_faces_list.append(n_faces) # add number of faces for each movie poster image to list
        
        path_last_part = os.path.basename(path)
        img_path.append(path_last_part)     
        
    return n_faces_list   

In [16]:
# call the face detection model
# instantiate pathnames variable with all movie poster file pathnames

pathnames = movie_poster_df.Path
n_faces_list = multiProcessing(pathnames, face_model) # store number of faces per image in a list 

HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))




In [17]:
# upload the gender identification model and assign it to a variable

gender_url = r"https://github.com/oarriaga/face_classification/raw/master/trained_models/gender_models/gender_mini_XCEPTION.21-0.95.hdf5"
gender_model = wget.download(gender_url)

In [18]:
# define a function that identifies bounding boxes of faces

def apply_offsets(face_coordinates, offsets):
    """
    Derived from https://github.com/oarriaga/face_classification/blob/
    b861d21b0e76ca5514cdeb5b56a689b7318584f4/src/utils/inference.py#L21
    """
    x, y, width, height = face_coordinates
    x_off, y_off = offsets
    return (x - x_off, x + width + x_off, y - y_off, y + height + y_off)

In [19]:
# call above function to create bounnding boxes around faces
# use face_model and gender_model to identify gender of faces detected in each movie poster image
# store number of males and females in separate lists

face_classification = cv2.CascadeClassifier(face_model) # load the classifier (models only need to be loaded once)
gender_classifier = load_model(gender_model)

GENDER_OFFSETS = (10, 10)
INPUT_SHAPE_GENDER = gender_classifier.input_shape[1:3]

labels = ['woman', 'man']

n_males_list = []
n_females_list = []

for path in tqdm(pathnames):              
    pre_image = load_image_from_path(path, color_mode='grayscale')
    gray_image = np.squeeze(pre_image).astype('uint8')
    faces = face_classification.detectMultiScale(gray_image, 1.3, 5) # detect the faces
    gender_list = []
    
    for face_coordinates in faces: # using the output of the CascadeClassifier
        x1, x2, y1, y2 = apply_offsets(face_coordinates, GENDER_OFFSETS) # extends the bounding box
        face_img = gray_image[y1:y2, x1:x2] # only get the face 
        try:
            face_img = cv2.resize(face_img, (INPUT_SHAPE_GENDER)) # resize the image
        except:
            continue
        face_img = face_img.astype('float32') / 255.0 # preprocess the image
        face_img = np.expand_dims(face_img, 0) # batch of one
        probas = gender_classifier.predict(face_img) 
        gender_list.append(labels[np.argmax(probas[0])]) # for one image
    count_male = gender_list.count('man')
    count_female = gender_list.count('woman')
    n_males_list.append(count_male)
    n_females_list.append(count_female)



HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))




ValueError: Error when checking input: expected input_1 to have 4 dimensions, but got array with shape (1, 64, 64)

In [20]:
# loop through all images and use model to retrieve median values for
# red, green and blue colors
# store median values for all images in individual lists

median_r_list = []
median_g_list = []
median_b_list = []

for path in tqdm(pathnames):
    pre_image = load_image_from_path(path, color_mode='grayscale')
    gray_image = np.squeeze(pre_image).astype('uint8')
    color_image = load_image_from_path(path, color_mode='rgb')
    img = Image.fromarray(color_image.astype(np.uint8))
    rgb_array = np.median(img, axis=(0,1)) #create numeric array storing median values 
    median_r_list.append(rgb_array[0]) # append median 'red' values for all images in list
    median_g_list.append(rgb_array[1]) # append median 'green' values for all images in list
    median_b_list.append(rgb_array[2]) # append median 'blue' values for all images in list

HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))




In [21]:
df_movie_posters = pd.DataFrame()

df_movie_posters['Movie'] = movie_titles
df_movie_posters['IMDB_Rating'] = movie_poster_df.IMDB_rating
df_movie_posters['n_Faces'] = n_faces_list 
df_movie_posters['n_Men'] = n_males_list
df_movie_posters['n_Women'] = n_females_list
df_movie_posters['Median_Red'] = median_r_list
df_movie_posters['Median_Green'] = median_g_list
df_movie_posters['Median_Blue'] = median_b_list

df_movie_posters

ValueError: Length of values (0) does not match length of index (473)

In [22]:
df_movie_posters.to_csv("MovieDF.csv", encoding='utf-8', index=False)