# Image Cleaning 

## Jade Benson

In this notebook, I demonstrate how the movie posters were scraped from links and prepared for analysis. 

In [None]:
import pandas as pd 
import numpy as np 
from skimage.io import imshow, imsave, imread
from skimage.color import rgb2hsv
from PIL import Image
import random 
import requests


In [None]:
# Movie metadata: links to each movie's Wikipedia poster plus its plot summary
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
movies_df.shape[0] #15389

In [None]:
#poster scraping 

#can directly download posters without saving locally 

import requests

def collect_posters(image_url, array_list, timeout=30):
  """Download one poster and append it to ``array_list``.

  On success the image is opened with PIL, converted to RGB and appended as
  a NumPy array. If the server answers with a non-200 status, the request
  fails (connection error, timeout, malformed URL) or PIL cannot read the
  bytes, a message is printed and ``None`` is appended instead. In all of
  those cases ``array_list`` grows by exactly one entry, so it stays aligned
  with the sequence of links being processed. Other exception types still
  propagate.

  Parameters
  ----------
  image_url : str
      Address of the poster image.
  array_list : list
      Mutated in place; receives the RGB array or ``None``.
  timeout : float or tuple, optional
      Passed to ``requests.get`` so one unresponsive server cannot stall the
      whole scrape. Defaults to 30 seconds.

  Returns
  -------
  None
  """
  poster_array = None

  try:
    # The context manager releases the streamed connection even if decoding
    # fails part-way through.
    with requests.get(image_url, stream=True, timeout=timeout) as r:
      if r.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Decode while the stream is still open; PIL reads from r.raw here.
        poster_rgb = Image.open(r.raw).convert('RGB')
        poster_array = np.array(poster_rgb)
  except (requests.RequestException, OSError):
    # RequestException: network problems, timeouts, invalid URLs.
    # OSError: raised by PIL for unidentifiable or truncated image data.
    poster_array = None

  if poster_array is None:
    print('Image Couldn\'t be retreived')

  array_list.append(poster_array)


In [None]:
poster_links = movies_df['Poster'].tolist()

# One entry per movie, in row order: an RGB array, or None when no poster is available
poster_array_list = []

for p in poster_links:

  # a missing poster link shows up as a float NaN; anything else is handed to the downloader
  if not isinstance(p, float) or not np.isnan(p):
    collect_posters(p, poster_array_list)

  else:
    poster_array_list.append(None)


In [None]:
# Keep only the posters that were actually collected, each wrapped in its own
# one-element list
no_nones = [[poster] for poster in filter(lambda entry: entry is not None, poster_array_list)]

In [None]:
# Join the one-poster wrappers along the first axis (the default) so the models get a single array.
# NOTE(review): np.concatenate needs every poster to have identical dimensions — confirm the scraped images are uniform.
formatted_posters = np.concatenate(no_nones)


In [None]:
# Write the stacked posters to disk (np.save adds the ".npy" extension itself)
np.save(file='formatted_posters', arr=formatted_posters, allow_pickle=True)

Now we need to work out which observations to drop from the dataframe, using the positions of the missing (`None`) entries in `poster_array_list`.

In [None]:
# Flag every entry of poster_array_list: 1 where a poster was collected, None where it is missing
poster_indicator = [None if x is None else 1 for x in poster_array_list]
        

In [None]:
# Attach the flag as a new column. poster_indicator has one entry per link taken
# from movies_df['Poster'], in the same order, so it lines up with the rows.
movies_df["poster_indicator"] = poster_indicator


In [None]:
# Keep only the rows whose poster flag is present, i.e. a poster was collected

movies_and_posters = movies_df.loc[movies_df['poster_indicator'].notna()]
movies_and_posters.shape[0] #13685 (about 2,000 couldn't be collected)

In [None]:
# Save the filtered metadata; the dataframe index is written as the first column
movies_and_posters.to_csv('movies_and_posters.csv', index=True)