In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
from dateutil.parser import parse
from datetime import datetime
import pandas as pd
import re
import numpy as np

In [2]:
def tryGetGenre(bsobj):
    """Return the genre names found on a parsed page.

    Collects the text of every ``<span class="itemprop" itemprop="genre">``
    element.

    Parameters:
        bsobj: parsed page exposing ``findAll`` (a BeautifulSoup object in
            this notebook).

    Returns:
        A list of genre strings (empty when the page has no matching
        elements), or ``np.nan`` when the lookup itself fails.
    """
    try:
        genreTags = bsobj.findAll('span', {'class': 'itemprop', 'itemprop': 'genre'})
        return [genre.getText() for genre in genreTags]
    except Exception:
        # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
        return np.nan
    
def tryGetTime(bsobj):
    """Return the running-time text found on a parsed page.

    Reads the first ``<time itemprop="duration">`` element and strips the
    surrounding whitespace from its text.

    Parameters:
        bsobj: parsed page exposing ``find`` (a BeautifulSoup object in this
            notebook).

    Returns:
        The stripped text, or ``np.nan`` when the element is absent or the
        lookup fails.
    """
    try:
        durationTag = bsobj.find('time', {'itemprop': 'duration'})
        return durationTag.getText().strip()
    except Exception:
        # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
        return np.nan

def tryGetDate(bsobj):
    """Return the release date text found on a parsed page.

    Locates the ``<h4>`` whose text is exactly ``'Release Date:'`` and
    searches the text of its parent element for a date.

    Parameters:
        bsobj: parsed page exposing ``findAll`` (a BeautifulSoup object in
            this notebook).

    Returns:
        The first span running from a digit to a four-digit number
        (e.g. ``'2 May 2014'``), or failing that the first bare four-digit
        number (e.g. ``'1999'``). ``np.nan`` when the header is missing, no
        date is found, or the lookup fails.
    """
    try:
        releaseHeaders = [h4 for h4 in bsobj.findAll('h4') if h4.getText() == 'Release Date:']
        releaseText = releaseHeaders[0].parent.getText()
        # Prefer a full "day month year" span; fall back to a lone year.
        fullDates = re.findall(r'[0-9]+.*[0-9]{4}', releaseText)
        bareYears = re.findall(r'[0-9]{4}', releaseText)
        return (fullDates + bareYears)[0]
    except Exception:
        # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
        return np.nan

def tryGetRating(bsobj):
    """Return the rating text found on a parsed page.

    Reads the first ``<span itemprop="ratingValue">`` element.

    Parameters:
        bsobj: parsed page exposing ``find`` (a BeautifulSoup object in this
            notebook).

    Returns:
        The element's text exactly as it appears (a string, not converted to
        a number), or ``np.nan`` when the element is absent or the lookup
        fails.
    """
    try:
        ratingTag = bsobj.find('span', {'itemprop': 'ratingValue'})
        return ratingTag.getText()
    except Exception:
        # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
        return np.nan
    
def getMovieData(movieUrl, timeout=30):
    """Download one movie page and extract its date, rating, length and genres.

    Prints the URL before fetching so progress is visible while the function
    is mapped over many rows.

    Parameters:
        movieUrl: address of the page to download.
        timeout: seconds to wait on the connection before giving up, so a
            single stalled request cannot hang the whole run.

    Returns:
        A dict with the keys ``'date'``, ``'rating'``, ``'time'`` and
        ``'genres'`` (each value comes from the matching ``tryGet*`` helper),
        or an empty dict when the page cannot be downloaded or parsed.
    """
    print(movieUrl)
    try:
        # The context manager closes the HTTP response once parsing is done.
        with urlopen(movieUrl, timeout=timeout) as response:
            bsobj = BeautifulSoup(response, 'lxml')
    except Exception:
        # Best effort: a failed fetch yields {} so the caller can keep going.
        return {}
    return {
        'date': tryGetDate(bsobj),
        'rating': tryGetRating(bsobj),
        'time': tryGetTime(bsobj),
        'genres': tryGetGenre(bsobj),
    }


In [3]:
# Load the table of movies to enrich; its 'Movie Url' column drives the scrape below.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
movieDataFrame = pd.read_csv('/Users/Joe/Desktop/movies.csv')

In [4]:
# Fetch and parse one page per row (one HTTP request each, so this cell is slow).
# Each element is the dict returned by getMovieData, which is {} when the fetch failed.
movieData = movieDataFrame['Movie Url'].map(getMovieData)

http://www.imdb.com/title/tt1872181/
http://www.imdb.com/title/tt0114369/
http://www.imdb.com/title/tt0114709/
http://www.imdb.com/title/tt0137523/
http://www.imdb.com/title/tt0120915/
http://www.imdb.com/title/tt0357413/
http://www.imdb.com/title/tt0499549/
http://www.imdb.com/title/tt1228705/
http://www.imdb.com/title/tt1104001/
http://www.imdb.com/title/tt0454876/
http://www.imdb.com/title/tt1636826/
http://www.imdb.com/title/tt1735898/
http://www.imdb.com/title/tt1397280/
http://www.imdb.com/title/tt1232200/
http://www.imdb.com/title/tt1351685/
http://www.imdb.com/title/tt2101441/
http://www.imdb.com/title/tt2302755/
http://www.imdb.com/title/tt2099556/
http://www.imdb.com/title/tt1838475/
http://www.imdb.com/title/tt1230215/
http://www.imdb.com/title/tt0795461/
http://www.imdb.com/title/tt1235522/
http://www.imdb.com/title/tt1321870/
http://www.imdb.com/title/tt2024432/
http://www.imdb.com/title/tt2184339/
http://www.imdb.com/title/tt0245429/
http://www.imdb.com/title/tt0266697/
h

In [5]:
def funcOrNan(f):
    """Wrap ``f`` so that a failing call yields NaN instead of raising.

    Parameters:
        f: any one-argument callable.

    Returns:
        A one-argument function that returns ``f(x)``, or ``np.nan`` when
        ``f(x)`` raises an ordinary exception. KeyboardInterrupt and
        SystemExit still propagate so a long ``.map`` can be interrupted.
    """
    def fon(x):
        try:
            return f(x)
        except Exception:
            # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
            return np.nan
    return fon


In [6]:
# Take the 'date' entry of each scraped dict; dicts without that key (e.g. the {}
# getMovieData returns on a failed fetch) become NaN via funcOrNan.
movieDataFrame['Date'] = movieData.map(funcOrNan(lambda x: x['date']))

In [7]:
# Take the 'time' entry (running-time text) of each scraped dict; dicts without
# that key become NaN via funcOrNan.
movieDataFrame['Length'] = movieData.map(funcOrNan(lambda x: x['time']))

In [8]:
# Take the 'genres' entry (a list of genre names) of each scraped dict; dicts
# without that key become NaN via funcOrNan.
movieDataFrame['Genre'] = movieData.map(funcOrNan(lambda x: x['genres']))

In [9]:
# Take the 'rating' entry of each scraped dict; dicts without that key become
# NaN via funcOrNan. The value is the scraped text, not a parsed number.
movieDataFrame['IMDb Rating'] = movieData.map(funcOrNan(lambda x: x['rating']))

In [10]:
# Preview the enriched table; head() keeps the rendered output small instead of
# dumping every row into the notebook.
movieDataFrame.head(10)

Unnamed: 0,Movie Number,Movie Title,Movie Url,Date,Length,Genre,IMDb Rating
0,2,The Amazing Spider-Man 2,http://www.imdb.com/title/tt1872181/,2 May 2014,2h 22min,"[Action, Adventure, Sci-Fi]",6.7
1,11,Se7en,http://www.imdb.com/title/tt0114369/,22 September 1995,2h 7min,"[Crime, Drama, Mystery]",8.6
2,12,Toy Story,http://www.imdb.com/title/tt0114709/,22 November 1995,1h 21min,"[Animation, Adventure, Comedy]",8.3
3,15,Fight Club,http://www.imdb.com/title/tt0137523/,15 October 1999,2h 19min,[Drama],8.8
4,17,Star Wars: Episode I - The Phantom Menace,http://www.imdb.com/title/tt0120915/,19 May 1999,2h 16min,"[Action, Adventure, Fantasy]",6.5
5,26,Anchorman: The Legend of Ron Burgundy,http://www.imdb.com/title/tt0357413/,9 July 2004,1h 34min,[Comedy],7.2
6,49,Avatar,http://www.imdb.com/title/tt0499549/,18 December 2009,2h 42min,"[Action, Adventure, Fantasy]",7.8
7,54,Iron Man 2,http://www.imdb.com/title/tt1228705/,7 May 2010,2h 4min,"[Action, Adventure, Sci-Fi]",7.0
8,58,TRON: Legacy,http://www.imdb.com/title/tt1104001/,17 December 2010,2h 5min,"[Action, Adventure, Sci-Fi]",6.8
9,94,Life of Pi,http://www.imdb.com/title/tt0454876/,21 November 2012,2h 7min,"[Adventure, Drama, Fantasy]",7.9


In [11]:
def getGenreExclusive(genreList):
    """Collapse a movie's genre list into a single, mutually exclusive bucket.

    The rules are applied in priority order; the first match wins:

    1. 'Documentary' or 'Biography'      -> 'Documentary/Biography'
    2. 'Comedy' and 'Romance'            -> 'RomCom'
       'Comedy' only                     -> 'Comedy'
       'Romance' only                    -> 'Romance'
    3. 'Crime'                           -> 'Crime'
    4. 'Sci-Fi' or 'Fantasy'             -> 'Sci-Fi/Fantasy'
    5. 'Horror'                          -> 'Horror'
    6. 'Action' or 'Adventure'           -> 'Action/Adventure', unless 'Drama'
                                            is also present ('Undefined')
    7. 'Drama'                           -> 'Drama'
    8. anything else                     -> 'Undefined'

    Parameters:
        genreList: container of genre names supporting ``in``, or a missing
            value (``None`` or a float NaN).

    Returns:
        The bucket name as a string; 'Undefined' for missing values.
    """
    # Missing genres show up as a float NaN. Test by type and self-inequality
    # rather than identity with np.NaN: that alias was removed in NumPy 2.0 and
    # an identity check misses NaN objects other than the numpy singleton.
    if genreList is None or (isinstance(genreList, float) and genreList != genreList):
        return 'Undefined'

    if 'Documentary' in genreList or 'Biography' in genreList:
        return 'Documentary/Biography'

    isComedy = 'Comedy' in genreList
    isRomance = 'Romance' in genreList
    if isComedy and isRomance:
        return 'RomCom'
    if isComedy:
        return 'Comedy'
    if isRomance:
        return 'Romance'

    if 'Crime' in genreList:
        return 'Crime'
    if 'Sci-Fi' in genreList or 'Fantasy' in genreList:
        return 'Sci-Fi/Fantasy'
    if 'Horror' in genreList:
        return 'Horror'

    if 'Action' in genreList or 'Adventure' in genreList:
        # Action/adventure dramas are deliberately left unclassified.
        return 'Undefined' if 'Drama' in genreList else 'Action/Adventure'

    # Action and Adventure are both absent past this point, so any remaining
    # Drama is a pure drama.
    if 'Drama' in genreList:
        return 'Drama'

    return 'Undefined'

In [12]:
# Assign every movie to exactly one bucket using the priority rules in getGenreExclusive.
movieDataFrame['Filtered Genre'] = movieDataFrame['Genre'].map(getGenreExclusive)

In [13]:
# Number of movies that landed in each 'Filtered Genre' bucket.
movieDataFrame.groupby('Filtered Genre')['Filtered Genre'].count()

Filtered Genre
Action/Adventure          30
Comedy                   259
Crime                    115
Documentary/Biography     95
Drama                    111
Horror                    57
RomCom                    85
Romance                   67
Sci-Fi/Fantasy            88
Undefined                 52
Name: Filtered Genre, dtype: int64

In [14]:
def genresToList(genreList):
    """Recover genre names from a list written out as text.

    Splits the string on single quotes and keeps the pieces longer than two
    characters, which drops the brackets and the ``', '`` separators.
    Presumably meant for a 'Genre' column read back from CSV, e.g.
    ``"['Action', 'Sci-Fi']"`` — confirm against the caller.

    Parameters:
        genreList: the text form of a genre list.

    Returns:
        A list of the extracted names, or ``np.nan`` when ``genreList`` is not
        a string (for example a missing value).
    """
    try:
        pieces = genreList.split('\'')
    except (AttributeError, TypeError):
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return [word for word in pieces if len(word) > 2]

In [15]:
# Parse each date string with dateutil (failures become NaN via funcOrNan), then take the year.
# NOTE(review): the second map calls .year on every element, so it presumably relies on
# pandas having turned the failed parses into NaT — confirm. The displayed result is a
# float column (e.g. 2014.0).
movieDataFrame['year'] = movieDataFrame['Date'].map(funcOrNan(parse)).map(lambda x: x.year)

In [16]:
def getHours(string):
    """Return the hours part of a running-time string such as ``'2h 22min'``.

    Parameters:
        string: running-time text; anything that is not a ``str`` (for
            example NaN for a missing value) counts as having no hours.

    Returns:
        The integer in front of the first ``h``, or 0 when there is none.
    """
    if not isinstance(string, str):
        return 0
    match = re.search(r'([0-9]+)h', string)
    return int(match.group(1)) if match else 0
    
def getMinutes(string):
    """Return the minutes part of a running-time string such as ``'2h 22min'``.

    Parameters:
        string: running-time text; anything that is not a ``str`` (for
            example NaN for a missing value) counts as having no minutes.

    Returns:
        The integer in front of the first ``min``, or 0 when there is none.
    """
    if not isinstance(string, str):
        return 0
    match = re.search(r'([0-9]+)min', string)
    return int(match.group(1)) if match else 0

def convertLength(lengthString):
    """Convert running-time text such as ``'2h 22min'`` into total minutes.

    The hours come from getHours and the minutes from getMinutes; each of
    those contributes 0 when its part cannot be read, so an unreadable or
    missing length yields 0.
    """
    return getHours(lengthString) * 60 + getMinutes(lengthString)
    

In [17]:
# Running time in whole minutes; convertLength yields 0 when the length text is
# missing or unreadable.
movieDataFrame['Int Length'] = movieDataFrame['Length'].map(convertLength)

In [18]:
# Spot-check the derived columns ('Filtered Genre', 'year', 'Int Length') on the first rows.
movieDataFrame.head()

Unnamed: 0,Movie Number,Movie Title,Movie Url,Date,Length,Genre,IMDb Rating,Filtered Genre,year,Int Length
0,2,The Amazing Spider-Man 2,http://www.imdb.com/title/tt1872181/,2 May 2014,2h 22min,"[Action, Adventure, Sci-Fi]",6.7,Sci-Fi/Fantasy,2014.0,142
1,11,Se7en,http://www.imdb.com/title/tt0114369/,22 September 1995,2h 7min,"[Crime, Drama, Mystery]",8.6,Crime,1995.0,127
2,12,Toy Story,http://www.imdb.com/title/tt0114709/,22 November 1995,1h 21min,"[Animation, Adventure, Comedy]",8.3,Comedy,1995.0,81
3,15,Fight Club,http://www.imdb.com/title/tt0137523/,15 October 1999,2h 19min,[Drama],8.8,Drama,1999.0,139
4,17,Star Wars: Episode I - The Phantom Menace,http://www.imdb.com/title/tt0120915/,19 May 1999,2h 16min,"[Action, Adventure, Fantasy]",6.5,Sci-Fi/Fantasy,1999.0,136


In [23]:
# Save the enriched table next to the notebook; index=False leaves the row index out of the file.
movieDataFrame.to_csv('MoviesData.csv', index=False)