### Scraping data from the IMDb website

The objective of this experiment is to collect movies and TV shows in all available languages, across widely used genres, using BeautifulSoup.

#### Import necessary libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

#### Links to scrape data from

In [2]:
# IMDb title-search URLs, one per genre. Each URL returns the first page of
# results for that genre; later pages are requested by adding a `start=` offset.
comedy = "https://www.imdb.com/search/title/?genres=comedy"
scifi = "https://www.imdb.com/search/title/?genres=sci-fi"
horror = "https://www.imdb.com/search/title/?genres=horror"
romance = "https://www.imdb.com/search/title/?genres=romance"
action = "https://www.imdb.com/search/title/?genres=action"
thriller = "https://www.imdb.com/search/title/?genres=thriller"
drama = "https://www.imdb.com/search/title/?genres=drama"
mystery = "https://www.imdb.com/search/title/?genres=mystery"
crime = "https://www.imdb.com/search/title/?genres=crime"
fantasy = "https://www.imdb.com/search/title/?genres=fantasy"
adventure = "https://www.imdb.com/search/title/?genres=adventure"

#### Get Page


In [3]:
# Download the first comedy results page
page = requests.get(comedy)

In [4]:
# Parse the comedy page with Python's built-in HTML parser
html_soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Fetch and parse the first sci-fi results page
html_soup_scifi = BeautifulSoup(requests.get(scifi).text, 'html.parser')

In [6]:
# Fetch and parse the first horror results page
html_soup_horror = BeautifulSoup(requests.get(horror).text, 'html.parser')

In [7]:
# Fetch and parse the first romance results page
html_soup_romance = BeautifulSoup(requests.get(romance).text, 'html.parser')

In [8]:
# Fetch and parse the first action results page
html_soup_action = BeautifulSoup(requests.get(action).text, 'html.parser')

In [9]:
# Fetch and parse the first thriller results page
html_soup_thriller= BeautifulSoup(requests.get(thriller).text, 'html.parser')

In [10]:
# Fetch and parse the first drama results page
html_soup_drama = BeautifulSoup(requests.get(drama).text, 'html.parser')

In [11]:
# Fetch and parse the first mystery results page
html_soup_mystery = BeautifulSoup(requests.get(mystery).text, 'html.parser')

In [12]:
# Fetch and parse the first crime results page
html_soup_crime = BeautifulSoup(requests.get(crime).text, 'html.parser')

In [13]:
# Fetch and parse the first fantasy results page
html_soup_fantasy = BeautifulSoup(requests.get(fantasy).text, 'html.parser')

In [14]:
# Fetch and parse the first adventure results page
html_soup_adventure = BeautifulSoup(requests.get(adventure).text, 'html.parser')

#### Data to be scraped

In [15]:
# Every title on the comedy page sits in its own div.lister-item-content "card"
cards = html_soup.find_all('div', class_='lister-item-content') 

### Features to be extracted

In [16]:
# Module-level accumulators filled by movieData(); each scraped card appends
# one entry to every list, so index i across all lists describes one title.
movie_show_name = []  # title text
year = []             # release year or year range, as a string
runtime = []          # minutes as int, or "Not available"
genre = []            # comma-separated genres with whitespace removed
rating = []           # IMDb rating as float, 0.0 when absent
description = []      # plot summary text
directors = []        # comma-separated director names or a placeholder string
actors = []           # comma-separated star names or a placeholder string
votes = []            # vote count as int, 0 when absent

##### Total number of movies 

Comedy

In [17]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_titles = re.search('of (.+?) titles', html_soup.find('div', class_='desc').span.text).group(1)
total_titles = int(total_titles.replace(',', ''))
total_titles

1568756

Sci-Fi

In [18]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_scifi = re.search('of (.+?) titles', html_soup_scifi.find('div', class_='desc').span.text).group(1)
total_count_scifi = int(total_count_scifi.replace(',', ''))
total_count_scifi

177281

Horror

In [19]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_horror = re.search('of (.+?) titles', html_soup_horror.find('div', class_='desc').span.text).group(1)
total_count_horror = int(total_count_horror.replace(',', ''))
total_count_horror

148777

Romance

In [20]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_romance = re.search('of (.+?) titles', html_soup_romance.find('div', class_='desc').span.text).group(1)
total_count_romance = int(total_count_romance.replace(',', ''))
total_count_romance

786255

Action

In [21]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_action = re.search('of (.+?) titles', html_soup_action.find('div', class_='desc').span.text).group(1)
total_count_action = int(total_count_action.replace(',', ''))
total_count_action

303111

Thriller

In [22]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_thriller = re.search('of (.+?) titles', html_soup_thriller.find('div', class_='desc').span.text).group(1)
total_count_thriller = int(total_count_thriller.replace(',', ''))
total_count_thriller

244118

Drama

In [23]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_drama = re.search('of (.+?) titles', html_soup_drama.find('div', class_='desc').span.text).group(1)
total_count_drama = int(total_count_drama.replace(',', ''))
total_count_drama

2086227

Mystery

In [24]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_mystery = re.search('of (.+?) titles', html_soup_mystery.find('div', class_='desc').span.text).group(1)
total_count_mystery = int(total_count_mystery.replace(',', ''))
total_count_mystery

207951

Crime

In [25]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_crime = re.search('of (.+?) titles', html_soup_crime.find('div', class_='desc').span.text).group(1)
total_count_crime = int(total_count_crime.replace(',', ''))
total_count_crime

326555

Fantasy

In [26]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_fantasy = re.search('of (.+?) titles', html_soup_fantasy.find('div', class_='desc').span.text).group(1)
total_count_fantasy = int(total_count_fantasy.replace(',', ''))
total_count_fantasy

234863

Adventure

In [27]:
# Capture N from the "of N titles" text in the page's div.desc, then drop the commas
total_count_adventure = re.search('of (.+?) titles', html_soup_adventure.find('div', class_='desc').span.text).group(1)
total_count_adventure = int(total_count_adventure.replace(',', ''))
total_count_adventure

275020

#### Sample data and processing

##### Movie/Show Name

In [28]:
# The title is the text of the link inside the first card's <h3> heading
name = cards[0].h3.a.text
name

"Schitt's Creek"

##### Movie/Show Year(s)

Reformatting scraped string to remove brackets

In [29]:
# Year text of the first card; at this point it is still wrapped in brackets
years = cards[0].h3.find('span', class_='lister-item-year text-muted unbold').text
years

'(2015–2020)'

In [30]:
# Remove brackets. A raw-string character class replaces the old pattern,
# which ended in a stray empty alternative ('|') and relied on '\(' / '\)'
# escapes inside a normal string literal.
years = re.sub(r'[()]', '', years)
years

'2015–2020'

In [31]:
# Replace the en dash between the two years with a spaced hyphen
years = ' - '.join(years.split('–'))
years

'2015 - 2020'

##### Runtime

Extract the numerical part (i.e., duration in minutes) from the text

In [32]:
# Runtime text of the first card, e.g. '22 min'
run_time = cards[0].p.find('span', class_='runtime').text
run_time

'22 min'

In [33]:
# Keep only the token before the first space (the minutes) and convert it to int
run_time = int(run_time.split(' ')[0])
run_time

22

##### Genres

In [34]:
# Genre text of the second card (index 1); the raw text carries a leading
# newline and trailing padding that is removed in the next cell
genre_list = str(cards[1].p.find('span', class_='genre').text)
genre_list

'\nAction, Comedy, Crime            '

In [35]:
# Remove every whitespace character, leaving a compact comma-separated string.
# The pattern is a raw string so that '\s' is not treated as a string escape.
genre_list = re.sub(r'\s+', '', genre_list)
genre_list

'Action,Comedy,Crime'

##### Ratings

Extracting data value from the nested div

In [36]:
# The rating container is the div whose `name` attribute is "ir"
movie_rating = cards[0].find('div', attrs = {'name':'ir'})
movie_rating

<div class="inline-block ratings-imdb-rating" data-value="8.5" name="ir">
<span class="global-sprite rating-star imdb-rating"></span>
<strong>8.5</strong>
</div>

In [37]:
# The rating is exposed as a string in the div's data-value attribute
movie_rating['data-value']

'8.5'

In [38]:
# Convert the attribute string to a float (rebinds movie_rating from tag to number)
movie_rating = float(movie_rating['data-value'])
movie_rating

8.5

##### Description

Remove unwanted tab spaces and new line characters from the text

In [39]:
# The second 'text-muted' paragraph (index 1) of the card holds the description
description_text = cards[0].find_all('p', class_='text-muted')[1].text
description_text

"\n    When rich video-store magnate Johnny Rose and his family suddenly find themselves broke, they are forced to leave their pampered lives to regroup in Schitt's Creek."

In [40]:
# Collapse each run of whitespace (newlines, indentation) into a single space.
# The pattern is a raw string so that '\s' is not treated as a string escape.
description_text = re.sub(r'\s+', ' ', description_text)
description_text

" When rich video-store magnate Johnny Rose and his family suddenly find themselves broke, they are forced to leave their pampered lives to regroup in Schitt's Creek."

##### Actors

Extract all the actors' names from the parsed HTML

In [41]:
# Links inside the card's first <p> matched by an empty class filter; for this
# card they are the four actor links shown below
all_actors = cards[0].find('p', class_='').find_all('a')
all_actors

[<a href="/name/nm0506405/">Eugene Levy</a>,
 <a href="/name/nm0001573/">Catherine O'Hara</a>,
 <a href="/name/nm2391794/">Dan Levy</a>,
 <a href="/name/nm2251884/">Annie Murphy</a>]

In [42]:
# Print the display text of each actor link
for each_actor in all_actors:
    print(each_actor.text)

Eugene Levy
Catherine O'Hara
Dan Levy
Annie Murphy


##### Votes

Get the total number of votes cast for a movie

In [43]:
# The vote count is the span whose `name` attribute is "nv"
user_votes = cards[0].find('span', attrs = {'name':'nv'}) 
user_votes

<span data-value="54764" name="nv">54,764</span>

In [44]:
# data-value holds the count without the comma, so it converts directly to int
user_votes = int(user_votes['data-value'])
user_votes

54764

#### Function to get data from a single page (Top 50)

In [45]:
def _linked_names(html_fragment):
    """Return the text of every <a> tag in *html_fragment*, joined with ', '."""
    # The parser is named explicitly, as everywhere else in this file, so the
    # result does not depend on which optional parsers are installed.
    anchors = BeautifulSoup(html_fragment, 'html.parser').find_all('a')
    return ', '.join(anchor.text for anchor in anchors)


def movieData(movie_list):
    """Extract the fields of each result card and append them to the global lists.

    Parameters
    ----------
    movie_list : iterable of bs4 tags
        The ``div.lister-item-content`` elements of one results page.

    Side effects
    ------------
    For every card, exactly one value is appended to each of the module-level
    lists ``movie_show_name``, ``year``, ``runtime``, ``genre``, ``rating``,
    ``description``, ``directors``, ``actors`` and ``votes``.

    Placeholders used when a field is absent:
      * runtime   -> "Not available"
      * rating    -> 0.0
      * directors -> "Data Unavailable" (no director label) or
                     "Not Available" (label present but names not extractable)
      * actors    -> "Data Unavailable"
      * votes     -> 0

    All fields of a card are computed before anything is appended, so an
    exception raised while parsing a card cannot leave the lists with
    different lengths.
    """
    for each_movie in movie_list:

        # Movie or show name
        title = each_movie.h3.a.text

        # Movie or show years: drop brackets, space out the en dash, remove letters
        years = each_movie.h3.find('span', class_='lister-item-year text-muted unbold').text
        years = re.sub(r'[()]', '', years)
        years = ' - '.join(years.split('–'))
        years = re.sub(r'[A-Za-z]+', '', years)

        # Runtime: strip commas, keep the number before the first space
        runtime_tag = each_movie.p.find('span', class_='runtime')
        if runtime_tag is not None:
            run_time = int(runtime_tag.text.replace(',', '').split(' ')[0])
        else:
            run_time = "Not available"

        # Genres, with all whitespace removed
        genre_list = re.sub(r'\s+', '', each_movie.p.find('span', class_='genre').text)

        # Rating
        rating_tag = each_movie.find('div', attrs={'name': 'ir'})
        movie_rating = float(rating_tag['data-value']) if rating_tag is not None else 0.0

        # Description: second 'text-muted' paragraph, whitespace runs collapsed
        description_text = each_movie.find_all('p', class_='text-muted')[1].text
        description_text = re.sub(r'\s+', ' ', description_text)

        # Directors and actors share the class-less paragraph; it is searched
        # as flattened HTML text.
        credits_html = re.sub(r'\s+', ' ', str(each_movie.find_all('p', class_='')))

        # Directors
        if re.search(r"Directors?:", credits_html):
            director_match = re.search(r"Director(.+?)</span>", credits_html)
            if director_match is not None:
                movie_directors = _linked_names(director_match.group(1))
            else:
                # Label found but no closing </span> after it
                movie_directors = 'Not Available'
        else:
            movie_directors = "Data Unavailable"

        # Actors. A singular "Star:" label is accepted as well, mirroring the
        # Director/Directors handling above; a failed match yields the
        # placeholder instead of raising.
        stars_match = re.search(r"Stars?:(.+?)</p>", credits_html)
        if stars_match is not None:
            movie_stars = _linked_names(stars_match.group(1))
        else:
            movie_stars = "Data Unavailable"

        # Votes
        votes_tag = each_movie.find('span', attrs={'name': 'nv'})
        user_votes = int(votes_tag['data-value']) if votes_tag is not None else 0

        # Append only after every field parsed, keeping the lists aligned
        movie_show_name.append(title)
        year.append(years)
        runtime.append(run_time)
        genre.append(genre_list)
        rating.append(movie_rating)
        description.append(description_text)
        directors.append(movie_directors)
        actors.append(movie_stars)
        votes.append(user_votes)

### Fetching Data from Webpage

#### Data from Comedy section

In [46]:
# `start` offset for the comedy loop below; kept in its own cell so the loop
# cell can be re-run without resetting the offset
page_num = 1

In [None]:
# Request comedy result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_titles:
    url = f'https://www.imdb.com/search/title/?genres=comedy&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request

#### Data from Sci-Fi section

In [48]:
# Reset the `start` offset before the sci-fi loop
page_num = 1

In [None]:
# Request sci-fi result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
# The bound is total_count_scifi (the name assigned when the count was
# parsed); the previous `total_count_sci_fi` was never defined.
while page_num < total_count_scifi:
    url = 'https://www.imdb.com/search/title/?genres=sci-fi&start=' + str(page_num) + '&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Horror section

In [49]:
# Reset the `start` offset before the horror loop
page_num = 1

In [None]:
# Request horror result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_horror:
    url = f'https://www.imdb.com/search/title/?genres=horror&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request

#### Data from Romance section

In [50]:
# Reset the `start` offset before the romance loop
page_num = 1

In [None]:
# Request romance result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_romance:
    url = f'https://www.imdb.com/search/title/?genres=romance&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Action section

In [51]:
# Reset the `start` offset before the action loop
page_num = 1

In [None]:
# Request action result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_action:
    url = f'https://www.imdb.com/search/title/?genres=action&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Thriller section

In [52]:
# Reset the `start` offset before the thriller loop
page_num = 1

In [None]:
# Request thriller result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_thriller:
    url = f'https://www.imdb.com/search/title/?genres=thriller&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Drama section

In [53]:
# Reset the `start` offset before the drama loop
page_num = 1

In [None]:
# Request drama result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_drama:
    url = f'https://www.imdb.com/search/title/?genres=drama&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Mystery section

In [54]:
# Reset the `start` offset before the mystery loop
page_num = 1

In [None]:
# Request mystery result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_mystery:
    url = f'https://www.imdb.com/search/title/?genres=mystery&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Crime section

In [55]:
# Reset the `start` offset before the crime loop
page_num = 1

In [None]:
# Request crime result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_crime:
    url = f'https://www.imdb.com/search/title/?genres=crime&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Fantasy section

In [56]:
# Reset the `start` offset before the fantasy loop
page_num = 1

In [None]:
# Request fantasy result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_fantasy:
    url = f'https://www.imdb.com/search/title/?genres=fantasy&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

#### Data from Adventure section

In [57]:
# Reset the `start` offset before the adventure loop
page_num = 1

In [None]:
# Request adventure result pages until the `start` offset reaches the reported
# title count; every page's cards are handed to movieData().
while page_num < total_count_adventure:
    url = f'https://www.imdb.com/search/title/?genres=adventure&start={page_num}&ref_=adv_nxt'
    html_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    cards = html_soup.find_all('div', class_='lister-item-content')
    movieData(cards)
    page_num += 50  # the offset advances by 50 per request
    print(page_num)  # progress indicator

### Make a Dataframe

In [58]:
# Assemble the accumulated lists into one table, one row per scraped card.
# All lists must have the same length for this constructor to succeed.
data = pd.DataFrame({
    'name': movie_show_name,
    'year': year,
    # runtime may mix ints with the "Not available" placeholder from movieData
    'duration_in_minutes': runtime,
    'genres': genre,
    'rating': rating,
    'description': description,
    'directors': directors,
    'starred_by': actors,
    'votes': votes
})

### Store data into CSV file

In [59]:
# Write the table without the index column. 'utf-8-sig' prefixes the file with
# a UTF-8 byte-order mark (presumably so spreadsheet tools detect the encoding).
data.to_csv('Movies.csv', index=False, encoding='utf-8-sig')

<center>Prepared by J.Haripriya</center>