# Webscraping IMDB website 


In [1]:
import bs4
import pandas as pd
import requests
import numpy as np

In [2]:
url='https://www.imdb.com/search/title/?&genres=sci_fi'

Fetching the content from the website

In [3]:
result = requests.get(url)

Creating the soup from beautiful soup

In [4]:
soup = bs4.BeautifulSoup(result.text,'lxml')

In [12]:
sci_fi_container = soup.select('.lister-item-content')


To start with we scrape for 1 movie and scale it up with for loop


In [6]:
test_title = sci_fi_container[0].find('h3',class_='lister-item-header')
test_title.a.text

'Moon Knight'

In [7]:
test_year = sci_fi_container[0].find('span',class_='lister-item-year text-muted unbold')
test_year.text.replace(')','').replace('(','')# removing () 

'2022'

In [8]:
test_rating=sci_fi_container[0].find('div',class_='inline-block ratings-imdb-rating')
test_rating['data-value']

'7.6'

In [9]:
test_description = sci_fi_container[0].find_all('p',class_='text-muted')
test_description[1].text.replace("\n",'')

"Steven Grant discovers he's been granted the powers of an Egyptian moon god. But he soon finds out that these newfound powers can be both a blessing and a curse to his troubled life."

In [10]:
test_directior=sci_fi_container[46].find('p',class_='')
test_directior.text.replace('\n','').lstrip()

'Director:Scott Derrickson|     Stars:Benedict Cumberbatch, Chiwetel Ejiofor, Rachel McAdams, Benedict Wong'

Extracting directors and star cast from the string

In [13]:
test_directior=sci_fi_container[0].find('p',class_='')
string = test_directior.text.replace('\n','').lstrip() # remove \n and empty spaces
Stars_index = string.index('Stars')# index of Stars
Stars = string[Stars_index+6::]# slicing from stars
print(Stars) 
# print(string)
line_index=string.index('|')
Director=string[9:line_index]
print(Director)


Oscar Isaac, Ethan Hawke, May Calamawy, F. Murray Abraham


NameError: name 'line_index' is not defined

In [None]:
test_rating=sci_fi_container[0].find('span',class_='certificate')
test_rating.text

'UA'

In [None]:
test_runtime = sci_fi_container[0].find('span',class_='runtime')
test_runtime.text[:-4]

'156'

In [None]:
test_genre = sci_fi_container[0].find('span',class_='genre')
test_genre.text.replace('\n','').rstrip()

'Action, Adventure, Fantasy'

Now we have checked for a particular movie, let's make it for 50 movies and replace missing values with 'nan'(not a  number) 

In [None]:
#initialize empty lists to store the variables scraped
titles = []
imdb_rating = []
movie_ratings = []
cast = []
runtime = []
movie_genre = []
movie_director = []
description = []
movie_year = []

for movie in sci_fi_container:
    #Title
    title = movie.find('h3',class_='lister-item-header')
    titles.append(title.a.text)
    
    # Year
    year = movie.find('span',class_='lister-item-year text-muted unbold')
    movie_year.append(year.text.replace(')','').replace('(','').replace('– ',''))#removing '()' and '-'
    
    #movie rating
    # try and except block for handling null values
    try:
        rating = movie.find('span',class_='certificate')
        movie_ratings.append(rating.text)
    except:
        movie_ratings.append(np.nan)

    #genre
    genre = movie.find('span',class_='genre')
    movie_genre.append(genre.text.replace('\n','').rstrip())
    
    #runtime
    try:
        time = movie.find('span',class_='runtime')
        runtime.append(int(time.text[:-4]))
    except:
        runtime.append(np.nan)
        
    # movie director
    temp = movie.find('p',class_='')
    string = temp.text.replace('\n','').lstrip() # remove \n and empty spaces
    try:
        line_index=string.index('|')
        Director=string[9:line_index]
        movie_director.append(Director)
    except:
        movie_director.append(np.nan)
    
    #cast
    stars_index = string.index('Stars')# index of Stars
    stars = string[stars_index+6::]# slicing from stars
    cast.append(stars)
    
    #plot
    plot = movie.find_all('p',class_='text-muted')
    description.append(plot[1].text.replace("\n",''))
    
    #star rating
    try:
        imdb_score = movie.find('div',class_='inline-block ratings-imdb-rating')
        imdb_rating.append(imdb_score['data-value'])
    except:
        imdb_rating.append(np.nan)

Creating dataframe and adding movies into it


In [None]:
movies_df = pd.DataFrame({'movie':titles,'plot':description,'year':movie_year,'ratings':movie_ratings,
                          'stars':imdb_rating,'runtime':runtime,'genre':movie_genre,'director':movie_director,'cast':cast})
print(movies_df.info())
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   movie     50 non-null     object 
 1   plot      50 non-null     object 
 2   year      50 non-null     object 
 3   ratings   43 non-null     object 
 4   stars     42 non-null     object 
 5   runtime   41 non-null     float64
 6   genre     50 non-null     object 
 7   director  21 non-null     object 
 8   cast      50 non-null     object 
dtypes: float64(1), object(8)
memory usage: 3.6+ KB
None


Unnamed: 0,movie,plot,year,ratings,stars,runtime,genre,director,cast
0,Eternals,"The saga of the Eternals, a race of immortal b...",2021,UA,6.9,156.0,"Action, Adventure, Fantasy",Chloé Zhao,"Gemma Chan, Richard Madden, Angelina Jolie, Sa..."
1,Arcane: League of Legends,Set in utopian Piltover and the oppressed unde...,2021,16,9.4,,"Animation, Action, Adventure",,"Kevin Alejandro, Jason Spisak, Hailee Steinfel..."
2,Dune: Part One,Feature adaptation of Frank Herbert's science ...,2021,UA,8.2,155.0,"Action, Adventure, Drama",Denis Villeneuve,"Timothée Chalamet, Rebecca Ferguson, Zendaya, ..."
3,Shang-Chi and the Legend of the Ten Rings,"Shang-Chi, the master of weaponry-based Kung F...",2021,UA,7.7,132.0,"Action, Adventure, Fantasy",Destin Daniel Cretton,"Simu Liu, Awkwafina, Tony Chiu-Wai Leung, Ben ..."
4,Finch,"On a post-apocalyptic earth, a robot, built to...",2021,UA 13+,7.0,115.0,"Adventure, Drama, Sci-Fi",Miguel Sapochnik,"Tom Hanks, Caleb Landry Jones, Marie Wagenman,..."


Saving it as a csv file

In [None]:
movies_df.to_csv('imdb_movies.csv')