In [1]:
#libs
from bs4 import BeautifulSoup
import requests
import time 
import datetime
import pandas as pd
import numpy as np

In [2]:
# Connect to Website and pull in data

url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
headers = {"Accept-Language": "en-US, en;q=0.5"}
results = requests.get(url, headers=headers)


#url - is the variable we create and assign the URL to.

#results - is the variable we create to store our request.get action.

#requests.get(url, headers=headers) - is the method we use to grab the contents of the URL.

#The headers part - tells our scraper to bring us English, based on our previous line of code.

soup = BeautifulSoup(results.text,'html.parser')

#print(soup.prettify())
# Lists for DataSet

names =[]
years =[]
duration=[]
imbd_rating=[]
metascore=[]
votes=[]
gross_inc=[]

movie_div = soup.find_all('div',class_= 'lister-item mode-advanced')

for box in movie_div:
    name=box.h3.a.text
    names.append(name)
    
    year = box.h3.find('span', class_='lister-item-year').text
    years.append(year)
    
    
    length = box.p.find('span',class_='runtime').text if box.p.find('span',class_='runtime') else ''
    duration.append(length)
    
    imdb = float(box.strong.text)
    imbd_rating.append(imdb)
    
    meta=box.find('span',class_='metascore').text if box.find('span',class_='metascore') else ''
    metascore.append(meta)
    
    nv = box.find_all('span', attrs={'name': 'nv'})
    
    vote= nv[0].text
    votes.append(vote)
    
    gross=nv[1].text if len(nv) > 1 else ''
    gross_inc.append(gross)


movies = pd.DataFrame({
'movie': names,
'year': years,
'duration': duration,
'imdb_score': imbd_rating,
'metascore': metascore,
'votes': votes,
'gross_income': gross_inc,
})



    

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie         50 non-null     object 
 1   year          50 non-null     object 
 2   duration      50 non-null     object 
 3   imdb_score    50 non-null     float64
 4   metascore     50 non-null     object 
 5   votes         50 non-null     object 
 6   gross_income  50 non-null     object 
dtypes: float64(1), object(6)
memory usage: 2.9+ KB


In [4]:
movies.head(10)

Unnamed: 0,movie,year,duration,imdb_score,metascore,votes,gross_income
0,Top Gun: Maverick,(2022),130 min,8.5,78,342125,#63
1,The Lord of the Rings: The Fellowship of the Ring,(2001),178 min,8.8,92,1833964,$315.54M
2,Everything Everywhere All at Once,(2022),139 min,8.2,81,193962,#190
3,Thirteen Lives,(2022),147 min,7.8,66,36772,
4,The Batman,(2022),176 min,7.9,72,586786,
5,Spider-Man: No Way Home,(2021),148 min,8.3,71,717708,$804.75M
6,The Godfather,(1972),175 min,9.2,100,1827476,$134.97M
7,Dune,(2021),155 min,8.0,74,606326,$108.33M
8,Get Out,(I) (2017),104 min,7.7,85,589373,$176.04M
9,The Shawshank Redemption,(1994),142 min,9.3,81,2636693,$28.34M


In [None]:
#data pre-proccesing

In [5]:
movies['year']= movies['year'].str.extract('(\d+)').astype(int)

#\d+ stands for all digits in the string

movies['duration']= movies['duration'].str.extract('(\d+)').astype(int)

In [6]:
movies['metascore'] = movies['metascore'].astype(int)
movies['votes'] = movies['votes'].str.replace(',', '').astype(int)
movies['gross_income'] = movies['gross_income'].map(lambda x: x.lstrip('$').rstrip('M'))

movies['gross_income'] = pd.to_numeric(movies['gross_income'], errors='coerce')

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie         50 non-null     object 
 1   year          50 non-null     int32  
 2   duration      50 non-null     int32  
 3   imdb_score    50 non-null     float64
 4   metascore     50 non-null     int32  
 5   votes         50 non-null     int32  
 6   gross_income  45 non-null     float64
dtypes: float64(2), int32(4), object(1)
memory usage: 2.1+ KB


In [8]:
movies.to_csv('movies_imbd.csv')