In [20]:
import bs4              # to scrape our data
import pandas as pd     # to store our scraped data
import requests         # to fetch the contents of our target website
import numpy as np      # to handle Not a Number values
import csv              # to save our pandas DataFrame to a file
import time             # to prevent getting timedout by IMBd

In [28]:
# Listing page to scrape: 250 entries of IMDb's "top_1000" group, sorted by user rating
URL_MOVIES_250 = "https://www.imdb.com/search/title/?groups=top_1000&view=simple&sort=user_rating,desc&count=250&start=0"

# Empty table that will receive one row per scraped review
df = pd.DataFrame(
    columns=[
        "name",         # movie title
        "stars",        # reviewer's rating, when one was given
        "date",         # date the review was posted
        "author",       # reviewer's display name
        "review_text",  # body of the review
        "url",          # link to the review
        "usefulness",   # helpful votes / total votes
    ]
)

In [29]:
def _ascii(text):
    """Return ``text`` with every non-ASCII character dropped."""
    return text.encode('ascii', 'ignore').decode()


def get_movie_info(the_url, frame, timeout=30):
    """Scrape one IMDb review page and add its reviews to ``frame``.

    Parameters
    ----------
    the_url : str
        URL of the review listing page of a single movie.
    frame : pandas.DataFrame
        Rows collected so far. It is not modified.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    pandas.DataFrame
        The rows of ``frame`` followed by one row per review block found on
        the page, with the keys ``name``, ``stars``, ``date``, ``author``,
        ``review_text``, ``url`` and ``usefulness``. ``frame`` itself is
        returned when the page contains no review blocks.

    Raises
    ------
    RuntimeError
        If the server answers with any status code other than 200.
    """
    response = requests.get(the_url, timeout=timeout)  # make a request to the url
    if response.status_code != 200:
        # anything other than the success code 200 halts the scrape; the error
        # is raised explicitly because a bare ``raise`` has no active exception here
        raise RuntimeError(f"GET failed with response code: {response.status_code}")

    # create a BS4 element tree from the response text using Python's built-in html.parser
    m_soup = bs4.BeautifulSoup(response.text, 'html.parser')

    # the movie name is the text of the second child of the <h3 itemprop="name"> element
    name = m_soup.find('h3', {'itemprop': 'name'}).contents[1].text

    # find all the divs containing reviews
    reviews = m_soup.find_all('div', {'class': 'imdb-user-review'})

    # rows are collected in a list and added to the frame once at the end;
    # growing a DataFrame row by row copies it on every iteration
    rows = []
    for rev in reviews:

        # 1. star rating out of 10; not every review has one, so a missing
        #    or unparsable rating span is stored as NaN
        try:
            stars = int(rev.find('span', {'class': 'rating-other-user-rating'}).contents[3].get_text())
        except (AttributeError, IndexError, TypeError, ValueError):
            stars = np.nan

        # text fields are reduced to ASCII to keep them in a tractable format

        # 2. date the review was posted
        date = _ascii(rev.find('span', {'class': 'review-date'}).text)

        # 3. username of the author of the review
        author = _ascii(rev.find('span', {'class': 'display-name-link'}).contents[0].text)

        # 4. text of the review itself
        review_text = _ascii(rev.find('div', {'class': 'content'}).contents[1].text)

        # 5. url of the review itself
        url = f"imdb.com{rev.find('a', {'class': 'title'}).attrs['href']}"

        # 6. usefulness rating: the first sentence of the actions block reads
        #    "<helpful> out of <total> ...", so words 0 and 3 are the two counts
        votes = rev.find('div', {'class': 'actions'}).text.split('.')[0].strip().split(' ')
        usefulness = _ascii(f"{votes[0]}/{votes[3]}")

        rows.append({'name': name, 'stars': stars, 'date': date, 'author': author,
                     'review_text': review_text, 'url': url, 'usefulness': usefulness})

    if not rows:
        return frame

    new_rows = pd.DataFrame(rows)
    if len(frame) == 0:
        # nothing to prepend: keep the caller's column order without
        # concatenating an empty frame
        columns = frame.columns.union(new_rows.columns, sort=False)
        return new_rows.reindex(columns=columns)

    # DataFrame.append no longer exists in pandas 2.x; concat is its replacement
    return pd.concat([frame, new_rows], ignore_index=True)

In [30]:

# fetch the listing page: 250 titles from IMDb's top-1,000 group, sorted by user rating
response = requests.get(URL_MOVIES_250, timeout=30)
# stop here on an HTTP error instead of parsing an error page into an empty link list
response.raise_for_status()

# instantiate a new BS4 element tree from the listing page
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# fetch the URLs for all the movies and store them in a list
movie_links = [m.find('a').attrs['href'] for m in soup.find_all('span', {'class': 'lister-item-header'})]

# start from an empty frame with the same columns so that re-running this
# cell does not append a second copy of every review
df = df.iloc[0:0]

# for each movie, grab the reviews shown on the first page of its review listing
for link in movie_links:
    print(".", end='') # rough loading bar to show progress
    # modify the review url to sort the reviews by the total number of votes for usefulness
    m_url = f"https://www.imdb.com{link}reviews?sort=totalVotes&dir=desc&ratingFilter=0"
    df = get_movie_info(m_url, df) # add this movie's most-voted-on reviews to the table
    # NOTE: requests are currently sent back-to-back; uncomment to throttle them
    # time.sleep(0.05) # pause between requests so that we don't overwhelm IMDb

df # display our resultant DataFrame

..........................................................................................................................................................................................................................................................

Unnamed: 0,name,stars,date,author,review_text,url,usefulness
0,The Shawshank Redemption,10,26 November 2003,carflo,Why do I want to write the 234th comment on Th...,imdb.com/review/rw0349418/,"3,589/4,006"
1,The Shawshank Redemption,10,27 August 2002,weswalker,"Can Hollywood, usually creating things for ent...",imdb.com/review/rw0349147/,"1,706/1,983"
2,The Shawshank Redemption,,8 February 2001,speedreid,I have never seen such an amazing film since I...,imdb.com/review/rw0348718/,"1,624/1,865"
3,The Shawshank Redemption,10,10 February 2006,kaspen12,"In its Oscar year, Shawshank Redemption (writt...",imdb.com/review/rw1288098/,"1,274/1,437"
4,The Shawshank Redemption,8,3 August 2001,Si Cole,I believe that this film is the best story eve...,imdb.com/review/rw0348829/,"959/1,329"
...,...,...,...,...,...,...,...
6232,The Truman Show,10,1 December 2006,Valdemort,I loved this movie. Everything about it. It is...,imdb.com/review/rw1534725/,46/50
6233,The Truman Show,1,16 February 2006,jasonlain-1,"""The Truman Show"" is a film in which Jim Carey...",imdb.com/review/rw1293622/,13/49
6234,The Truman Show,1,31 January 1999,SashaLyn,This movie was one of the worst movies that I ...,imdb.com/review/rw0438742/,11/48
6235,The Truman Show,1,24 October 2009,senditere,Yet again this film illustrates that the Holly...,imdb.com/review/rw2146377/,12/45


In [31]:
# serialise the scraped reviews (index included) to CSV text
df_csv = df.to_csv()
# newline='' stops Python from translating line endings a second time: to_csv()
# already terminates rows with the platform line separator, so text-mode
# translation would produce blank lines between rows on Windows.
# utf-8 is set explicitly because movie names are not ASCII-stripped and the
# locale's default encoding may be unable to represent them.
with open('Top_250_IMDb.csv', 'w', encoding='utf-8', newline='') as f:
    f.write(df_csv)