In [1]:
import json
import pandas as pd
import re
import requests
import logging
from pathlib import Path
from tqdm import tqdm
from time import sleep
from random import randint
from bs4 import BeautifulSoup

In [2]:
API_URL = 'https://imdb-api.tprojects.workers.dev'
HEADERS = {'User-Agent': 'Mozilla/5.0'}
IMDB_TITLE_PATTERN = r"/title/(tt\d+)/"
SAVE_DIRECTORY = Path(r"C:\Users\petro\Desktop\IMDB_parsing")
SAVE_FILMS_JSON = SAVE_DIRECTORY.joinpath('films_data_2.json')
SAVE_REVIEWS_JSON = SAVE_DIRECTORY.joinpath('reviews_data_2.json')
SAVE_FILMS_CSV = SAVE_DIRECTORY.joinpath('films_data_2.csv')
SAVE_REVIEWS_CSV = SAVE_DIRECTORY.joinpath('reviews_data_2.csv')
PAGES_RANGE = range(6, 16)  # range(1, 6)

In [8]:
def wait():
    sleep(randint(1, 3))

def get_page_results(page_num):
    count_per_page = 250  # [50, 100, 250]
    starting_position = 1 + (page_num - 1) * count_per_page
    url = f'https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,&user_rating=6.0,&adult=include&start={starting_position}&ref_=adv_nxt&sort=num_votes,desc&count={count_per_page}'

    result_page = requests.get(url, headers=HEADERS)
    # wait()

    if result_page.status_code != 200:
        logging.warn(f'Results at {starting_position} (url: {url}): code {result_page.status_code}')
        return []

    soup = BeautifulSoup(result_page.text)
    page_results = soup.find_all(class_='lister-item mode-advanced')
    return page_results

def get_reviews_results(title_id, reviews_results=None, url=None, page_num=0):
    if page_num > 3:
        return reviews_results

    if page_num == 0:
        reviews_results = []
        url = f'{API_URL}/reviews/{title_id}?option=helpfulness&sortOrder=desc'

    result = requests.get(url)
    if result.status_code != 200:
        logging.warn(f'Results of the reviews page {page_num}, film {title_id}: code {result.status_code}')
        return reviews_results
    wait()
    data = result.json()
    reviews_results += data['reviews']
    next_url = f'{API_URL}{data["next_api_path"]}'
    reviews_results = get_reviews_results(title_id, reviews_results, next_url, page_num + 1)
    return reviews_results
###

films_data = []
reviews_data = []
films_list = []
i = 0

for page_num in tqdm(PAGES_RANGE, 'pages'):
    page_results = get_page_results(page_num)
    for page_result in tqdm(page_results, f'films on page {page_num}'):
        i += 1
        try:
            # title_id = page_result.find('a').get('href')[7:-1]
            is_match = re.search(IMDB_TITLE_PATTERN, page_result.find('a').get('href'))
            title_id = is_match.group(1)

            # film_url = f'https://www.imdb.com/title/{title_id}/'
            # result_film = requests.get(film_url, headers=HEADERS)
            result_film = requests.get(API_URL + f'/title/{title_id}')
            if result_film.status_code != 200:
                logging.warn(f'Results of the film {title_id}: code {result_film.status_code}')
                continue
            data = result_film.json()
            if 'releaseDeatiled' in data:
                del data['releaseDeatiled']
            films_data.append(data)
            # print(result_film.status_code)
            # print(result_film.json())
            wait()
            result_reviews = get_reviews_results(title_id)
            reviews_data += result_reviews
        except Exception as e:
            logging.warn(f'film {title_id}')
            logging.exception(e)
#         break
#     break
        if i % 100 == 0:
            with open(SAVE_FILMS_JSON, 'w') as f:
                json.dump(films_data, f)
            with open(SAVE_REVIEWS_JSON, 'w') as f:
                json.dump(reviews_data, f)
    
with open(SAVE_FILMS_JSON, 'w') as f:
    json.dump(films_data, f)
with open(SAVE_REVIEWS_JSON, 'w') as f:
    json.dump(reviews_data, f)

df_films = pd.json_normalize(films_data)
df_reviews = pd.json_normalize(reviews_data)
df_films.to_csv(SAVE_FILMS_CSV)
df_reviews.to_csv(SAVE_REVIEWS_CSV)

pages:   0%|          | 0/10 [00:00<?, ?it/s]
films on page 6:   0%|          | 0/250 [00:00<?, ?it/s][A
films on page 6:   0%|          | 1/250 [00:16<1:07:18, 16.22s/it][A
films on page 6:   1%|          | 2/250 [00:28<58:26, 14.14s/it]  [A
films on page 6:   1%|          | 3/250 [00:46<1:05:15, 15.85s/it][A
films on page 6:   2%|▏         | 4/250 [01:01<1:02:46, 15.31s/it][A
films on page 6:   2%|▏         | 5/250 [01:13<58:23, 14.30s/it]  [A
films on page 6:   2%|▏         | 6/250 [01:26<55:45, 13.71s/it][A
films on page 6:   3%|▎         | 7/250 [01:43<59:58, 14.81s/it][A
films on page 6:   3%|▎         | 8/250 [01:58<1:00:04, 14.89s/it][A
films on page 6:   4%|▎         | 9/250 [02:18<1:06:49, 16.64s/it][A
films on page 6:   4%|▍         | 10/250 [02:38<1:09:50, 17.46s/it][A
films on page 6:   4%|▍         | 11/250 [02:54<1:08:29, 17.20s/it][A
films on page 6:   5%|▍         | 12/250 [03:13<1:10:32, 17.79s/it][A
films on page 6:   5%|▌         | 13/250 [03:30<1:09:01

In [9]:
films_files = [
    'films_data.csv',
    'films_data_2.csv'
]
films_files = [SAVE_DIRECTORY.joinpath(file) for file in films_files]

df_films = pd.concat(map(pd.read_csv, films_files)).drop(columns='Unnamed: 0')
df_films.head(3)

Unnamed: 0,id,review_api_path,imdb,contentType,productionStatus,title,image,images,plot,contentRating,...,rating.count,rating.star,award.wins,award.nominations,releaseDetailed.day,releaseDetailed.month,releaseDetailed.year,releaseDetailed.releaseLocation.country,releaseDetailed.releaseLocation.cca2,releaseDetailed.originLocations
0,tt0468569,/reviews/tt0468569,https://www.imdb.com/title/tt0468569,Movie,Released,The Dark Knight,https://m.media-amazon.com/images/M/MV5BMTMxNT...,['https://m.media-amazon.com/images/M/MV5BOTAx...,When the menace known as the Joker wreaks havo...,PG-13,...,2713440,9.0,162,163,18.0,7,2008,United States,US,"[{'country': 'United States', 'cca2': 'US'}, {..."
1,tt1375666,/reviews/tt1375666,https://www.imdb.com/title/tt1375666,Movie,Released,Inception,https://m.media-amazon.com/images/M/MV5BMjAxMz...,['https://m.media-amazon.com/images/M/MV5BMjIy...,A thief who steals corporate secrets through t...,PG-13,...,2408658,8.8,159,220,16.0,7,2010,United States,US,"[{'country': 'United States', 'cca2': 'US'}, {..."
2,tt0120737,/reviews/tt0120737,https://www.imdb.com/title/tt0120737,Movie,Released,The Lord of the Rings: The Fellowship of the Ring,https://m.media-amazon.com/images/M/MV5BN2EyZj...,['https://m.media-amazon.com/images/M/MV5BMjQ4...,A meek Hobbit from the Shire and eight compani...,PG-13,...,1912955,8.8,123,127,19.0,12,2001,United States,US,"[{'country': 'New Zealand', 'cca2': 'NZ'}, {'c..."


In [11]:
df_films.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3744 entries, 0 to 2495
Data columns (total 29 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   id                                       3744 non-null   object 
 1   review_api_path                          3744 non-null   object 
 2   imdb                                     3744 non-null   object 
 3   contentType                              3744 non-null   object 
 4   productionStatus                         3744 non-null   object 
 5   title                                    3744 non-null   object 
 6   image                                    3744 non-null   object 
 7   images                                   3744 non-null   object 
 8   plot                                     3744 non-null   object 
 9   contentRating                            3643 non-null   object 
 10  genre                                    3744 no

In [13]:
df_films["releaseDetailed.releaseLocation.cca2"].unique()

array(['US', 'PH', 'FR', 'MX', 'KR', 'IN', 'DK', 'HU', 'GB', 'IR', 'CN',
       'AR', 'BR', 'JP', 'ES', 'KZ', 'DE', 'BE', 'GR', 'PL', 'CA', 'HK',
       'TR', 'NO', 'HR', 'PT', 'IE', 'LV', 'LB', 'NL', 'IL', 'RU', 'AU',
       'IS', 'SE', 'IT', 'RO', 'BG', 'AE', 'ZA', 'MY', 'ID', 'TH', 'FI',
       'NZ', 'SG', 'TW', 'LT', 'AT', 'PK', 'EG', 'BD', 'CZ', 'CL', 'EE',
       'PR', 'BH', 'CO', 'UA', 'BY'], dtype=object)

In [14]:
df_films["releaseDetailed.releaseLocation.country"].unique()

array(['United States', 'Philippines', 'France', 'Mexico', 'South Korea',
       'India', 'Denmark', 'Hungary', 'United Kingdom', 'Iran', 'China',
       'Argentina', 'Brazil', 'Japan', 'Spain', 'Kazakhstan', 'Germany',
       'Belgium', 'Greece', 'Poland', 'Canada', 'Hong Kong', 'Turkey',
       'Norway', 'Croatia', 'Portugal', 'Ireland', 'Latvia', 'Lebanon',
       'Netherlands', 'Israel', 'Russia', 'Australia', 'Iceland',
       'Sweden', 'Italy', 'Romania', 'Bulgaria', 'United Arab Emirates',
       'South Africa', 'Malaysia', 'Indonesia', 'Thailand', 'Finland',
       'New Zealand', 'Singapore', 'Taiwan', 'Lithuania', 'Austria',
       'Pakistan', 'Egypt', 'Bangladesh', 'Czech Republic', 'Chile',
       'Estonia', 'Puerto Rico', 'Bahrain', 'Colombia', 'Ukraine',
       'Belarus'], dtype=object)

In [15]:
reviews_files = [
    'reviews_data.csv',
    'reviews_data_2.csv'
]
reviews_files = [SAVE_DIRECTORY.joinpath(file) for file in reviews_files]

df_reviews = pd.concat(map(pd.read_csv, reviews_files)).drop(columns='Unnamed: 0')
df_reviews.head(3)

Unnamed: 0,id,author,authorUrl,user_api_path,date,stars,heading,content,reviewLink,helpfulNess.votes,helpfulNess.votedAsHelpful,helpfulNess.votedAsHelpfulPercentage
0,rw5478826,MrHeraclius,https://www.imdb.com/user/ur87850731/?ref_=tt_urv,/user/ur87850731,2020-02-12T00:00:00.000Z,0,The Dark Knight,"Confidently directed, dark, brooding, and pack...",https://www.imdb.com/review/rw5478826,585,537,92
1,rw1914442,Smells_Like_Cheese,https://www.imdb.com/user/ur1293485/?ref_=tt_urv,/user/ur1293485,2008-07-20T00:00:00.000Z,10,The Batman of our dreams! So much more than a ...,I got to see The Dark Knight on Wednesday nigh...,https://www.imdb.com/review/rw1914442,1,933,93300
2,rw6606026,dseferaj,https://www.imdb.com/user/ur129557514/?ref_=tt...,/user/ur129557514,2021-02-17T00:00:00.000Z,10,This town deserves a better class of criminal!,This movie is a work of art. The finest sequel...,https://www.imdb.com/review/rw6606026,187,171,91


In [17]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 352164 entries, 0 to 227720
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   id                                    352164 non-null  object
 1   author                                352164 non-null  object
 2   authorUrl                             352164 non-null  object
 3   user_api_path                         352164 non-null  object
 4   date                                  352164 non-null  object
 5   stars                                 352164 non-null  int64 
 6   heading                               352161 non-null  object
 7   content                               352164 non-null  object
 8   reviewLink                            352164 non-null  object
 9   helpfulNess.votes                     352164 non-null  int64 
 10  helpfulNess.votedAsHelpful            352164 non-null  int64 
 11  helpfulNess.v

In [19]:
df_films.to_csv(SAVE_DIRECTORY.joinpath('imdb_films.csv'))
df_reviews.to_csv(SAVE_DIRECTORY.joinpath('imdb_reviews.csv'))