In [3]:
import requests
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np
import re

## Web scraping - wikipedia.org

In [255]:
WIKIPEDIA_URL = 'https://en.wikipedia.org'

# Getting links for every movie on wikipedia from year 1990 to 2023 using BeautifulSoup.
all_links = []
for year in range(1990, 2024):
    response = requests.get(WIKIPEDIA_URL + f'/wiki/List_of_American_films_of_{year}')
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.select(selector='table > tbody > tr > td > i > a')
        all_links.extend(links)
    else:
        print("Error retrieving the page:", response.status_code)

# Creating list of full links.
movie_links = [WIKIPEDIA_URL + movie.get('href') for movie in all_links]

# Deleting duplicates from the list of movie links.
movie_links = list(set(movie_links))

In [257]:
# Getting details of every movie from the list from wikipedia.
all_movies = []
errors = []
for movie_link in movie_links:
    response_wiki = requests.get(movie_link)
    if response_wiki.status_code == 200:
        movie_soup = BeautifulSoup(response_wiki.text, 'html.parser')
        # Creating list of keys for dictionary.
        labels = [label.getText() for label in movie_soup.select('th.infobox-label')]
        labels.insert(0, 'Title')
        # Values for dictionary.
        records = [ record.getText() for record in movie_soup.select('td.infobox-data')]
        records.insert(0, movie_link[30:])
        # Creating dictionaries with data for every movie.
        movie_dictionary = dict(zip(labels, records))
        all_movies.append(movie_dictionary)
    else:
        print(f'Error {response_wiki.status_code} retrieving the page: {movie_link}' )
        errors.append(movie_link)

In [180]:
# Saving errors to .txt file.
with open('../data/01_raw/errors.txt', 'w') as file:
    for error in errors:
        file.write(f'{error}\n')

In [72]:
# Creating DataFrame from all dictionaries with raw informations about movies that need to be cleaned.
raw_movies_df = pd.DataFrame(all_movies)

# Checking how many columns are in df to evaluate if web scraping was done correctly.
print(len(raw_movies_df.columns))

169


In [73]:
# There are way too many columns. Some movies had too catch keys that are useless.
# Let's see in which place usefull columns end by checking number of missing values.
raw_movies_df.isnull().sum().head(30)

Title                      0
Directed by               18
Written by              3380
Produced by              250
Starring                 143
Cinematography           480
Edited by                372
Music by                 435
Productioncompany       6440
Distributed by           288
Release dates           4420
Running time             110
Country                 1927
Language                 871
Budget                  2457
Box office              1364
Screenplay by           5083
Story by                6877
Productioncompanies     2871
Release date            4075
Countries               6631
Released                7470
Genre                   7563
Length                  7675
Label                   7493
Producer                7869
Languages               7665
Based on                5065
Theme music composer    8273
Country of origin       8139
dtype: int64

In [74]:
# Useless columns start after 'based on'.
# Removing useless columns.
raw_movies_df = raw_movies_df.iloc[:, list(range(21)) + [27]]

# Also let's remove all movies that haven't got information on box office.
# This columns is the target of this supervised learning project so records without it are also useless.
# I'm doing this already here to decrease number of records that I will scrape data from rottentomatoes.com for.
# The reason of missing data for most of these movies is simple - they weren't screened in cinema, only on DVD,
# TV or streaming platforms.
raw_movies_df.dropna(subset=['Box office'], inplace=True)

# More in depths data cleansing will be done after splitting dataset into train and test sets.

# Saving wikipedia DataFrame to csv.
raw_movies_df.to_csv('../data/01_raw/wikipedia.csv', index=False)

## Web scraping - rottentomatoes.com 

In [78]:
# Preparing links for rottentomatoes.com from where I will scrape the rest of the data needed for this project.
# Many titles from wikipedia ends with _(film). I need to remove this phrase.
raw_movies_df['Title'].str[-7:].value_counts()

Title
_(film)    1801
0_film)     193
2_film)     190
9_film)     180
1_film)     180
           ... 
_Served       1
ecticut       1
ms_Club       1
:_Miami       1
Sicko         1
Name: count, Length: 2899, dtype: int64

In [79]:
patterns_to_replace = {
        r'_\(film\)$': '',
        r'%27': '',
        r'[:.!?,]': '',
        r'%26': 'and',
        r'-': '_'
    }

# Formatting title in the way that is used on rottentomatoes.com. Removing and replacing characters.
# Removing phrase '_(film)' from the end of the titles.
for pattern, replacement in patterns_to_replace.items():
    raw_movies_df['Title_rotten'] = raw_movies_df['Title_rotten'].str.replace(pattern, replacement, regex=True)

# Also in many cases on wikipedia if there are couple of movies with the same title there is addition of the year in the end.
raw_movies_df['Title_rotten'].str[-12:].value_counts()

Title_rotten
_(2019_film)    70
_(1996_film)    66
_(2010_film)    66
_(2009_film)    63
_(2008_film)    60
                ..
Margin_Call      1
Late_Quartet     1
Never_Sleeps     1
Mystery_Team     1
Devils_Due       1
Name: count, Length: 5163, dtype: int64

In [80]:
# On rottentomatoes instead of for example '_(2019_film)' I need '_2019'.
# Also for some of the movies it will probably be good to check option without the year if the link with year won't be found.
# So I need 2 columns with titles to scrape data from rottentomatoes.

# Creating column without the year.
raw_movies_df['Title_no_year'] = raw_movies_df['Title_rotten'].str.replace(r'_\(\d{4}_film\)$', '', regex=True)

# Creating column with _dddd format of the year.
raw_movies_df['Title_rotten'] = raw_movies_df['Title_rotten'].str.replace(r'(_)\((\d{4})_film\)$', r'\1\2', regex=True)

# Checking if the changes were done correctly.
raw_movies_df['Title_rotten'].str[-5:].value_counts()

Title_rotten
_2019    70
_2010    66
_1996    66
_2009    63
_2008    60
         ..
let_2     1
_Tall     1
id90s     1
llion     1
s_Due     1
Name: count, Length: 3310, dtype: int64

In [81]:
# Checking if there is anything else to remove.
raw_movies_df['Title_no_year'].str[-21:].value_counts()

Title_no_year
_the_End_of_the_World    4
_(2015_American_film)    4
_(2016_American_film)    4
_(2010_American_film)    3
he_Planet_of_the_Apes    3
                        ..
Moonlight_Mile           1
Child_44                 1
Letters_to_Juliet        1
Wonder_Park              1
Loverboy                 1
Name: count, Length: 6815, dtype: int64

In [82]:
# There are couple of films ending with pattern '_(dddd_American_film)' that also needs to be changed.

# Column without the year.
raw_movies_df['Title_no_year'] = raw_movies_df['Title_no_year'].str.replace(r'_\(\d{4}_American_film\)$', '', regex=True)

# Column with _dddd format of the year.
raw_movies_df['Title_rotten'] = raw_movies_df['Title_rotten'].str.replace(r'(_)\((\d{4})_American_film\)$', r'\1\2', regex=True)

In [83]:
# After running the program I discovered that if a movie title starts with 'the' it is often removed from rottentomatoes https address.
# So I need to take this into account while scraping data.
raw_movies_df['Title_no_year'].str[:4].value_counts()

Title_no_year
The_    1324
Amer      30
Love      29
Star      25
Blac      25
        ... 
Anom       1
Slin       1
T%C3       1
Lass       1
Pric       1
Name: count, Length: 2500, dtype: int64

In [84]:
# Creating 2 new columns with links without 'the'.
raw_movies_df['Title_no_the'] = raw_movies_df['Title_rotten'].str.replace(r'^The_', '', regex=True)
raw_movies_df['Title_no_the_no_year'] = raw_movies_df['Title_no_year'].str.replace(r'^The_', '', regex=True)

In [85]:
# After running program another time I discovered that 'A_' is often skipped in titles.
# Creating new columns.
raw_movies_df['Title_no_a'] = raw_movies_df['Title_rotten'].str.replace(r'^A_', '', regex=True)
raw_movies_df['Title_no_a_no_the'] = raw_movies_df['Title_no_the'].str.replace(r'^A_', '', regex=True)
raw_movies_df['Title_no_a_no_year'] = raw_movies_df['Title_no_year'].str.replace(r'^A_', '', regex=True)
raw_movies_df['Title_no_a_no_the_no_year'] = raw_movies_df['Title_no_the_no_year'].str.replace(r'^A_', '', regex=True)

In [90]:
# Getting details of every movie from rottentomatoes.com

ROTTENTOMATOES_URL = 'https://www.rottentomatoes.com/'
all_movies_rotten = []

# Function to scrape data from rottentomatoes.
def scrape_rotten(response, column_name):
    rotten_soup = BeautifulSoup(response.text, 'html.parser')
    try:
        year = rotten_soup.select_one('p.info[data-qa="score-panel-subtitle"]').getText().strip()
        reviews_score =  rotten_soup.select_one('score-board-deprecated').get('tomatometerscore')
        ratings_score =  rotten_soup.select_one('score-board-deprecated').get('audiencescore')
        reviews_count = rotten_soup.select_one('a[data-qa="tomatometer-review-count"]').getText().strip()
        ratings_count = rotten_soup.select_one('a[data-qa="audience-rating-count"]').getText().strip()
        movie_synopsis = rotten_soup.select_one('p[data-qa="movie-info-synopsis"]').getText().strip()
        # Creating list of keys for dictionary.
        labels = [label.getText().strip() for label in rotten_soup.select('b.info-item-label')]
        labels.extend(['Year', 'Reviews_score', 'Ratings_score', 'Reviews_count', 'Ratings_count', 'Synopsis'])
        # Creating list of values for dictionary.
        records = [record.getText().strip() for record in rotten_soup.select('span[data-qa="movie-info-item-value"]')]
        records.extend([year, reviews_score, ratings_score, reviews_count, ratings_count, movie_synopsis])
    # If any of the object in rottentomatoes page won't be found then function returns error.
    # In most cases this will happen when scraping a movie from the wrong year but with the same title.
    except AttributeError:
        labels = ['Error - scraping']
        records = [ROTTENTOMATOES_URL + row[column_name]]
        print(f'Error scraping the page: {ROTTENTOMATOES_URL + row[column_name]}')
    return labels, records

# If link from column 'Title_rotten' won't be found then links from columns without 'the', 'year' and 'a' should be
# tried before recognizing row as Error.
title_columns = ['Title_rotten', 'Title_no_the', 'Title_no_a', 'Title_no_a_no_the', 'Title_no_year', 'Title_no_the_no_year', 
                 'Title_no_a_no_year', 'Title_no_a_no_the_no_year']

# Iterating through all rows in dataframe.
for i, row in raw_movies_df.iterrows():
    accessed_the_page = False
    checked_titles = []
    # Iterating through all columns with titles.
    for title_column in title_columns:
        current_title = row[title_column]
        # Variables current_title and checked_titles are used to prevent attempting to connect to website using identical title.
        # It may often happen that for example value in 'Title_rotten' and 'Title_no_the' is the same.
        # In that case making requests for both of them is unnecessary.
        if current_title not in checked_titles:
            checked_titles.append(current_title)
            response = requests.get(ROTTENTOMATOES_URL + 'm/' + row[title_column])
            if response.status_code == 200:
                labels, records = scrape_rotten(response, title_column)
                accessed_the_page = True
                if labels != ['Error - scraping']:
                    break
    # Adding column 'Error' so the rows that haven't managed to reach connection wouldn't be skipped in dataframe.
    if not(accessed_the_page):
        labels = ['Error']
        records = [ROTTENTOMATOES_URL + row['Title_rotten']]
        print(f'Error {response.status_code} retrieving the page: {ROTTENTOMATOES_URL + row["Title_rotten"]}')
    # Creating dictionaries with data for every movie.
    movie_dictionary = dict(zip(labels, records))
    all_movies_rotten.append(movie_dictionary)

In [89]:
# Saving rottentomatoes DataFrame to csv.
rotten_movies_df = pd.DataFrame(all_movies_rotten)
rotten_movies_df.to_csv('../data/01_raw/rotten_tomatoes.csv', index=False)

## Partial data cleansing for further web scraping

In [73]:
# Importing raw data.
raw_wikipedia = pd.read_csv('../data/01_raw/wikipedia.csv')
rotten_tomatoes = pd.read_csv('../data/01_raw/rotten_tomatoes.csv')

In [74]:
# Checking for how many films for rottentomatoes there was problem with scraping the data.
print(rotten_tomatoes[['Error', 'Error - scraping']].notnull().sum())
print(f'Rows: {len(rotten_tomatoes)}')

Error               531
Error - scraping    411
dtype: int64
Rows: 6940


In [75]:
# Data for 942 out of 6940 wasn't scraped for rottentomatoes.com.
# The main reason is that on rottentomatoes.com for many films there are random numbers added before the title.
# For example: https://www.rottentomatoes.com/m/10011582-tron_legacy.
# Those numbers doesn't follow any visible pattern so scraping those movies by title is impossible.
# However there is different way to approach this problem.
# I can find hyperlinks for these movies on profile pages of their directors on rottentomatoes.com.
# For that I will need to clean columns in wikipedia dataframe that contain information about director and about
# release date of the movies.

# Let's take a look at release dates columns.
pd.set_option('display.max_colwidth', 140)
raw_wikipedia[['Release dates', 'Release date']].head(15)

Unnamed: 0,Release dates,Release date
0,"\nMarch 14, 1990 (1990-03-14) (Baltimore)\nApril 6, 1990 (1990-04-06) (United States)\n",
1,,\n1 December 1993 (1993-12-01) (France)\n[1]
2,"\nMay 20, 1999 (1999-05-20) (Cannes)\nJuly 2, 1999 (1999-07-02) (United States)\n",
3,,"\nDecember 22, 1999 (1999-12-22)\n"
4,\n1 May 1998 (1998-05-01) (United States)\n20 November 1998 (1998-11-20) (United Kingdom)\n24 December 1998 (1998-12-24) (Germany)\n,
5,,"\nAugust 9, 1991 (1991-08-09)\n"
6,"\nMay 11, 1996 (1996-05-11) (Cannes)\nNovember 29, 1996 (1996-11-29) (UK)\nMay 16, 1997 (1997-05-16) (United States)\n",
7,"\nMay 11, 1996 (1996-05-11) (Directors' Fortnight)\nOctober 11, 1996 (1996-10-11) (United States)\n",
8,,"\nMay 27, 1998 (1998-05-27)\n"
9,,"\nJune 12, 1998 (1998-06-12)\n"


In [76]:
# Release dates in wikipedia are split perfectly between 2 columns (when we add non-null records it gives 6940)
# depending whether the movie had one or more premieres.
# I need to remove places of premiere from both columns and chose only one date from 'Release dates' before I will be
# able to format columns to datetime format.

# Checking if all 'Release dates' have premiere in United States.
raw_wikipedia.fillna({'Release dates': ''}, inplace=True)
raw_wikipedia['Release dates'][~raw_wikipedia['Release dates'].str.contains('(United States)|(US)|(U.S.)') & (raw_wikipedia['Release dates'] != '')]

  raw_wikipedia['Release dates'][~raw_wikipedia['Release dates'].str.contains('(United States)|(US)|(U.S.)') & (raw_wikipedia['Release dates'] != '')]


24                  \nDecember 23, 1994 (1994-12-23)\n(limited)\nJanuary 13, 1995 (1995-01-13)\n(North America)
45      \nSeptember 24, 1999 (1999-09-24) (Sundance Film Festival)\nSeptember 21, 2001 (2001-09-21) (Germany)\n
52                                            \nMay 26, 1995 (1995-05-26) (Cannes)\nMay 10, 1996 (1996-05-10)\n
66                                          \nApril 23, 1999 (North America)\nApril 30, 1999 (United Kingdom)\n
112                                       \nSeptember 7, 1997 (1997-09-07) (TIFF)\nJune 19, 1998 (1998-06-19)\n
                                                         ...                                                   
6741                            \n8 August 2011 (2011-08-08) (Locarno)\n11 September 2011 (2011-09-11) (TIFF)\n
6779                                 \n7 September 2007 (2007-09-07) (Venice)\n20 September 2007 (2007-09-20)\n
6830                      \n12 February 2008 (2008-02-12) (BIFF)\n18 April 2008 (2008-04-18) (United Kin

In [77]:
# It appears that 410 rows don't have premiere in United States. So I won't be able to choose date this way.
# I decided to always choose the first date of the premiere because it fits the best to rottentomatoes.
# Unfortunately dates are saved in different formats: some are saved like that 11 May 1990 or that February 27, 1998
# and some even have only month and year. So first I need to transform all of these dates to ('yyyy-mm-dd').

wikipedia = raw_wikipedia.copy()

# Regular expressions to find dates in all formats.
date_patterns = [
    r'(\d{4}-\d{2}-\d{2})',
    r'(\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December) \d{4})',
    r'((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4})',
    r'((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}(?:st|nd|rd|th), \d{4})',
    r'(\d{4}-\d{2}(?!\-\d\d))'
]

# Function to extract dates and format them to datetime format.
def extract_date(df, column_name, patterns=date_patterns):
    selected_dates = None
     # I have to go through each pattern from dictionary separately otherwise function to_datetime() wouldn't know what to do.
    for pattern in patterns:
        # .first() to always choose the first date in the column.
        dates_of_next_type = df[column_name].str.extractall(pattern).groupby(level=0).first()
        dates_of_next_type = pd.to_datetime(dates_of_next_type[0]).to_frame()
        if selected_dates is None:
            selected_dates = dates_of_next_type
        else:
            # .min() to always choose the first date from 2 Series.
            selected_dates = pd.concat([selected_dates, dates_of_next_type], axis=1).min(axis=1)
    return selected_dates

# Extracting single dates for: 'Release dates' and 'Release date columns'.
wikipedia['Main release date'] = extract_date(wikipedia, 'Release dates')
wikipedia['Main release date 2'] = extract_date(wikipedia,'Release date')

# Combining both columns together.
wikipedia['Main release date'] = wikipedia['Main release date'].combine_first(wikipedia['Main release date 2'])

# Creating column that contains only year. It will be needed for scraping data.
wikipedia['Year - wikipedia'] = wikipedia['Main release date'].dt.year

# Removing old columns:
wikipedia.drop(['Main release date 2', 'Release date', 'Release dates'], axis=1, inplace=True)

In [83]:
# Preparing 'Directed by' column to web scraping.

patterns_to_replace = {
    '\n\n': '',
    r'Supervising Director[s]?\n': '',
    r'\(.*\)': '',
    r'\[.*\]': '',
    ', Jr.': ' Jr.',
    '\n': ',',
    # In some cases directors aren't split at all. I recognize these cases by finding small and large letters next to each other.
    # I'm making sure not to split names like 'McAdams' by adding condition that there can't be large-small-large combination.
    r'[^A-Z]([a-z\.ąćęłńóśźżäöüñéáò])([A-Z])': r'\1, \2'
}

# Adding ', ' between different options of many directors in one field and removing unnecessary descriptions.
for pattern, replacement in patterns_to_replace.items():
    wikipedia['Directed by'] = wikipedia['Directed by'].str.replace(pattern, replacement, regex=True)

wikipedia['Directed by'] = wikipedia['Directed by'].str.strip()

# Splitting 'Directed by' column to 2 columns.
# Usually there will be only 1 director but sometimes there are 2 importent directors like for example Russo brothers.
# If there are more than 2 directors I remove them because they won't be importent in analysis.
directed_by_split = wikipedia['Directed by'].str.split(',', n=2, expand=True)
wikipedia[['Director_1', 'Director_2']] = directed_by_split.iloc[:, :2]
wikipedia.drop('Directed by', axis=1, inplace=True)

In [84]:
# Splitting column in rottentomatoes that contains year and different informations.
# Leaving only year because genre and runtime are already contained in different columns.
rotten_tomatoes['Year'] = rotten_tomatoes['Year'].str.split(',', expand=True).iloc[:, 0]
rotten_tomatoes['Year'] = rotten_tomatoes['Year'].fillna(0).astype('int32')

# Concatinating data into one dataframe.
all_movies = pd.concat([wikipedia, rotten_tomatoes], axis=1)

In [85]:
# Checking if the previous scraping was correct by comparing release years of wikipedia and rottentomatoes.
# Difference of 1 year will be allowed because such difference could result from considering different premiere.
# If the difference is bigger than that then I will remove uncorrectly scraped data.
mask = (abs(all_movies['Year - wikipedia'] - all_movies['Year']) > 1) & (all_movies['Year'] != 0)
print(f'Incorrect scraping: {len(all_movies.loc[mask, :])}')
all_movies.loc[mask, 'Rating:':] = np.nan

Incorrect scraping: 210


## Web scraping - rottentomatoes.com  - method 2

In [285]:
# Getting links to rottentomatoes movies from directors of these movies.

all_movies['movie_link_from_director'] = ''
missing_movies_df = all_movies.loc[all_movies['Rating:'].isna()]
errors_directors = []

# Iterating through all rows in dataframe that are missing information from rottentomatoes.
for i, row in missing_movies_df.iterrows():
    director_link = ROTTENTOMATOES_URL + 'celebrity/' + row['Director_1'].replace(' ', '_')
    response_director = requests.get(director_link)
    if response_director.status_code == 200:
        director_soup = BeautifulSoup(response_director.text, 'html.parser')
        # Looking for the movie that was directed in the specific year.
        parent_elements = director_soup.select(f'tr[data-year$="{row["Year - wikipedia"]}"]')
        movie_links = []
        for parent in parent_elements:
            # Checking if the person was actually Director of the found movie.
            # It is necessary because he could have a different role like for example Producer.
            check_role = parent.select_one('td.celebrity-filmography__credits').getText().strip().split(',')
            if 'Director' in check_role:
                movie_links.append(parent.select_one('td.celebrity-filmography__title a').get('href'))
        # If the person directed more than 1 movie in given year then I won't risk assigning a link to the wrong movie.
        if len(movie_links) == 1:
            missing_movies_df.at[i, 'movie_link_from_director'] = movie_links[0]
    else:
        print(f'Error {response_director.status_code} retrieving the page: {director_link}' )
        errors_directors.append(director_link)

In [223]:
# Saving errors to .txt file.
with open('../data/01_raw/errors_directors.txt', 'wb') as file:
    for error in errors_directors:
        file.write(error.encode('utf-8') + b'\n')

In [283]:
# Scraping remaining movies from rottentomatoes.com by directors.

movies_to_scrape = missing_movies_df.loc[missing_movies_df['movie_link_from_director'] != '', :]
new_movies_list = []

# Iterating through all rows in dataframe that are missing information from rottentomatoes using new links.
for i, row in movies_to_scrape.iterrows():
    new_movie_link = ROTTENTOMATOES_URL + row['movie_link_from_director']
    response_new_movie = requests.get(new_movie_link)
    if response_new_movie.status_code == 200:
        # Scraping data for the movie.
        labels, records = scrape_rotten(response_new_movie, 'movie_link_from_director')
    else:
        labels = ['Error']
        records = [new_movie_link]
        print(f'Error {response_new_movie.status_code} retrieving the page: {new_movie_link}')
    # Creating dictionaries with data for every movie.
    new_movie_dictionary = dict(zip(labels, records))
    new_movies_list.append(new_movie_dictionary)

In [284]:
# Creating dataframe from newly scraped data.
new_movies_df = pd.DataFrame(new_movies_list)
new_movies_df.index = movies_to_scrape.index
movies_to_scrape.loc[:, 'Rating:':'Aspect Ratio:'] = new_movies_df.loc[:, :'Aspect Ratio:']
movies_to_scrape.loc[:, 'Error - scraping'] = new_movies_df.loc[:, 'Error - scraping']
# Assigning newly scraped data to the main dataframe on the correct indexes.
all_movies.loc[movies_to_scrape.index, :] = movies_to_scrape

## Web scraping - wikipedia.org #2

In [33]:
# During model development process I discovered that I'm missing key feature that could be very good indicator of the movie results.
# This feature being the knowledge which movies are sequels - such information could be crucial, because I could use previous movie 
# box office to predict its sequel. I hoped that these information will be found in Synopsis on rottentomatoes but in most cases it wasn't
# so this is why I scrape this additional data from wikipedia.

all_movies['Description'] = ''

# Iterating through all movies.
for i, row in all_movies.iterrows():
    movie_link = WIKIPEDIA_URL +  '/wiki/' + row['Title']
    response_movie = requests.get(movie_link)
    if response_movie.status_code == 200:
        # Scraping description of the movie that will be later used to extract the title of previous movie in the franchise/series.
        movie_soup = BeautifulSoup(response_movie.text, 'html.parser')
        all_movies.at[i, 'Description'] = movie_soup.select_one('div.mw-body-content div p:not(.mw-empty-elt)')
    else:
        print(f'Error {response_movie.status_code} retrieving the page: {movie_link}' )

# Formating to string to enable searching with regex.
all_movies['Description'] = all_movies['Description'].astype(str)

In [36]:
# Saving final dataframe to file.
all_movies.to_csv('../data/01_raw/all_movies.csv', index=False)