# Web-Scraping part of IMDB project

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the IMDb "top 50 actors" list page and collect, per list entry,
# the ranking (position in the list), the actor name and the relative link
# to the actor's own page.
imdbTop50 = "https://www.imdb.com/list/ls053501318/"
# A timeout keeps the notebook from hanging forever on a stalled connection.
page = requests.get(imdbTop50, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
# Parse the whole HTML of the page
soup = BeautifulSoup(page.text, 'html.parser')

# Container element that holds all list entries
content = soup.find("div", class_="lister-list")
if content is None:
    raise RuntimeError(f"No 'lister-list' container found on {imdbTop50}")

counter = 0
ranks = []
actorNames = []
linksToActorSite = []

# Rank, link and name are appended together for each entry so the three
# lists can never get out of step with each other.
for counter, row in enumerate(content.find_all(class_="lister-item mode-detail"), start=1):
    col = row.find(class_="lister-item-content")
    header = col.find("h3") if col is not None else None
    anchor = header.find("a", href=True) if header is not None else None
    if anchor is None:
        # Entry without a linked name: nothing usable to record.
        continue
    ranks.append(counter)
    # Relative URL of the actor page (joined with the site base URL later)
    linksToActorSite.append(anchor["href"])
    # Visible link text is the actor name; drop embedded line breaks
    actorNames.append(anchor.get_text().replace("\n", ""))

# Create a DataFrame from the extracted data.
# "Ranking" deliberately stays a regular column: the former
# `actorDf.set_index("Ranking")` call discarded its result and had no effect.
actorDf = pd.DataFrame({"Ranking": ranks, "Actor Names": actorNames})
actorDf

Unnamed: 0,Ranking,Actor Names
0,1,Johnny Depp
1,2,Al Pacino
2,3,Robert De Niro
3,4,Kevin Spacey
4,5,Denzel Washington
5,6,Russell Crowe
6,7,Brad Pitt
7,8,Angelina Jolie
8,9,Leonardo DiCaprio
9,10,Tom Cruise


### Define a function that sends a request to a website and parses the HTML content using BeautifulSoup

In [2]:
def get_html_page(url, timeout=30):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``html.parser`` backend.

    Raises:
        Exception: If the server answers with any status code other than 200.
    """
    # Send a browser-like User-Agent so the site grants access to the page.
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    # Anything but a plain 200 is treated as a failed download.
    if response.status_code != 200:
        raise Exception(f'{response.status_code} Failed to load page {url}')
    # Parse HTML content using BeautifulSoup
    return BeautifulSoup(response.text, 'html.parser')

### Get more data about the actors (about text and awards) from the 2nd website: IMDb actor pages

In [3]:
import re

# Visit every actor page and collect a short biography excerpt and the
# number of awards won, then attach both as new columns of actorDf.
baseUrl = "https://www.imdb.com"
about = []
winnigAwards = []

# Iterate through the actor URLs
for link in linksToActorSite:
    actorUrl = baseUrl + link
    # Send a GET request to the actor page
    soup = get_html_page(actorUrl)

    # Extract the about section; the first 800 characters are enough,
    # the whole biography is not needed.
    bioElement = soup.find("div", class_="ipc-html-content-inner-div")
    if bioElement is None:
        raise ValueError(f"No biography section found on {actorUrl}")
    bioText = bioElement.text
    # Only mark the text with an ellipsis when something was actually cut off.
    about_temp = bioText[:800] + "..." if len(bioText) > 800 else bioText
    about.append(about_temp)

    # Extract the number of awards won
    awardsElement = soup.find("label", class_="ipc-metadata-list-item__list-content-item")
    if awardsElement is None:
        raise ValueError(f"No awards summary found on {actorUrl}")
    awards_temp = awardsElement.text
    # Take the leading run of digits instead of a fixed 3-character slice,
    # so counts with any number of digits are read completely.
    winsMatch = re.match(r"\s*(\d+)", awards_temp)
    if winsMatch is None:
        raise ValueError(f"Cannot read number of wins from {awards_temp!r} on {actorUrl}")
    winnigAwards.append(int(winsMatch.group(1)))

# Add the extracted data to the actors dataframe
actorDf["About"] = about
actorDf["Winnig Awards"] = winnigAwards
actorDf

Unnamed: 0,Ranking,Actor Names,About,Winnig Awards
0,1,Johnny Depp,"John Christopher ""Johnny"" Depp II was born on ...",81
1,2,Al Pacino,"Alfredo James ""Al"" 'Pacino established himself...",53
2,3,Robert De Niro,"One of the greatest actors of all time, Robert...",68
3,4,Kevin Spacey,"Kevin Spacey Fowler, better known by his stage...",59
4,5,Denzel Washington,"Denzel Hayes Washington, Jr. was born on Decem...",87
5,6,Russell Crowe,"Russell Ira Crowe was born in Wellington, New ...",37
6,7,Brad Pitt,"William Bradley ""Brad"" Pitt was born on Decemb...",122
7,8,Angelina Jolie,Angelina Jolie is an Academy Award-winning act...,58
8,9,Leonardo DiCaprio,Few actors in the world have had a career quit...,102
9,10,Tom Cruise,"In 1976, if you had told fourteen-year-old Fra...",57


### Get the corresponding movie data of the actors from the 3rd website: IMDb title search page

In [7]:
# Collect, for every actor, the feature films returned by the IMDb title
# search and store them in movieDf.
# title_type=feature restricts the search to movies, count sets how many
# results are shown per page, role searches by actor id.
searchSite = "/search/title/?title_type=feature&count=250&role="
# "Actor Ranking" is the foreign key into the actor dataframe and records
# which actor each movie row belongs to.
movieColumns = ['Actor Ranking', 'Title', 'Year', 'Genre', 'Rating']

# Extract all actor ids from the actor links (id format: nm000000).
# The fixed slice skips the first 6 characters of the link.
actorIds = [link[6:16] for link in linksToActorSite]

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end; concatenating a one-row frame per movie copies the whole table every
# time and gets slower with each row.
movieRows = []

# Ranking is the 1-based position of the actor in the scraped list.
for counter, actorId in enumerate(actorIds, start=1):
    searchTitlesUrl = baseUrl + searchSite + actorId
    soup = get_html_page(searchTitlesUrl)
    # Extract the section that contains all movies (IMDb sorts by popularity by default)
    filmography = soup.find(class_="lister-list")
    if filmography is None:
        # No result list on the page: nothing to record for this actor.
        continue
    for movie in filmography.find_all(class_="lister-item-content"):
        header = movie.find("h3", class_="lister-item-header")
        # Extract the details of each movie
        title = header.find("a").text  # titles come back in German
        year = header.find(class_="lister-item-year text-muted unbold").text.replace("(I)", "")
        # Some movies have no rating or genre; those get the placeholder
        # string "Null", which the cleaning step filters on.
        ratingElement = movie.find(class_="inline-block ratings-imdb-rating")
        rating = ratingElement.text.replace("\n", "") if ratingElement else "Null"
        genreElement = movie.find(class_="genre")
        genre = genreElement.text.replace("\n", "").strip() if genreElement else "Null"

        movieRows.append(
            {'Actor Ranking': counter, 'Title': title, 'Year': year, 'Genre': genre, 'Rating': rating}
        )

# Build the dataframe in one go; the explicit column list keeps the layout
# stable even when no rows were collected.
movieDf = pd.DataFrame(movieRows, columns=movieColumns)
movieDf
    

Unnamed: 0,Actor Ranking,Title,Year,Genre,Rating
0,1,Mord im Orient-Express,(2017),"Crime, Drama, Mystery",6.5
1,1,Fluch der Karibik,(2003),"Action, Adventure, Fantasy",8.1
2,1,21 Jump Street,(2012),"Action, Comedy, Crime",7.2
3,1,Tusk,(2014),"Comedy, Horror",5.3
4,1,Charlie und die Schokoladenfabrik,(2005),"Adventure, Comedy, Family",6.7
...,...,...,...,...,...
3555,50,Caffeine,(2006),Comedy,5.4
3556,50,Descendant,(2003),"Horror, Thriller",4.1
3557,50,Side Effects,(2005),"Comedy, Drama, Romance",4.9
3558,50,Bug Buster,(1998),"Comedy, Horror, Sci-Fi",3.9


### Data Cleaning:

In [8]:
# The data was mostly collected in the right format, so cleaning only
# removes incomplete rows:
#  - rows with an empty "Year": announced titles that are not released yet
#    and carry no data (the original run reported 387 of 3560 rows);
#  - rows whose "Rating" is the placeholder "Null": not yet released or not
#    relevant enough (the original run reported 98 of the remaining 3173).
hasYear = movieDf["Year"] != ""
hasRating = movieDf["Rating"] != "Null"

# What remains is the set of movies that were released and have a rating.
movieDf = movieDf[hasYear & hasRating]
movieDf

Unnamed: 0,Actor Ranking,Title,Year,Genre,Rating
0,1,Mord im Orient-Express,(2017),"Crime, Drama, Mystery",6.5
1,1,Fluch der Karibik,(2003),"Action, Adventure, Fantasy",8.1
2,1,21 Jump Street,(2012),"Action, Comedy, Crime",7.2
3,1,Tusk,(2014),"Comedy, Horror",5.3
4,1,Charlie und die Schokoladenfabrik,(2005),"Adventure, Comedy, Family",6.7
...,...,...,...,...,...
3555,50,Caffeine,(2006),Comedy,5.4
3556,50,Descendant,(2003),"Horror, Thriller",4.1
3557,50,Side Effects,(2005),"Comedy, Drama, Romance",4.9
3558,50,Bug Buster,(1998),"Comedy, Horror, Sci-Fi",3.9


### Export dataframes to csv files:

In [9]:
# Write both dataframes to CSV files in the working directory.
# to_csv returns None when it is given a target path, so the two names
# below only record that the export ran; they do not hold CSV text.
actors_csv_data = actorDf.to_csv(path_or_buf='top50Actors.csv')
movies_csv_data = movieDf.to_csv(path_or_buf='moviesOfActors.csv')