# IMPORTING REQUIRED LIBRARIES

In [1]:
from selenium import webdriver                          #For opening browser       
import pandas as pd                                     #For creating dataframe
from bs4 import BeautifulSoup                           #For scraping data
import time                                             #For waiting
import numpy as np                                      #For array
from tqdm import tqdm                                   #For progress bar

# OPENING REQUIRED SITE IN BROWSER

In [2]:
# Start a Firefox browser session controlled by Selenium; `driver` is reused by the later cells.
driver = webdriver.Firefox()

In [3]:
# Point the browser at the mobile IMDb chart page (/chart/starmeter/).
driver.get('https://m.imdb.com/chart/starmeter/')

# GETTING HTML CODE OF OPENED WEBPAGE

In [4]:
# Snapshot the HTML of whatever page the browser currently has loaded.
html = driver.page_source

# CONVERTING EXTRACTED HTML CODE INTO SOUP TO SCRAPE DATA

In [5]:
# Parse the captured HTML with the "html.parser" backend; later cells query `soup`.
soup = BeautifulSoup(html,"html.parser")

# EXTRACTING NAMES OF CELEBRITIES AND LINKS TO THEIR INDIVIDUAL WEBPAGES

In [6]:
# The chart is scraped positionally: the anchors that carry a celebrity's name
# (aria-label) and profile link (href) are picked by their index among all <a>
# tags of the page. These offsets are tied to the page layout at scrape time.
# NOTE(review): re-check them whenever the chart page markup changes.
FIRST_CELEB_ANCHOR = 43    # index in a_list of the first celebrity anchor
ANCHORS_PER_CELEB = 3      # stride between consecutive celebrity anchors
N_CELEBS = 100             # number of chart entries to collect

a_list = soup.find_all("a")
celeb_anchors = [
    a_list[FIRST_CELEB_ANCHOR + k * ANCHORS_PER_CELEB] for k in range(N_CELEBS)
]
name_list = [anchor.get('aria-label') for anchor in celeb_anchors]
link_list = [anchor.get('href') for anchor in celeb_anchors]

# Fail fast if the positional offsets no longer land on anchors that carry
# both a name and a link, instead of silently collecting None values.
assert all(name_list) and all(link_list), (
    "anchor offsets no longer match the chart layout"
)

# EXTRACTING THE FAMOUS WORK OF THESE CELEBRITIES

In [7]:
# Collect the text of every "known for" title anchor on the chart, in page order.
# NOTE(review): these titles are later matched to names purely by position, so
# this assumes every chart entry has exactly one such anchor -- TODO confirm.
a_list1 = soup.find_all('a', {'data-testid': 'nlib-known-for-title'})
famous_list = [anchor.text for anchor in a_list1]

# FUNCTION TO EXTRACT DATE OF BIRTH OF THE CELEBRITIES

In [8]:
def get_dob(soup):
    """Return the date-of-birth text from a parsed celebrity page.

    Looks inside the first ``PersonalDetails`` section for the first
    ``nm_pd_bl`` list item and returns the text of its first nested ``<li>``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one celebrity page.

    Returns
    -------
    str or float
        The text of that list item, or ``np.nan`` when any of the expected
        elements is missing.
    """
    try:
        section = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        born_item = section.find_all('li', {'data-testid': 'nm_pd_bl'})[0]
        dob = born_item.find_all('li')[0].text
    except (IndexError, AttributeError):
        # A missing element (or an unusable page object) is recorded as a
        # missing value so the scraping loop can keep going.
        dob = np.nan
    return dob

# FUNCTION TO EXTRACT HEIGHT OF THE CELEBRITIES

In [9]:
def get_height(soup):
    """Return the height text from a parsed celebrity page.

    Looks inside the first ``PersonalDetails`` section for the first
    ``nm_pd_he`` list item and returns the text of its first nested ``<li>``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one celebrity page.

    Returns
    -------
    str or float
        The text of that list item, or ``np.nan`` when any of the expected
        elements is missing.
    """
    try:
        section = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        height_item = section.find_all('li', {'data-testid': 'nm_pd_he'})[0]
        height = height_item.find_all('li')[0].text
    except (IndexError, AttributeError):
        # A missing element (or an unusable page object) is recorded as a
        # missing value so the scraping loop can keep going.
        height = np.nan
    return height

# FUNCTION TO EXTRACT BIRTHPLACE OF THE CELEBRITIES

In [10]:
def get_birthplace(soup):
    """Return the birthplace text from a parsed celebrity page.

    Looks inside the first ``PersonalDetails`` section for the first
    ``nm_pd_bl`` list item and returns the text of its second nested ``<li>``
    (the first nested item is the one ``get_dob`` reads).

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one celebrity page.

    Returns
    -------
    str or float
        The text of that list item, or ``np.nan`` when any of the expected
        elements is missing.
    """
    try:
        section = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        born_item = section.find_all('li', {'data-testid': 'nm_pd_bl'})[0]
        bp = born_item.find_all('li')[1].text
    except (IndexError, AttributeError):
        # A missing element (or an unusable page object) is recorded as a
        # missing value so the scraping loop can keep going.
        bp = np.nan
    return bp

# FUNCTION TO EXTRACT AWARD DATA OF THE CELEBRITIES

In [11]:
def get_awards(soup):
    """Return the awards summary text from a parsed celebrity page.

    Takes the first ``div`` whose ``data-testid`` is ``awards`` and returns
    the text of its first ``li`` with class ``ipc-inline-list__item``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one celebrity page.

    Returns
    -------
    str or float
        The text of that list item, or ``np.nan`` when the awards block or
        its list item is missing.
    """
    try:
        awards_div = soup.find_all('div', {'data-testid': 'awards'})[0]
        awards = awards_div.find_all('li', {'class': 'ipc-inline-list__item'})[0].text
    except (IndexError, AttributeError):
        # A missing element (or an unusable page object) is recorded as a
        # missing value so the scraping loop can keep going.
        awards = np.nan
    return awards

# FUNCTION TO EXTRACT ROLES OF THE CELEBRITIES

In [12]:
def get_role(soup):
    """Return the list of role labels from a parsed celebrity page.

    Takes the first ``ul`` whose class attribute matches the selector below
    and returns the text of each ``li`` inside it.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one celebrity page.

    Returns
    -------
    list
        One string per list item, or ``[np.nan]`` when the list cannot be
        located.
    """
    # NOTE(review): the class string contains what look like build-generated
    # tokens ("sc-d8941411-2", "cdJsTz"); presumably they change when the site
    # is redeployed -- verify, and prefer a stable attribute if one exists.
    role_ul_attrs = {
        'class': 'ipc-inline-list ipc-inline-list--show-dividers sc-d8941411-2 cdJsTz baseAlt'
    }
    try:
        role_items = soup.find_all('ul', role_ul_attrs)[0].find_all('li')
        return [item.text for item in role_items]
    except (IndexError, AttributeError):
        # Keep the "list with a single NaN" placeholder for pages where the
        # role list is missing.
        return [np.nan]

In [15]:
# Quick manual check of get_role on whatever page `soup` currently holds.
# NOTE(review): in a top-to-bottom run `soup` is still the chart page parsed
# earlier, so the output recorded below presumably came from a session in which
# `soup` held a celebrity page -- TODO confirm or drop this cell.
get_role(soup)

['Actor', 'Producer', 'Director']

# MAIN LOOP TO ITERATE THROUGH EACH CELEBRITY'S WEBPAGE AND EXTRACT DATA

In [16]:
# Visit every celebrity page collected from the chart and extract the personal
# details with the get_* helpers. The five result lists stay positionally
# aligned with link_list.
BASE_URL = 'https://m.imdb.com/'

dob_list = []
height_list = []
role_list = []
award_list = []
birth_list = []
for href in tqdm(link_list, desc="Processing Links"):
    if not href:
        # No profile link was scraped for this entry: record the same
        # missing-value placeholders the get_* helpers return, without loading
        # a page, so every list keeps one element per chart entry.
        dob_list.append(np.nan)
        height_list.append(np.nan)
        birth_list.append(np.nan)
        role_list.append([np.nan])
        award_list.append(np.nan)
        continue

    # lstrip('/') keeps the URL free of a doubled slash when the scraped href
    # is site-relative (presumably '/name/...'; verify against link_list).
    driver.get(BASE_URL + href.lstrip('/'))

    # Parse into a loop-local name so the chart-page `html` / `soup` built by
    # the earlier cells are not overwritten and those cells stay re-runnable.
    person_soup = BeautifulSoup(driver.page_source, "html.parser")
    dob_list.append(get_dob(person_soup))
    height_list.append(get_height(person_soup))
    birth_list.append(get_birthplace(person_soup))
    role_list.append(get_role(person_soup))
    award_list.append(get_awards(person_soup))

# NOTE(review): nothing in this notebook closes the browser session; consider
# calling driver.quit() once scraping is finished.

Processing Links:   0%|          | 0/100 [00:00<?, ?it/s]

Processing Links: 100%|██████████| 100/100 [11:57<00:00,  7.17s/it]


# CREATING DATAFRAME FOR COLLECTED DATASET 

In [17]:
# Assemble one row per chart entry. All lists are matched by position, and the
# rank is simply the 1-based position in name_list, so it is derived from the
# list length instead of a hard-coded 100.
df = pd.DataFrame({
    'Rank': np.arange(1, len(name_list) + 1),
    'Name': name_list,
    'Date_of_birth': dob_list,
    'height': height_list,
    'Role': role_list,
    'Awards': award_list,
    'Famous_for': famous_list,
    'Birth_place': birth_list
})

# CHECKING THE DATASET

In [18]:
# Display the first rows of the assembled table as a quick sanity check.
df.head()

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
0,1,Serinda Swan,"July 11, 1984",5′ 7″ (1.70 m),"[Actress, Producer, Director]",2 nominations,Coroner (2019–2022),"West Vancouver, British Columbia, Canada"
1,2,Alan Ritchson,"November 28, 1982",6′ 3″ (1.91 m),"[Actor, Producer, Director]",3 wins & 3 nominations,Teenage Mutant Ninja Turtles (2014),"Grand Forks, North Dakota, USA"
2,3,Sofia Boutella,"April 3, 1982",5′ 5″ (1.65 m),"[Actress, Soundtrack]",1 win & 2 nominations,Kingsman: The Secret Service (2014),"Bab El Oued, Algeria"
3,4,Sydney Sweeney,"September 12, 1997",5′ 3½″ (1.61 m),"[Actress, Producer]",4 wins & 17 nominations total,Euphoria (2019–2025),"Spokane, Washington, USA"
4,5,Katy M. O'Brian,"February 12, 1989",5′ 7″ (1.70 m),"[Actress, Stunts]",,Ant-Man and the Wasp: Quantumania (2023),"Indianapolis, Indiana, USA"


In [19]:
# Display the last rows to confirm the table is filled through the final rank.
df.tail()

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
95,96,Jesse Plemons,"April 2, 1988",5′ 10″ (1.78 m),"[Actor, Music Department, Producer]",5 wins & 43 nominations total,The Irishman (2019),"Dallas, Texas, USA"
96,97,Eve Best,"July 31, 1971",5′ 10″ (1.78 m),"[Actress, Director]",2 nominations,The King's Speech (2010),"London, England, UK"
97,98,Annie Murphy,"December 19, 1986",5′ 7″ (1.70 m),"[Actress, Writer, Soundtrack]",7 wins & 15 nominations total,Schitt's Creek (2015–2020),"Ottawa, Ontario, Canada"
98,99,Karen Allen,"October 5, 1951",5′ 5″ (1.65 m),"[Actress, Director, Soundtrack]",12 wins & 6 nominations,Raiders of the Lost Ark (1981),"Carrollton, Illinois, USA"
99,100,Alyssa Milano,"December 19, 1972",5′ 2″ (1.57 m),"[Actress, Producer, Writer]",8 wins & 8 nominations,Charmed (1998–2006),"Brooklyn, New York City, New York, USA"


In [20]:
# Spot-check 20 random rows; the fixed seed makes the displayed sample
# reproducible across re-runs of the notebook.
df.sample(20, random_state=42)

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
73,74,Madelyn Cline,"December 21, 1997",5′ 6″ (1.68 m),"[Actress, Soundtrack]",1 win & 1 nomination,Glass Onion (2022),"Charleston, South Carolina, USA"
63,64,Leonardo DiCaprio,"November 11, 1974",6′ (1.83 m),"[Producer, Actor, Writer]",103 wins & 272 nominations total,Inception (2010),"Hollywood, Los Angeles, California, USA"
47,48,Robert Patrick,"November 5, 1958",6′ (1.83 m),"[Actor, Producer]",5 wins & 6 nominations,Terminator 2: Judgment Day (1991),"Marietta, Georgia, USA"
3,4,Sydney Sweeney,"September 12, 1997",5′ 3½″ (1.61 m),"[Actress, Producer]",4 wins & 17 nominations total,Euphoria (2019–2025),"Spokane, Washington, USA"
29,30,Juno Temple,"July 21, 1989",5′ 2″ (1.57 m),"[Actress, Soundtrack]",6 wins & 22 nominations total,Atonement (2007),"London, England, UK"
0,1,Serinda Swan,"July 11, 1984",5′ 7″ (1.70 m),"[Actress, Producer, Director]",2 nominations,Coroner (2019–2022),"West Vancouver, British Columbia, Canada"
21,22,Jenna Ortega,"September 27, 2002",5′ 1″ (1.55 m),"[Actress, Producer, Soundtrack]",8 wins & 22 nominations total,Scream (2022),"Coachella Valley, California, USA"
92,93,Joel Kinnaman,"November 25, 1979",6′ 2½″ (1.89 m),"[Actor, Producer]",1 win & 5 nominations,The Killing (2011–2014),"Stockholm, Sweden"
57,58,Jeremy Allen White,"February 17, 1991",5′ 7″ (1.70 m),"[Actor, Writer, Producer]",5 wins & 16 nominations total,Shameless (2011–2021),"Brooklyn, New York, USA"
71,72,Sarah Rafferty,"December 6, 1972",5′ 9″ (1.75 m),"[Actress, Producer, Soundtrack]",,Suits (2011–2019),"New Canaan, Connecticut, USA"


# SAVING THE DATASET

In [21]:
# Write the table to disk without the DataFrame index column.
df.to_csv(path_or_buf='celebrity_data.csv', index=False)