# IMPORTING REQUIRED LIBRARIES

In [39]:
from urllib.parse import urljoin                        #For building absolute page URLs
import time                                             #For waiting

from selenium import webdriver                          #For opening browser       
import pandas as pd                                     #For creating dataframe
from bs4 import BeautifulSoup                           #For scraping data
import numpy as np                                      #For array
from tqdm import tqdm                                   #For progress bar

# OPENING REQUIRED SITE IN BROWSER

In [40]:
# Start a Firefox session driven by Selenium; `driver` is reused by the cells below.
# NOTE(review): no matching driver.quit() appears in the visible cells - confirm the
# browser is closed once scraping is finished.
driver = webdriver.Firefox()

In [41]:
# Navigate the browser to the IMDb mobile STARmeter chart page.
driver.get('https://m.imdb.com/chart/starmeter/')

# GETTING HTML CODE OF OPENED WEBPAGE

In [42]:
# Snapshot the HTML currently held by the browser. No explicit wait is performed
# between the navigation above and this read.
html = driver.page_source

# CONVERTING EXTRACTED HTML CODE INTO SOUP TO SCRAPE DATA

In [43]:
# Parse the captured chart-page HTML using the "html.parser" backend.
soup = BeautifulSoup(html,"html.parser")

# EXTRACTING NAMES OF CELEBRITIES AND LINKS TO THEIR INDIVIDUAL WEBPAGES

In [44]:
# Every <a> element of the chart page, in document order.
a_list = soup.find_all("a")

# The celebrity anchors are picked purely by position: every 3rd anchor starting
# at index 43 and stopping before 343, i.e. 100 anchors in total.
# NOTE(review): these offsets are tied to the page layout at scrape time - verify
# them if the chart markup changes.
celebrity_anchors = [a_list[position] for position in range(43, 343, 3)]

# The anchor's aria-label is used as the name and its href as the profile link.
name_list = [anchor.get('aria-label') for anchor in celebrity_anchors]
link_list = [anchor.get('href') for anchor in celebrity_anchors]

# EXTRACTING THE FAMOUS WORK OF THESE CELEBRITIES

In [45]:
# Collect the text of every anchor tagged data-testid="nlib-known-for-title" on
# the chart page. `find_all` is used instead of the legacy `findAll` alias so the
# notebook calls the same API everywhere.
# NOTE(review): this list is gathered independently of name_list; the two only
# line up if every chart row carries exactly one such anchor - confirm.
a_list1 = soup.find_all('a', {'data-testid': 'nlib-known-for-title'})
famous_list = [anchor.text for anchor in a_list1]

# FUNCTION TO EXTRACT DATE OF BIRTH OF THE CELEBRITIES

In [46]:
def get_dob(soup):
    """Return the date-of-birth text from a parsed celebrity page.

    Looks inside the first <section data-testid="PersonalDetails">, takes the
    first <li data-testid="nm_pd_bl"> in it and returns the text of that
    item's first nested <li>.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns
    -------
    str or float
        The text of the matched element, or ``np.nan`` when any element along
        the path is missing.

    NOTE(review): assumes the first nested <li> is the date and not the place
    when only one of them is listed - confirm against a live page.
    """
    try:
        details = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        born_item = details.find_all('li', {'data-testid': 'nm_pd_bl'})[0]
        return born_item.find_all('li')[0].text
    except (IndexError, AttributeError):
        # IndexError: an expected element is absent from the page.
        # AttributeError: `soup` (or a node) is not a searchable object, e.g. None.
        return np.nan

# FUNCTION TO EXTRACT HEIGHT OF THE CELEBRITIES

In [47]:
def get_height(soup):
    """Return the height text from a parsed celebrity page.

    Looks inside the first <section data-testid="PersonalDetails">, takes the
    first <li data-testid="nm_pd_he"> in it and returns the text of that
    item's first nested <li>.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns
    -------
    str or float
        The text of the matched element, or ``np.nan`` when any element along
        the path is missing.
    """
    try:
        details = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        height_item = details.find_all('li', {'data-testid': 'nm_pd_he'})[0]
        return height_item.find_all('li')[0].text
    except (IndexError, AttributeError):
        # IndexError: an expected element is absent from the page.
        # AttributeError: `soup` (or a node) is not a searchable object, e.g. None.
        return np.nan

# FUNCTION TO EXTRACT BIRTHPLACE OF THE CELEBRITIES

In [48]:
def get_birthplace(soup):
    """Return the birthplace text from a parsed celebrity page.

    Looks inside the first <section data-testid="PersonalDetails">, takes the
    first <li data-testid="nm_pd_bl"> in it and returns the text of that
    item's second nested <li>.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns
    -------
    str or float
        The text of the matched element, or ``np.nan`` when any element along
        the path is missing (including when only one nested <li> exists).

    NOTE(review): assumes the second nested <li> is always the place - confirm
    for pages that list a place but no date.
    """
    try:
        details = soup.find_all('section', {'data-testid': 'PersonalDetails'})[0]
        born_item = details.find_all('li', {'data-testid': 'nm_pd_bl'})[0]
        return born_item.find_all('li')[1].text
    except (IndexError, AttributeError):
        # IndexError: an expected element is absent from the page.
        # AttributeError: `soup` (or a node) is not a searchable object, e.g. None.
        return np.nan

# FUNCTION TO EXTRACT AWARD DATA OF THE CELEBRITIES

In [49]:
def get_awards(soup):
    """Return the awards summary text from a parsed celebrity page.

    Takes the first <div data-testid="awards"> on the page and returns the text
    of the first <li class="ipc-inline-list__item"> inside it.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns
    -------
    str or float
        The text of the matched element, or ``np.nan`` when the awards block or
        its list item is missing.
    """
    try:
        awards_block = soup.find_all('div', {'data-testid': 'awards'})[0]
        return awards_block.find_all('li', {'class': 'ipc-inline-list__item'})[0].text
    except (IndexError, AttributeError):
        # IndexError: an expected element is absent from the page.
        # AttributeError: `soup` (or a node) is not a searchable object, e.g. None.
        return np.nan

# FUNCTION TO EXTRACT ROLES OF THE CELEBRITIES

In [50]:
def get_role(soup):
    """Return the list of role labels from a parsed celebrity page.

    Takes the first <ul> whose class attribute matches the string below and
    returns the text of each <li> inside it.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup-like object exposing ``find_all``).

    Returns
    -------
    list
        One string per <li> found, or ``[np.nan]`` when no matching <ul> exists.

    NOTE(review): the class string contains tokens that look build-generated
    ("sc-7f1a92f5-4", "kIoyyw"); the lookup will presumably stop matching when
    the site's markup is rebuilt - confirm and prefer a stable attribute.
    """
    try:
        role_container = soup.find_all('ul', {'class':'ipc-inline-list ipc-inline-list--show-dividers sc-7f1a92f5-4 kIoyyw baseAlt'})[0]
        return [item.text for item in role_container.find_all('li')]
    except (IndexError, AttributeError):
        # IndexError: no matching <ul> on the page.
        # AttributeError: `soup` is not a searchable object, e.g. None.
        return [np.nan]

In [51]:
# Quick check of get_role on the chart-page soup parsed earlier (not a celebrity
# page); the recorded output is [nan], i.e. the function's fallback value.
get_role(soup)

[nan]

# MAIN LOOP TO ITERATE THROUGH EACH CELEBRITY'S WEBPAGE AND EXTRACT DATA

In [52]:
# Base against which the hrefs collected from the chart page are resolved.
IMDB_BASE_URL = 'https://m.imdb.com/'

# One entry per celebrity, in the same order as link_list. The lists are
# re-created here so that re-running this cell does not append duplicates.
dob_list = []
height_list = []
role_list = []
award_list = []
birth_list = []
for link in tqdm(link_list, desc="Processing Links"):
    # urljoin resolves the href against the base, so an href that starts with
    # "/" does not produce a doubled slash and an absolute href is used as-is.
    driver.get(urljoin(IMDB_BASE_URL, link))
    # NOTE(review): the HTML is read immediately after navigation with no
    # explicit wait - confirm the details sections are present by then.
    page_html = driver.page_source
    # Page-local names keep the chart-page `html`/`soup` from earlier cells
    # intact, so those cells can be re-run after this loop.
    page_soup = BeautifulSoup(page_html, "html.parser")
    dob_list.append(get_dob(page_soup))
    height_list.append(get_height(page_soup))
    birth_list.append(get_birthplace(page_soup))
    role_list.append(get_role(page_soup))
    award_list.append(get_awards(page_soup))

Processing Links: 100%|██████████| 100/100 [11:27<00:00,  6.88s/it]


# CREATING DATAFRAME FOR COLLECTED DATASET 

In [53]:
# Assemble one row per celebrity from the parallel lists built above.
# Rank is derived from the number of scraped names (1..n) instead of a fixed
# 1..100, so it follows the size of the scrape.
# All lists must have the same length, otherwise pandas raises ValueError.
df = pd.DataFrame({
    'Rank': np.arange(1, len(name_list) + 1),
    'Name': name_list,
    'Date_of_birth': dob_list,
    'height': height_list,
    'Role': role_list,
    'Awards': award_list,
    'Famous_for': famous_list,
    'Birth_place': birth_list
})

# CHECKING THE DATASET

In [54]:
# Preview the first rows of the assembled table.
df.head()

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
0,1,Aria Mia Loberti,,5′ 3½″ (1.61 m),[Actress],,All the Light We Cannot See (2023– ),
1,2,Brie Larson,"October 1, 1989",5′ 6¾″ (1.70 m),"[Actress, Producer, Director]",66 wins & 68 nominations total,Room (2015),"Sacramento, California, USA"
2,3,Evan Ellingson,"July 1, 1988",6′ 1″ (1.85 m),[Actor],1 nomination,My Sister's Keeper (2009),"Los Angeles County, California, USA"
3,4,Carla Gugino,"August 29, 1971",5′ 5″ (1.65 m),"[Actress, Producer, Soundtrack]",3 wins & 15 nominations,Watchmen (2009),"Sarasota, Florida, USA"
4,5,Matthew Perry,"August 19, 1969",6′ (1.83 m),"[Actor, Producer, Writer]",6 wins & 34 nominations total,Friends (1994–2004),"Williamstown, Massachusetts, USA"


In [55]:
# Preview the last rows of the assembled table.
df.tail()

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
95,96,Brad Pitt,"December 18, 1963",5′ 11″ (1.80 m),"[Producer, Actor, Executive]",120 wins & 224 nominations total,Fight Club (1999),"Shawnee, Oklahoma, USA"
96,97,Adèle Exarchopoulos,"November 22, 1993",5′ 8″ (1.73 m),[Actress],28 wins & 31 nominations,Blue Is the Warmest Colour (2013),"Paris, France"
97,98,Juno Temple,"July 21, 1989",5′ 2″ (1.57 m),"[Actress, Soundtrack]",6 wins & 19 nominations total,Atonement (2007),"London, England, UK"
98,99,Synnove Karlsen,"July 30, 1996",5′ 5¾″ (1.67 m),[Actress],,Last Night in Soho (2021),"Glasgow, Scotland, UK"
99,100,Denzel Washington,"December 28, 1954",6′ 0½″ (1.84 m),"[Actor, Producer, Director]",87 wins & 209 nominations total,Fences (2016),"Mount Vernon, New York, USA"


In [56]:
# Spot-check 20 random rows; the fixed random_state makes the same rows appear
# on every re-run of the notebook.
df.sample(20, random_state=42)

Unnamed: 0,Rank,Name,Date_of_birth,height,Role,Awards,Famous_for,Birth_place
1,2,Brie Larson,"October 1, 1989",5′ 6¾″ (1.70 m),"[Actress, Producer, Director]",66 wins & 68 nominations total,Room (2015),"Sacramento, California, USA"
54,55,Jessie Buckley,"December 28, 1989",5′ 7″ (1.70 m),"[Actress, Music Department, Soundtrack]",25 wins & 79 nominations total,I'm Thinking of Ending Things (2020),"Killarney, Ireland"
70,71,Rachael Harris,"January 12, 1968",5′ 1″ (1.55 m),"[Actress, Camera and Electrical Department, Pr...",2 wins & 2 nominations,The Hangover (2009),"Worthington, Ohio, USA"
81,82,Anya Taylor-Joy,"April 16, 1996",5′ 8″ (1.73 m),"[Actress, Soundtrack]",19 wins & 44 nominations total,The Queen's Gambit (2020– ),"Miami, Florida, USA"
44,45,Jodie Foster,"November 19, 1962",5′ 3″ (1.60 m),"[Actress, Director, Producer]",61 wins & 53 nominations total,The Silence of the Lambs (1991),"Los Angeles, California, USA"
83,84,Jacob Elordi,"June 26, 1997",6′ 5″ (1.96 m),"[Actor, Writer, Producer]",2 wins & 2 nominations,The Kissing Booth (2018),"Brisbane, Australia"
46,47,Rachele Brooke Smith,"November 7, 1987",5′ 6″ (1.68 m),"[Actress, Producer]",2 wins,The Nice Guys (2016),
0,1,Aria Mia Loberti,,5′ 3½″ (1.61 m),[Actress],,All the Light We Cannot See (2023– ),
97,98,Juno Temple,"July 21, 1989",5′ 2″ (1.57 m),"[Actress, Soundtrack]",6 wins & 19 nominations total,Atonement (2007),"London, England, UK"
92,93,Stephen Graham,"August 3, 1973",5′ 6″ (1.68 m),"[Actor, Producer, Writer]",13 wins & 21 nominations total,Boiling Point (2021),"Liverpool, Merseyside, England, UK"


# SAVING THE DATASET

In [57]:
# Write the table to celebrity_data.csv in the working directory, without the
# DataFrame index column. The 'Role' cells hold Python lists, so they are
# written in their string form.
df.to_csv('celebrity_data.csv',index= False)