In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Extract the top 1000 actors and actresses from IMDb website
There are 500 actors and 500 actresses
The link is https://www.imdb.com/list/ls058011111/?sort=list_order,asc&mode=detail&page=1

In [4]:
names=[]
for page_number in range(10):
    page=requests.get("https://www.imdb.com/list/ls058011111/?sort=list_order,asc&mode=detail&page="+str(page_number+1))
    soup=BeautifulSoup(page.text)
    for div in soup.find_all(name='div',attrs={'class':'lister-item-content'}):
        for a in div.find_all(name='h3', attrs={'class':'lister-item-header'}):
            names.append(a.text.strip())

In [92]:
print(names[0])

1. 
 Robert De Niro


Each item in names list contains two rows: the rank (order) in the fist line and the name starts from a new line. We will remove the rank and only keep the name (the second line).

In [7]:
actors=[]
for name in names:
    actors.append(name.split('\n',1)[1][1:])

In [93]:
print(len(actors))
print(actors[:4])

1000
['Robert De Niro', 'Jack Nicholson', 'Tom Hanks', 'Marlon Brando']


## Web scraping the biography information from Wikipedia for the actors and actresses.
For the majority of these actors and actresses, i.e. 944 out of 1000, these is a biography table which gives the information such as birth date, occupation, years active, spouse(s), etc. We scrape the the birth date, occupation and spouse(s) information for these 944 actors and actresses.

In [54]:
borns=[]
occupations=[]
actors_new=[]
spouses=[]
for actor in actors:
    page=requests.get("https://en.wikipedia.org/wiki/"+actor)
    soup=BeautifulSoup(page.text)
    my_table=soup.find('table',{'class':'infobox biography vcard'})
    if my_table!=None:
        actors_new.append(actor)
        born_list=[]
        for born in my_table.find_all(name='span',attrs={'style':'display:none'}):
            born_list.append(born.text.strip())
        borns.append(born_list)
        occupation_list=[]
        for occup in my_table.find_all(name='td',attrs={'class':'role'}):
            occupation_list.append(occup.text.strip())
        occupations.append(occupation_list)
        spouse_list=[]
        for married in my_table.find_all(name='div',attrs={'style':'position:relative;display:inline-block;line-height:normal;'}):
            spouse_list.append(married.text.strip()) 
        spouses.append(spouse_list)

The lists borns, occupation, actors_new, and spouses all have length 944. The list spouses is a list of lists, where each item contains the spouse(s) information for the corresponding actor or actress. For example, Robert De Niro had two spouses: Diahnne Abbott from 1976 to 1988, and Grace Hightower from 1997 to 2018.

In [98]:
print(len(borns))
print(len(actors_new))
print(len(occupations))
print(actors_new[0],spouses[0])

944
944
944
Robert De Niro ['Diahnne Abbott(m.\xa01976; div.\xa01988)', 'Grace Hightower(m.\xa01997; separated\xa02018)']


### Next we create varaibles that indicate first marriage, second marriage, third marriage, and fourth marriage.
For example, spouse_first indates the first marraige inforamtion, including the spouse name and marriage(divorce) years. It shows 'Never married' if the spouse(s) information is empty from Wikipedia.

In [99]:
spouse_first=[]
spouse_second=[]
spouse_third=[]
spouse_fourth=[]
for spouse in spouses:
    if spouse!=[]:
        spouse_first.append(spouse[0])
    else:
        spouse_first.append('Never married')
    if len(spouse)>=2:
        spouse_second.append(spouse[1])
    else:
        spouse_second.append('None')
    if len(spouse)>=3:
        spouse_third.append(spouse[2])
    else:
        spouse_third.append('None')
    if len(spouse)>=4:
        spouse_fourth.append(spouse[3])
    else:
        spouse_fourth.append('None')
        


## Create a datafram by combining all the information we collected.

In [59]:
data_actors=pd.DataFrame({'Actor_name':actors_new,'Born':borns,'Occupation':occupations,'spouse':spouses,
                         'spouse_first':spouse_first,'spouse_second':spouse_second,'spouse_third':spouse_third,
                         'spouse_fourth':spouse_fourth})
data_actors.to_csv('actors.csv')

In [62]:
print(data_actors.describe())

           Actor_name Born Occupation spouse   spouse_first spouse_second  \
count             944  944        944    944            944           944   
unique            942  926        411    690            690           315   
top     Neve Campbell   []    [Actor]     []  Never married          None   
freq                2    4        197    254            254           629   

       spouse_third spouse_fourth  
count           944           944  
unique          128            53  
top            None          None  
freq            817           892  


We notice that 254 out of 944 actors or actresses never get married, at least from the biography on Wikipedia.

#### Now let's compute the rate of first marraige ends in divorce.

In [101]:
first_divorced=[]
for spouse_first in data_actors['spouse_first']:
    first_divorced.append('div' in spouse_first)

sum(first_divorced)

374

The divorce rate of the first marraige is 54.2% based on 690 actors and actresses.

In [88]:
sum(first_divorced)/data_actors['spouse_first'].nunique()

0.5420289855072464

### past statistics show that
$\bullet$ First marriage divorce rate = 45% – 50%;

$\bullet$ Second marriage divorce rate = 60% – 67%;

$\bullet$ Third marriage divorce rate = 73% – 74%.

According to divorce statistics (https://www.wf-lawyers.com/divorce-statistics-and-facts/), every 13 seconds, there is one divorce in America. That equates to 277 divorces per hour, 6,646 divorces per day, 46,523 divorces per week, and 2,419,196 divorces per year. Now let's test if the divorce rate of the first marriage of 54.2% based on 690 actors and actresses is significant higer than the general first marriage divorce rate of 47.5% (middle of 45%-50%). We used 2,419,196 divorces per year out of 5,093,044, and get approximately the divorce rate of 47.5%.

In [102]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
counts = np.array([374, 2419196])
nobs = np.array([690, 5093044])
stat, pval = proportions_ztest(counts, nobs)
print('{0:0.3f}'.format(pval))


0.000


A p-value of 0 indicates that the first marraige divorce rate for the actors and actresses is statistically significant higher than that of the general population in the US.

### We can perform the similar study for the second and the third marriage divorce rates.

In [103]:
second_divorced=[]
for spouse_second in data_actors['spouse_second']:
    second_divorced.append('div' in spouse_second)

In [104]:
sum(second_divorced)/data_actors['spouse_second'].nunique()

0.5142857142857142

We can see that the second marriage divorce rate for the actors and actresses is 51.4%. It is much lower than that of 60%-67% for the general popualtion. However, we do not have the exact counts for the second marraige divorces in the US. We will temporarily use 600 divorces out of 1000. I believe that the actual numbers are much higher, thus the p-value should be smaller than what we got here.

In [119]:
counts = np.array([sum(second_divorced), 600])
nobs = np.array([data_actors['spouse_second'].nunique(), 1000])
stat, pval = proportions_ztest(counts, nobs)
print('{0:0.3f}'.format(pval))

0.007


A p-value of 0.007 indicates that the second marriage divorce rate for the actors and actresses is statistically significant lower than that for the general population. What a surprise!

Similarly, the third marraige divorce rate of 50% for the actors and actressess is significant lower than that of 73%-74% for the general population with a p-value of at least 0.000.

In [120]:
third_divorced=[]
for spouse_third in data_actors['spouse_third']:
    third_divorced.append('div' in spouse_third)

In [121]:
sum(third_divorced)/data_actors['spouse_third'].nunique()

0.5

In [122]:
counts = np.array([sum(third_divorced), 73])
nobs = np.array([data_actors['spouse_third'].nunique(), 100])
stat, pval = proportions_ztest(counts, nobs)
print('{0:0.3f}'.format(pval))

0.000
