# The Bachelor

In this notebook I will dive into the data from the show The Bachelor. Let's get into it.

In [1]:
%pip install --upgrade plotly pandas numpy plotly seaborn matplotlib nbformat


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Load the packages

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

### Format the Notebook

In [3]:
# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

### Load the dataset

In [4]:
# Read the contestant table from a CSV file located via a relative path
# (resolved against the notebook's working directory).
df_bachelor = pd.read_csv('historical_bachelor_contestants.csv')

### An initial look at the dataset

In [5]:
# Print the frame's dimensions, column labels and per-column dtypes, in that order.
for summary in (df_bachelor.shape, df_bachelor.columns, df_bachelor.dtypes):
    print(summary)

(479, 8)
Index(['Unnamed: 0', 'Age', 'Eliminated', 'Hometown', 'Name', 'Occupation',
       'Outcome', 'Season'],
      dtype='object')
Unnamed: 0      int64
Age           float64
Eliminated     object
Hometown       object
Name           object
Occupation     object
Outcome        object
Season          int64
dtype: object


In [6]:
# Report, per column, whether it contains at least one missing value.
print(df_bachelor.isnull().any())

Unnamed: 0    False
Age            True
Eliminated     True
Hometown      False
Name          False
Occupation    False
Outcome        True
Season        False
dtype: bool


### Data Clean Up

In [7]:
# Remove bracketed alphanumeric markers such as "[a]" or "[12]" from the text columns.
footnote_pattern = r'\[[a-zA-Z0-9]+\]'
df_bachelor['Hometown'] = df_bachelor['Hometown'].str.replace(footnote_pattern, '', regex=True)
df_bachelor['Name'] = df_bachelor['Name'].str.replace(footnote_pattern, '', regex=True)

# Split "City, State" on commas. The piece after the first comma keeps the
# space that followed it (" Kansas"), so both pieces are stripped; otherwise
# grouping or matching on State would have to account for that leading space.
# (`last_name` is the original variable name; it holds the hometown pieces.)
last_name = df_bachelor['Hometown'].str.split(',', expand=True)
df_bachelor['State'] = last_name[1].str.strip()
df_bachelor['City'] = last_name[0].str.strip()
df_bachelor[:2]

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season,State,City
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1,Kansas,Chanute
1,1,29.0,Runner-Up,"Miami, Florida",Trista Rehn,Miami Heat Dancer,,1,Florida,Miami


In [8]:
# Preview the first five rows after the clean-up step.
df_bachelor.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season,State,City
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1,Kansas,Chanute
1,1,29.0,Runner-Up,"Miami, Florida",Trista Rehn,Miami Heat Dancer,,1,Florida,Miami
2,2,24.0,Week 5,"Dallas, Texas",Shannon Oliver,Financial Management Consultant,,1,Texas,Dallas
3,3,24.0,Week 4,"Tempe, Arizona",Kim,Nanny,,1,Arizona,Tempe
4,4,22.0,Week 3,"Terra Haute, Indiana",Cathy Grimes,Graduate Student,,1,Indiana,Terra Haute


In [9]:
# Preview the last five rows after the clean-up step.
df_bachelor.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season,State,City
474,26,28.0,,"Plano, Texas",Erin Landry,Human Resources Manager,Week 1,23,Texas,Plano
475,27,26.0,,"West Hollywood, California","Adrianne ""Jane"" Aver",Social Worker,Week 1,23,California,West Hollywood
476,28,26.0,,"Dallas, Texas",Laura Pellerito,Accountant,Week 1,23,Texas,Dallas
477,29,24.0,,"Austin, Texas",Revian Chang,Nurse,Week 1,23,Texas,Austin
478,30,25.0,,"Castle Pines, Colorado",Tahzjuan Hawkins,Business Development Associate,Week 1,23,Colorado,Castle Pines


In [10]:
# Rows whose 'Eliminated' column is exactly "Winner" (missing values never match).
winners_elem = df_bachelor[df_bachelor['Eliminated'] == 'Winner']

In [11]:
# Rows whose 'Outcome' column is exactly "Winner" (missing values never match).
win_outcome = df_bachelor[df_bachelor['Outcome'] == 'Winner']

In [12]:
# Stack both winner selections row-wise; the original row labels are kept
# (the index is not reset).
df_winners = pd.concat([winners_elem, win_outcome], axis=0)

In [13]:
# Drop the columns that are not needed for the winners analysis.
# Rebinding instead of `inplace=True`, together with errors='ignore', makes
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for columns that are already gone.
df_winners = df_winners.drop(
    columns=['Eliminated', 'Outcome', 'Unnamed: 0'],
    errors='ignore',
)

In [14]:
# Show the column labels currently present on the winners frame.
df_winners.columns

Index(['Age', 'Hometown', 'Name', 'Occupation', 'Season', 'State', 'City'], dtype='object')

In [15]:
# Count winners per age.
# The original cell began with `df_winners.sort_values(by='Age')`, whose
# result was discarded, so it had no effect; it is removed here. groupby
# already returns the groups ordered by the grouping key.
# Named aggregation yields the final column names directly: 'age-total' is
# the number of non-null 'Season' entries in each age group.
age_count = (
    df_winners
    .groupby('Age', as_index=False)
    .agg(**{'age-total': ('Season', 'count')})
)

In [16]:
# Bar chart of how many winners there were at each age.
# Columns are referenced by name (plotly looks them up in `age_count`)
# instead of passing the Series objects, and the figure gets a title and a
# readable y-axis label so it stands on its own.
fig = px.bar(
    age_count,
    x='Age',
    y='age-total',
    title='Number of winners by age',
    labels={'age-total': 'Number of winners'},
)
fig.show()

In [17]:
# Helper column: a copy of State used only as something to count per state.
df_winners['Count'] = df_winners['State']

# Number of winners per state.
# 'count' is a true reducer (one number per group). The original passed
# pd.Series.value_counts, which returns a Series per group and only worked
# because each group holds a single distinct value.
cfg = df_winners.groupby(by=['State'], as_index=False).agg({'Count': 'count'})

# Attach the per-state count to every winner's city/state/occupation row.
asd = df_winners[['City', 'State', 'Occupation']]
merged = pd.merge(asd, cfg, on='State')

In [18]:
# Sunburst of winners drilled down State -> City -> Occupation, coloured by
# the per-state 'Count' column with the colour scale clamped to [1, 3].
sunburst_levels = ['State', 'City', 'Occupation']
fig = px.sunburst(
    data_frame=merged,
    path=sunburst_levels,
    color='Count',
    range_color=[1, 3],
)
fig.show()

In [19]:
# NOTE: `work` starts out as another name for df_bachelor (no copy), so the
# helper 'count' column below is added to df_bachelor itself; a later cell
# drops 'count' again.
work = df_bachelor
work['count'] = df_bachelor['Occupation']

# Occurrences of each occupation, sorted ascending, keeping the last 20 rows
# (i.e. the 20 largest counts).
work = (
    work
    .groupby(by='Occupation', as_index=False)
    .agg({'count': pd.Series.count})
    .sort_values(by='count')
    .iloc[-20:]
)

In [20]:
# Horizontal bar chart of the occupation counts held in `work`.
# Columns are referenced by name instead of passing the Series objects, and
# the figure gets a title and a readable count label so it stands on its own.
fig = px.bar(
    work,
    x='count',
    y='Occupation',
    orientation='h',
    color='count',
    title='Most common contestant occupations',
    labels={'count': 'Number of contestants'},
)
fig.show()

In [21]:
# NOTE: `fix` is another name for df_bachelor (no copy), so 'Placement' is
# added to df_bachelor as well.
fix = df_bachelor

# Build 'Placement' from 'Eliminated' for the first 152 rows (positions
# 0-151) and from 'Outcome' for the remaining rows, then put 0 wherever the
# result is missing.
fix['Placement'] = pd.concat(
    [fix['Eliminated'].iloc[:152], fix['Outcome'].iloc[152:]]
).fillna(0)


In [22]:
# Preview the first five rows, including the new 'Placement' column.
fix.head()

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season,State,City,count,Placement
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1,Kansas,Chanute,Event Planner,Winner
1,1,29.0,Runner-Up,"Miami, Florida",Trista Rehn,Miami Heat Dancer,,1,Florida,Miami,Miami Heat Dancer,Runner-Up
2,2,24.0,Week 5,"Dallas, Texas",Shannon Oliver,Financial Management Consultant,,1,Texas,Dallas,Financial Management Consultant,Week 5
3,3,24.0,Week 4,"Tempe, Arizona",Kim,Nanny,,1,Arizona,Tempe,Nanny,Week 4
4,4,22.0,Week 3,"Terra Haute, Indiana",Cathy Grimes,Graduate Student,,1,Indiana,Terra Haute,Graduate Student,Week 3


In [23]:
# Turn the free-text 'Placement' labels into integer codes.
# The replacements are substring-based and applied strictly in the order
# listed, so the order of the tables below must not be changed.

# 1) Drop parenthesised / bracketed alphanumeric markers, e.g. "(a)" or "[3]".
for marker_pattern in (r'\([a-zA-Z0-9]+\)', r'\[[a-zA-Z0-9]+\]'):
    fix['Placement'] = fix['Placement'].str.replace(marker_pattern, '', regex=True)

# 2) Text labels: "Week N" -> "N", runner-up spellings -> "9",
#    "Participating" -> "1".
label_replacements = (
    ('Week ', ''),
    ('Runner-up', '9'),
    ('Participating', '1'),
    ('Runner-Up', '9'),
)
for old_text, new_text in label_replacements:
    fix['Placement'] = fix['Placement'].str.replace(old_text, new_text)

fix['Placement'] = fix['Placement'].str.strip()

# 3) Collapse two-digit values onto single-digit codes, then "Winner" -> "10".
#    NOTE(review): the reasoning behind each individual code is not visible
#    in this notebook - confirm before changing any pair.
code_replacements = (
    ('11', '7'),
    ('15', '5'),
    ('16', '6'),
    ('12', '2'),
    ('13', '3'),
    ('17', '7'),
    ('19', '8'),
    ('18', '8'),
    ('14', '4'),
    ('21', '8'),
    ('Winner', '10'),
)
for old_text, new_text in code_replacements:
    fix['Placement'] = fix['Placement'].str.replace(old_text, new_text)

# Anything still missing at this point is given code "1".
fix['Placement'] = fix['Placement'].fillna('1')

# Manual overrides for two specific row labels.
for row_label in (355, 356):
    fix.loc[row_label, 'Placement'] = '1'

fix['Placement'] = fix['Placement'].astype(int)


In [24]:
# Postal abbreviation -> full name, for a handful of abbreviations.
state_fix_dic = {
    'DC': 'District of Columbia',
    'IL': 'Illinois',
    'VA': 'Virginia',
    'CA': 'California',
    'PA': 'Pennsylvania',
    'FL': 'Florida',
    'NC': 'North Carolina',
    'MS': 'Mississippi',
    'MD': 'Maryland',
    'OH': 'Ohio',
    'NY': 'New York',
}

# Full state name -> two-letter postal abbreviation for the 50 states.
# (District of Columbia is not included in this table.)
states = {
    # A
    'Alaska': 'AK', 'Alabama': 'AL', 'Arkansas': 'AR', 'Arizona': 'AZ',
    # C
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    # D - H
    'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    # I
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    # K - L
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    # M
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    # N
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    # O
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    # P - R
    'Pennsylvania': 'PA', 'Rhode Island': 'RI',
    # S
    'South Carolina': 'SC', 'South Dakota': 'SD',
    # T - U
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    # V
    'Vermont': 'VT', 'Virginia': 'VA',
    # W
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}


In [25]:
# Remove literal periods from the State values; regex=False makes '.' a
# plain character rather than a match-anything wildcard.
fix['State'] = fix['State'].str.replace('.', '', regex=False)

In [26]:
# Rebind `fix` to a new frame without the raw/helper columns. drop() returns
# a new object here, so the frame `fix` previously referred to keeps them.
fix = fix.drop(
    columns=['Eliminated', 'Hometown', 'Outcome', 'Unnamed: 0', 'count'],
)

In [27]:
# Preview the first five rows of the trimmed frame.
fix.head()

Unnamed: 0,Age,Name,Occupation,Season,State,City,Placement
0,23.0,Amanda Marsh,Event Planner,1,Kansas,Chanute,10
1,29.0,Trista Rehn,Miami Heat Dancer,1,Florida,Miami,9
2,24.0,Shannon Oliver,Financial Management Consultant,1,Texas,Dallas,5
3,24.0,Kim,Nanny,1,Arizona,Tempe,4
4,22.0,Cathy Grimes,Graduate Student,1,Indiana,Terra Haute,3


In [28]:
# Subsets by numeric placement code: 10, 9 and 8.
# NOTE: the spellings `winnners` and `thrid_place` are kept as-is because the
# plotting cells below refer to these exact names.
winnners = fix[fix['Placement'] == 10]
runner_ups = fix[fix['Placement'] == 9]
thrid_place = fix[fix['Placement'] == 8]

In [29]:
# Age per state for the Placement == 10 subset, coloured by season.
px.bar(data_frame=winnners, x='State', y='Age', color='Season')

In [30]:
# Age per state for the Placement == 9 subset, coloured by season.
px.bar(data_frame=runner_ups, x='State', y='Age', color='Season')

In [31]:
# Age per state for the Placement == 8 subset, coloured by season.
px.bar(data_frame=thrid_place, x='State', y='Age', color='Season')

In [32]:
# Preview the first five rows of the full contestant frame.
df_bachelor.head()

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season,State,City,count,Placement
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1,Kansas,Chanute,Event Planner,10
1,1,29.0,Runner-Up,"Miami, Florida",Trista Rehn,Miami Heat Dancer,,1,Florida,Miami,Miami Heat Dancer,9
2,2,24.0,Week 5,"Dallas, Texas",Shannon Oliver,Financial Management Consultant,,1,Texas,Dallas,Financial Management Consultant,5
3,3,24.0,Week 4,"Tempe, Arizona",Kim,Nanny,,1,Arizona,Tempe,Nanny,4
4,4,22.0,Week 3,"Terra Haute, Indiana",Cathy Grimes,Graduate Student,,1,Indiana,Terra Haute,Graduate Student,3
