# The Bachelor

In this notebook, I will dive into the data from the show The Bachelor. Let's get it.

In [1]:
%pip install --upgrade plotly pandas numpy plotly seaborn matplotlib nbformat


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Load the packages

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

### Format the Notebook

In [3]:
# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

### Load the dataset

In [4]:
# Read the contestants CSV; the path is relative to the working directory.
df_bachelor = pd.read_csv(filepath_or_buffer='historical_bachelor_contestants.csv')

### An initial look at the dataset

In [5]:
# Print the frame's dimensions, column labels, and per-column dtypes,
# one after another on separate lines.
print(df_bachelor.shape, df_bachelor.columns, df_bachelor.dtypes, sep='\n')

(479, 8)
Index(['Unnamed: 0', 'Age', 'Eliminated', 'Hometown', 'Name', 'Occupation',
       'Outcome', 'Season'],
      dtype='object')
Unnamed: 0      int64
Age           float64
Eliminated     object
Hometown       object
Name           object
Occupation     object
Outcome        object
Season          int64
dtype: object


In [6]:
# For each column, report whether it contains at least one missing value.
print(df_bachelor.isnull().any(axis=0))

Unnamed: 0    False
Age            True
Eliminated     True
Hometown      False
Name          False
Occupation    False
Outcome        True
Season        False
dtype: bool


In [7]:
# Preview the first five rows.
df_bachelor.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1
1,1,29.0,Runner-Up,"Miami, Florida",Trista Rehn,Miami Heat Dancer,,1
2,2,24.0,Week 5,"Dallas, Texas",Shannon Oliver,Financial Management Consultant,,1
3,3,24.0,Week 4,"Tempe, Arizona",Kim,Nanny,,1
4,4,22.0,Week 3,"Terra Haute, Indiana",Cathy Grimes,Graduate Student,,1


In [8]:
# Rows where the 'Eliminated' column carries the value "Winner".
winners_elem = df_bachelor.query(expr='Eliminated == "Winner"')

In [9]:
# Rows where the 'Outcome' column carries the value "Winner".
win_outcome = df_bachelor.query(expr='Outcome == "Winner"')

In [15]:
# Stack the two winner subsets row-wise; each row keeps its original index label.
df_winners = pd.concat(objs=[winners_elem, win_outcome], axis=0)
df_winners

Unnamed: 0.1,Unnamed: 0,Age,Eliminated,Hometown,Name,Occupation,Outcome,Season
0,0,23.0,Winner,"Chanute, Kansas",Amanda Marsh,Event Planner,,1
25,0,27.0,Winner,"Gloucester, New Jersey",Helene Eksterowicz,School Psychologist,,2
50,0,22.0,Winner,"Huntington Beach, California",Jessica Bowlin,Student,,5
75,0,24.0,Winner,"Pembroke Pines, Florida",Jennifer Wilson,Teacher,,9
102,0,26.0,Winner,"San Francisco, California",Tessa Horst,Social Worker,,10
152,0,22.0,,"Malibu, California",Shayne Lamas[1],Actress,Winner,12
177,0,25.0,,"Dallas, Texas",Melissa Rycroft[1],Sales Representative,Winner,13
202,0,23.0,,"Geneva, Florida",Vienna Girardi[1][2],Marketing Representative,Winner,14
227,0,25.0,,"Morgantown, West Virginia",Emily Maynard[5],Children's Hospital Event Planner,Winner,15
257,0,28.0,,"Scottsdale, Arizona",Courtney Robertson[3][4],Model,Winner,16


In [16]:
# Keep only the descriptive columns. Rebinding the name instead of dropping in
# place, combined with errors='ignore', makes this cell safe to re-run: a second
# execution is a no-op rather than a KeyError on already-removed columns.
df_winners = df_winners.drop(
    columns=['Eliminated', 'Outcome', 'Unnamed: 0'],
    errors='ignore',
)
df_winners

Unnamed: 0,Age,Hometown,Name,Occupation,Season
0,23.0,"Chanute, Kansas",Amanda Marsh,Event Planner,1
25,27.0,"Gloucester, New Jersey",Helene Eksterowicz,School Psychologist,2
50,22.0,"Huntington Beach, California",Jessica Bowlin,Student,5
75,24.0,"Pembroke Pines, Florida",Jennifer Wilson,Teacher,9
102,26.0,"San Francisco, California",Tessa Horst,Social Worker,10
152,22.0,"Malibu, California",Shayne Lamas[1],Actress,12
177,25.0,"Dallas, Texas",Melissa Rycroft[1],Sales Representative,13
202,23.0,"Geneva, Florida",Vienna Girardi[1][2],Marketing Representative,14
227,25.0,"Morgantown, West Virginia",Emily Maynard[5],Children's Hospital Event Planner,15
257,28.0,"Scottsdale, Arizona",Courtney Robertson[3][4],Model,16


In [17]:
# Show the column labels of df_winners.
df_winners.columns

Index(['Age', 'Hometown', 'Name', 'Occupation', 'Season'], dtype='object')

In [30]:
# Count winners per age: one row per distinct Age, with the number of non-null
# Season entries in 'age-total'. groupby sorts its keys by default, so the
# result is already ordered by Age and no separate sort is needed.
age_count = (
    df_winners
    .groupby('Age', as_index=False)
    .agg({'Season': 'count'})
    .rename(columns={'Season': 'age-total'})
)
age_count

Unnamed: 0,Age,age-total
0,22.0,2
1,23.0,2
2,24.0,1
3,25.0,3
4,26.0,3
5,27.0,2
6,28.0,1
7,29.0,1


In [31]:
# Bar chart of the number of winners at each age. Columns are referenced by
# name because the frame itself is passed as the data source; the title and
# axis labels let the figure stand on its own.
fig = px.bar(
    age_count,
    x='Age',
    y='age-total',
    title='Bachelor winners by age',
    labels={'Age': 'Age', 'age-total': 'Number of winners'},
)
fig.show()