# This notebook generates the pickle files from the `.rda` files in the original repository
## This is done in a notebook for the sake of convenience and because it only has to be done once.

In [3]:
import numpy as np
import pandas as pd
import warnings
import pyreadr

In [None]:
# Install an "ignore" filter so warnings are suppressed from here on.
warnings.simplefilter("ignore")
# The next two lines are IPython magics; they only work inside IPython/Jupyter
# and are not valid syntax in a plain Python script.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Convert the contestants table from the .rda file into a typed DataFrame and
# store it as a pickle.
contestants = pyreadr.read_r(path='data/rpdr_contestants.rda')['rpdr_contestants']

# Drop the first character of each season label and keep the rest as an integer.
contestants['season'] = contestants['season'].str[1:].astype('int')
contestants['age'] = contestants['age'].astype('int')
contestants['dob'] = pd.to_datetime(contestants['dob'])

# Split 'hometown' on ', ' into positional columns: 0 and 1 are kept as
# city/state (both categorical), positional column 2 is discarded.
hometown_df = (
    contestants['hometown']
    .str.split(', ', expand=True)
    .drop(columns=[2])
    .rename(columns={0: 'city', 1: 'state'})
    .astype({'city': 'category', 'state': 'category'})
)

# Replace the raw 'hometown' column with the derived city/state columns,
# appended after the remaining original columns.
contestants = pd.concat(
    [contestants.drop(columns=['hometown']), hometown_df],
    axis=1,
)
contestants.to_pickle('data/contestants.pkl')

In [66]:
def get_episode_type(row: pd.Series) -> str:
    """Label a contestant-episode row with the kind of episode it describes.

    Flags are tested in priority order: ``finale`` first, then
    ``penultimate``. A flag counts as set only when it compares greater than
    zero, so missing (NaN) flags fall through to ``'Standard'``.

    NOTE: a later cell of this notebook defines another function with the same
    name for the episodes table; whichever cell ran last wins.

    Args:
        row: Record exposing ``finale`` and ``penultimate`` attributes.

    Returns:
        ``'Finale'``, ``'Penultimate'`` or ``'Standard'``.
    """
    flag_labels = (('finale', 'Finale'), ('penultimate', 'Penultimate'))
    for flag, label in flag_labels:
        if getattr(row, flag) > 0:
            return label
    return 'Standard'
# Convert the contestant-by-episode table from the .rda file into a typed
# DataFrame indexed by (season, episode) and store it as a pickle.
contep = pyreadr.read_r('data/rpdr_contep.rda')['rpdr_contep']

# Drop the first character of each season label and keep the rest as an integer.
contep['season'] = contep['season'].str[1:].astype(np.int8)
contep['episode'] = contep['episode'].astype(np.int8)

# Missing ranks are stored as the sentinel -1 so the column can be int8.
# NOTE: the column keeps the name 'rank'. The previous statement
# `contep['rank'].name = 'ranking'` only renamed a temporary Series and never
# changed the DataFrame column, so it has been removed as dead code.
contep['rank'] = contep['rank'].fillna(-1).astype(np.int8)

# NOTE(review): astype('bool') maps NaN to True; this assumes these three
# columns contain no missing values -- confirm against the source data.
contep['missc'] = contep['missc'].astype('bool')
contep['outcome'] = contep['outcome'].astype('category')
contep['eliminated'] = contep['eliminated'].astype('bool')
contep['participant'] = contep['participant'].astype('bool')

# Derive the episode type from the finale/penultimate flags (dropped below).
contep['etype'] = contep.apply(get_episode_type, axis=1).astype('category')

# Missing mini-challenge results must become False. Filling with -1 (as was
# done before) made them True, because every non-zero value is truthy.
contep['minichalw'] = contep['minichalw'].fillna(0).astype('bool')

# Index by (season, episode), then drop the columns now represented by the
# index or by 'etype'.
contep.index = pd.MultiIndex.from_frame(contep[['season', 'episode']])
contep = contep.drop(columns=['season', 'episode', 'penultimate', 'finale'])
contep.to_pickle('data/contep.pkl')

In [94]:
def get_episode_type(row: pd.Series) -> str:
    """Label an episode row as a special, a finale, or a standard episode.

    ``special`` takes precedence over ``finale``. A flag counts as set only
    when it compares greater than zero, so missing (NaN) flags fall through
    to ``'Standard'``.

    NOTE: this redefines the function of the same name declared in an earlier
    cell of this notebook (which classifies contestant-episode rows).

    Args:
        row: Record exposing ``special`` and ``finale`` attributes.

    Returns:
        ``'Special'``, ``'Finale'`` or ``'Standard'``.
    """
    if row.special > 0:
        return 'Special'
    return 'Finale' if row.finale > 0 else 'Standard'
# Convert the per-episode table from the .rda file into a typed DataFrame
# indexed by (season, episode) and store it as a pickle.
episodes = pyreadr.read_r('data/rpdr_ep.rda')['rpdr_ep']

# Key columns: drop the first character of each season label, cast both keys
# to uint8, then build the (season, episode) index from them.
episodes['season'] = episodes['season'].str[1:].astype(np.uint8)
episodes['episode'] = episodes['episode'].astype(np.uint8)
episodes.index = pd.MultiIndex.from_frame(episodes[['season', 'episode']])

episodes['airdate'] = pd.to_datetime(episodes['airdate'])
# Missing queen counts are stored as 0.
episodes['numqueens'] = episodes['numqueens'].fillna(0).astype(np.uint8)

# Derive the episode type from the special/finale flags (dropped below).
episodes['etype'] = episodes.apply(get_episode_type, axis=1)

# Text columns: substitute a placeholder for missing values and switch to the
# pandas string dtype. minicw1..minicw4 share the 'None' placeholder.
text_placeholders = {'minic': 'None', 'runwaytheme': 'Not Listed'}
text_placeholders.update({f'minicw{slot}': 'None' for slot in range(1, 5)})
for column, placeholder in text_placeholders.items():
    episodes[column] = episodes[column].fillna(placeholder).astype('string')

# Drop the columns now represented by the index or by 'etype', together with
# bottom3..bottom7.
bottom_drop = [f'bottom{slot}' for slot in range(3, 8)]
episodes = episodes.drop(
    columns=['season', 'episode', 'special', 'finale'] + bottom_drop
)
episodes.to_pickle('data/episodes.pkl')