# This notebook generates the pickle files based on the `.rda` files in the original repository
## Doing this in a notebook for the sake of convenience and because it only has to be done once. 

In [None]:
import numpy as np
import pandas as pd
import warnings
import pyreadr
from importlib import reload
import wrangle as w


In [None]:
warnings.simplefilter("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


In [None]:
contestants = pyreadr.read_r(
    path='data/rpdr_contestants.rda')['rpdr_contestants']
contestants.season = contestants.season.str[1:].astype('int')
contestants.age = contestants.age.astype('int')
contestants.dob = pd.to_datetime(contestants.dob)
hometown_df = contestants.hometown.str.split(', ', expand=True)
hometown_df = hometown_df.rename(
    columns={0: 'city', 1: 'state'}).drop(columns=[2])
hometown_df.city = hometown_df.city.astype('category')
hometown_df.state = hometown_df.state.astype('category')
contestants = pd.concat([contestants, hometown_df],
                        axis=1).drop(columns=['hometown'])
contestants.loc[contestants.contestant == 'Jaida Essence Hall'].index = [144]
contestants.loc[contestants.contestant == 'Crystal Methyd'].index = [146]
crystal = contestants.loc[144].copy(deep=True)
contestants.loc[144] = contestants.loc[146]
contestants.loc[146] = crystal
contestants = contestants.rename(columns={'contestant': 'queen_name'})
winners = contestants.groupby('season').first().queen_name.to_list()
contestants['winner'] = contestants.queen_name.isin(winners)
contestants.head()


In [111]:
def get_episode_type(row: pd.Series) -> str:
    if row.finale > 0:
        return 'Finale'
    elif row.penultimate > 0:
        return 'Penultimate'
    else:
        return 'Standard'


def check_elimination(row: pd.Series) -> pd.Series:
    if row.eliminated > '0':
        row.outcome = 'ELIM'
    return row

contep = pyreadr.read_r('data/rpdr_contep.rda')['rpdr_contep']
contep = contep.dropna(subset=['outcome'])
contep.season = contep.season.str[1:].astype(np.int8)
contep.episode = contep.episode.astype(np.int8)
contep['rank'] = contep['rank'].fillna(-1).astype(np.int8)
contep['rank'].name = 'ranking'
contep = contep.apply(check_elimination, axis=1)
contep['etype'] = contep.apply(get_episode_type, axis=1)
contep.etype = contep.etype.astype('category')

contep = contep.drop(
    columns=['penultimate', 'finale', 'participant', 'eliminated'])
for index, name in contestants.queen_name.to_dict().items():
    contep.loc[contep.contestant == name, 'contestant'] = index
contep.minichalw = contep.minichalw.apply(
    lambda w: True if w == '1' else False)
contep.missc = contep.missc.apply(lambda w: True if w == '1' else False)
contep = contep.rename(
    columns={'contestant': 'queen_id', 'missc': 'ms_congeniality'})
contep.to_pickle('data/contep.pkl')
contestants.to_pickle('data/contestants.pkl')


In [None]:
def get_episode_type(row: pd.Series) -> str:
    if row.special > 0:
        return 'Special'
    elif row.finale > 0:
        return 'Finale'
    else:
        return 'Standard'


episodes = pyreadr.read_r('data/rpdr_ep.rda')['rpdr_ep']
episodes.season = episodes.season.str[1:]
episodes.minic = episodes.minic.fillna('None')
episodes.numqueens = episodes.numqueens.fillna(0)
episodes.runwaytheme = episodes.runwaytheme.fillna('Not Listed')

bottom_drop = ['bottom'+str(i) for i in range(1, 8)]
minicw_drop = ['minicw'+str(i) for i in range(1, 5)]
episodes = episodes.drop(columns=[
                         'special', 'finale', 'eliminated1', 'eliminated2'] + bottom_drop + minicw_drop)
# episodes.to_csv('data/episodes.csv')


In [51]:
episodes = pd.read_pickle()
episodes.season = episodes.season.astype(np.uint8)
episodes.episode = episodes.episode.astype(np.uint8)
episodes.airdate = pd.to_datetime(episodes.airdate)
episodes.numqueens = episodes.numqueens.astype(np.uint16)
episodes.main_challenge = episodes.main_challenge.astype('category')
episodes = episodes.rename(columns={'airdate': 'air_date', 'runwaytheme': 'runway_theme', 'numqueens': 'n_queens',
                           'minic': 'mini_challenge', 'lipsyncartist': 'lipsync_artist', 'lipsyncsong': 'lipsync_song'})
episodes.to_pickle('data/episodes.pkl')


FileNotFoundError: [Errno 2] No such file or directory: 'data/episodes.csv'

## Reducing the number of outcomes
The original dataset had a number of one-off or situational outcomes. This section was used to build the code to clean these outcomes

In [160]:
reload(w)
queens, contep, episodes = w.acquire_rpdr_data()
train, validate, test = w.split_queens((queens, contep, episodes))
df = pd.concat([train,validate,test])
df.outcome.value_counts()


SAFE     459
HIGH     247
WIN      185
BTM      176
ELIM     144
LOW      122
GUEST     61
WDR        2
Name: outcome, dtype: int64

In [173]:
bottoms = df[df.outcome == 'BTM'][['season','episode','nickname','queen_name','outcome','rank']].sort_values(by=['season','rank'],ascending=[True, False])
elim = df[df.outcome == 'ELIM'][['season','episode','nickname','queen_name','outcome','rank']].sort_values(by=['season','rank'],ascending=[True, False])