# This notebook generates the pickle files based on the `.rda` files in the original repository
## This is done in a notebook for convenience, since it only has to be run once.

In [None]:
import numpy as np
import pandas as pd
import warnings
import pyreadr
from importlib import reload
import wrangle as w


In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings pandas may raise
# in the cells below — consider narrowing it to specific categories.
warnings.simplefilter("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figures.
%config InlineBackend.figure_format = 'retina'


In [None]:
# Load the contestant table from the original `.rda` file.
contestants = pyreadr.read_r(
    path='data/rpdr_contestants.rda')['rpdr_contestants']
# Drop the first character of each season label and cast the rest to int.
contestants.season = contestants.season.str[1:].astype('int')
contestants.age = contestants.age.astype('int')
contestants.dob = pd.to_datetime(contestants.dob)

# Split `hometown` on ', ' into categorical `city` / `state` columns; the
# third piece produced by the split (column 2) is discarded.
hometown_df = contestants.hometown.str.split(', ', expand=True)
hometown_df = hometown_df.rename(
    columns={0: 'city', 1: 'state'}).drop(columns=[2])
hometown_df.city = hometown_df.city.astype('category')
hometown_df.state = hometown_df.state.astype('category')
contestants = pd.concat([contestants, hometown_df],
                        axis=1).drop(columns=['hometown'])

# Swap the rows labelled 144 and 146. The `winner` flag below is derived from
# the first row of each season, so row order matters here.
# NOTE(review): presumably this moves Jaida Essence Hall ahead of Crystal
# Methyd so she is picked up as her season's winner — confirm against the raw
# data. Assigning `.index` on `contestants.loc[mask]` (as was done before)
# only relabels a temporary frame and never changes `contestants`, so those
# statements were removed.
swap_buffer = contestants.loc[144].copy(deep=True)
contestants.loc[144] = contestants.loc[146]
contestants.loc[146] = swap_buffer

contestants = contestants.rename(columns={'contestant': 'queen_name'})
# The first row of every season is treated as that season's winner.
winners = contestants.groupby('season').first().queen_name.to_list()
contestants['winner'] = contestants.queen_name.isin(winners)
contestants.head()


In [None]:
def get_episode_type(row: pd.Series) -> str:
    """Label a contestant-episode row with its episode type.

    Parameters
    ----------
    row : pd.Series
        Row exposing `finale` and `penultimate` values comparable with 0.

    Returns
    -------
    str
        'Finale' when `row.finale > 0`; otherwise 'Penultimate' when
        `row.penultimate > 0`; otherwise 'Standard'.
    """
    # Checked in priority order; later flags are only read when the earlier
    # ones are not set.
    flag_labels = (('finale', 'Finale'), ('penultimate', 'Penultimate'))
    for flag, label in flag_labels:
        if getattr(row, flag) > 0:
            return label
    return 'Standard'


def check_elimination(row: pd.Series) -> pd.Series:
    """Mark a row as eliminated by overwriting its `outcome`.

    Parameters
    ----------
    row : pd.Series
        Row exposing an `eliminated` value comparable with the string '0'
        and an `outcome` entry.

    Returns
    -------
    pd.Series
        The same `row` object; its `outcome` is set to 'ELIM' when
        `row.eliminated > '0'` and left untouched otherwise.
    """
    # String comparison: any value ordering after '0' counts as eliminated.
    flagged = row.eliminated > '0'
    if flagged:
        row.outcome = 'ELIM'
    return row


# Load the contestant-by-episode table; rows without an outcome are dropped.
contep = pyreadr.read_r('data/rpdr_contep.rda')['rpdr_contep']
contep = contep.dropna(subset=['outcome'])
# Drop the first character of each season label and store the rest as int8.
contep.season = contep.season.str[1:].astype(np.int8)
contep.episode = contep.episode.astype(np.int8)
# Missing ranks become -1 so the column can be stored as int8. The column
# keeps its `rank` name: setting `.name` on the extracted Series (as was done
# before) never renamed the column, so that statement was removed.
contep['rank'] = contep['rank'].fillna(-1).astype(np.int8)

# Row-wise passes: force `outcome` to 'ELIM' on eliminated rows, then derive
# the episode type from the `finale` / `penultimate` flags.
contep = contep.apply(check_elimination, axis=1)
contep['etype'] = contep.apply(get_episode_type, axis=1)
contep.etype = contep.etype.astype('category')
contep.outcome = contep.outcome.astype('category')
contep = contep.drop(
    columns=['penultimate', 'finale', 'participant', 'eliminated'])

# Replace each contestant name with the row label of that queen in
# `contestants`. Names are rewritten one at a time, so a name that appears
# more than once in `contestants` resolves to its first label.
for index, name in contestants.queen_name.to_dict().items():
    contep.loc[contep.contestant == name, 'contestant'] = index

# Turn the '1'-coded flags into booleans with a vectorised comparison.
contep.minichalw = (contep.minichalw == '1')
contep.missc = (contep.missc == '1')
contep = contep.rename(
    columns={'contestant': 'queen_id', 'missc': 'ms_congeniality'})


In [None]:
def get_episode_type(row: pd.Series) -> str:
    """Label a row with its episode type.

    This definition shares its name with the helper defined in an earlier
    cell for contestant-episode rows, and replaces it once this cell runs.
    To keep that earlier cell re-runnable, this version accepts both row
    layouts: any of the `special`, `finale` and `penultimate` flags may be
    absent, and an absent flag is treated as not set.

    Parameters
    ----------
    row : pd.Series
        Row whose flag values, where present, are comparable with 0.

    Returns
    -------
    str
        The label of the first flag greater than 0, checked in the order
        'Special', 'Finale', 'Penultimate'; 'Standard' when none is set.
    """
    flag_labels = (
        ('special', 'Special'),
        ('finale', 'Finale'),
        ('penultimate', 'Penultimate'),
    )
    for flag, label in flag_labels:
        # `Series.get` returns the default instead of raising when the flag
        # is not part of this row layout.
        if row.get(flag, 0) > 0:
            return label
    return 'Standard'


# Read the per-episode table from the original `.rda` file.
episodes = pyreadr.read_r('data/rpdr_ep.rda')['rpdr_ep']
# Drop the first character of each season label (the value stays a string).
episodes['season'] = episodes['season'].str[1:]
# Replace missing values with explicit placeholders.
episodes['minic'] = episodes['minic'].fillna('None')
episodes['numqueens'] = episodes['numqueens'].fillna(0)
episodes['runwaytheme'] = episodes['runwaytheme'].fillna('Not Listed')

# Numbered column families that are removed: bottom1..bottom7, minicw1..minicw4.
bottom_drop = [f'bottom{i}' for i in range(1, 8)]
minicw_drop = [f'minicw{i}' for i in range(1, 5)]
dropped_columns = ['special', 'finale', 'eliminated1', 'eliminated2']
dropped_columns = dropped_columns + bottom_drop + minicw_drop
episodes = episodes.drop(columns=dropped_columns)
# NOTE(review): the export below is left disabled. The next cell reads
# 'data/episodes.csv' and expects a `main_challenge` column that this cell
# never creates, so the CSV was presumably edited by hand after export —
# confirm before re-enabling, since it would overwrite that file.
# episodes.to_csv('data/episodes.csv')


In [None]:
# Reload the episode table from CSV, using its first column as the index.
episodes = pd.read_csv('data/episodes.csv', index_col=0)

# Compact dtypes for the numeric and low-cardinality columns.
episodes = episodes.astype({
    'season': np.uint8,
    'episode': np.uint8,
    'numqueens': np.uint16,
    'main_challenge': 'category',
})
episodes['airdate'] = pd.to_datetime(episodes['airdate'])

# Switch the raw column names to snake_case.
column_names = {
    'airdate': 'air_date',
    'runwaytheme': 'runway_theme',
    'numqueens': 'n_queens',
    'minic': 'mini_challenge',
    'lipsyncartist': 'lipsync_artist',
    'lipsyncsong': 'lipsync_song',
}
episodes = episodes.rename(columns=column_names)

# Persist the cleaned frame.
episodes.to_pickle('data/episodes.pkl')


In [49]:
# Re-import the `wrangle` module so the calls below run its current code.
reload(w)
# Rebinds `contep` and `episodes` (replacing the frames built in the cells
# above) with whatever `w.acquire_rpdr_data()` returns.
queens, contep, episodes = w.acquire_rpdr_data()
train, validate, test = w.split_queens(queens, contep, episodes)
# Preview five randomly chosen rows of `train`. No `random_state` is passed,
# so the rows shown can differ between runs unless the global RNG is seeded.
train.sample(5)

Unnamed: 0,season,queen_name,age,dob,city,state,winner,rank,ms_congeniality,episode,...,minichalw,etype,air_date,nickname,runway_theme,n_queens,mini_challenge,lipsync_artist,lipsync_song,main_challenge
316,10,Miz Cracker,33,1984-04-19,Harlem,New York,False,5,False,7,...,False,Standard,2018-05-03,Snatch Game,Not Listed,8,Reading is Fundamental,Carly Rae Jepsen,Cut To The Feeling,Snatch Game
67,14,Maddy Morphosis,26,1994-09-02,Fayetteville,Arkansas,False,10,False,6,...,False,Standard,2022-02-11,Glamazon Prime,Glamazon Primetime,10,,Beyonce,Suga Mama,Fabrication
523,13,Gottmik,23,1996-08-19,Scottsdale,Arizona,False,3,False,6,...,False,Standard,2021-02-05,Disco-Mentary,Little Black Dress,11,Create eye-popping dresses from wallpaper,Blu Cantrell,Hit ‘Em Up Style (Oops!),Performing
232,6,Darienne Lake,41,1971-11-27,Rochester,New York,False,4,False,5,...,False,Standard,2014-03-24,Snatch Game,Not Listed,10,,Lisa Lisa & Cult Jam,Head to Toe,Snatch Game
572,7,Kandy Ho,28,1987-09-24,Cayey,Puerto Rico,False,10,False,1,...,False,Standard,2015-03-02,Born Naked,Not Listed,14,Model 2 looks,RuPaul,Geronimo,Fabrication
