In [1]:
import pandas as pd
from sklearn.manifold import TSNE

In [85]:
# Read the three raw CSV tables (castaways, castaway details, confessionals)
castaways, castaway_details, confessionals = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/raw/castaways.csv',
        'data/raw/castaway_details.csv',
        'data/raw/confessionals.csv',
    )
)

In [86]:
# Sum confessional_count per (castaway_id, season); the result is indexed by
# those two keys and carries the single summed column.
confessionals = (
    confessionals
    .groupby(['castaway_id', 'season'])[['confessional_count']]
    .sum()
)

In [93]:
# Overwrite `result` with the value from raw column position 15, falling back to
# position 14 wherever position 15 is missing.
# NOTE(review): 14/15 are positions in the raw CSV column order, which is not
# visible here — presumably 'result' and 'jury_status'; confirm and use names.
castaways['result'] = castaways.iloc[:, 15].where(
    castaways.iloc[:, 15].notna(),
    castaways.iloc[:, 14],
)

In [94]:
# Discard the columns that are not needed for the rest of the analysis
castaways = castaways.drop(
    columns=[
        'city',
        'jury_status',
        'original_tribe',
        'version',
        'order',
        'episode',
        'version_season',
    ]
)

In [95]:
# Ignore Redemption Island/EOE: a castaway can appear more than once within a
# season, so collapse every (castaway_id, season) pair to a single row.
# A column-wise groupby().max() pairs the largest `day` with the alphabetically
# largest `result` string, and those can come from different rows. Keep the
# whole row with the latest `day` instead.
# NOTE(review): assumes the row with the largest `day` is the final exit — confirm.
key_cols = ['castaway_id', 'season']
castaways = (
    castaways
    .dropna(subset=key_cols)  # groupby also discarded rows with missing keys
    .sort_values('day', kind='stable', na_position='first')
    .drop_duplicates(subset=key_cols, keep='last')
    .sort_values(key_cols)  # same row ordering groupby produced
    .reset_index(drop=True)
)
# Keys first, exactly as groupby(...).reset_index() laid the columns out; later
# cells index into the rows by position.
castaways = castaways[key_cols + [col for col in castaways.columns if col not in key_cols]]

In [106]:
# List the columns that remain after the drop and groupby steps above.
# NOTE(review): the saved output includes 'prop_sur', which is only created in
# the next cell, and this cell's execution count (106) is higher than its
# neighbours' — it was run out of order, so a fresh top-to-bottom run will not
# show 'prop_sur' here.
castaways.columns

Index(['castaway_id', 'season', 'season_name', 'full_name', 'castaway', 'age',
       'state', 'personality_type', 'day', 'result', 'prop_sur'],
      dtype='object')

In [96]:
# Proportion of the season survived: the castaway's last day divided by the
# longest day count recorded for that season.
max_day_by_season = castaways.groupby('season')['day'].max()

# Kept as a plain list (ordered by season) in case another cell still reads it.
m_days = max_day_by_season.tolist()

# Look the season maximum up by season label rather than by list position
# (season - 1), which silently misaligns if any season number is missing, and
# read `day`/`season` by name instead of by positions 8 and 1 in the row array.
castaways['prop_sur'] = castaways['day'] / castaways['season'].map(max_day_by_season)

In [97]:
# Remove the detail columns that are not used below
castaway_details = castaway_details.drop(
    columns=[
        'full_name',
        'personality_type',
        'date_of_birth',
        'date_of_death',
        'occupation',
        'ethnicity',
        'race',
    ]
)

In [98]:
# Join confessional totals to the per-season castaway rows, then to the
# per-person details, and discard every row that has any missing value.
all_contestants = (
    confessionals
    .merge(castaways, how='inner', on=['castaway_id', 'season'])
    .merge(castaway_details, how='inner', on='castaway_id')
    .dropna()
)

In [99]:
# Inspect the merged table. The saved output is truncated by pandas (the "..."
# row), and its row labels are not contiguous (e.g. 758, 762, 764, 765) because
# the preceding dropna() removed rows without resetting the index.
all_contestants

Unnamed: 0,castaway_id,season,confessional_count,season_name,full_name,castaway,age,state,personality_type,day,result,prop_sur,short_name,gender,poc
0,US0001,1,2,Survivor: Borneo,Sonja Christopher,Sonja,63,California,ENFP,3.0,1st voted out,0.076923,Sonja,Female,White
1,US0002,1,11,Survivor: Borneo,B.B. Andersen,B.B.,64,Kansas,ESTJ,6.0,2nd voted out,0.153846,B.B.,Male,White
2,US0003,1,9,Survivor: Borneo,Stacey Stillman,Stacey,27,California,ENTJ,9.0,3rd voted out,0.230769,Stacey,Female,White
3,US0004,1,19,Survivor: Borneo,Ramona Gray,Ramona,29,New Jersey,ISTJ,12.0,4th voted out,0.307692,Ramona,Female,POC
4,US0005,1,11,Survivor: Borneo,Dirk Been,Dirk,23,Wisconsin,ISFP,15.0,5th voted out,0.384615,Dirk,Male,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,US0618,42,6,Survivor: 42,Marya Sherron,Marya,47,Indiana,INFP,5.0,2nd voted out,0.238095,Marya,Female,White
762,US0622,42,21,Survivor: 42,Rocksroy Bailey,Rocksroy,43,Nevada,ESTJ,17.0,2nd jury member,0.809524,Rocksroy,Male,POC
764,US0624,42,7,Survivor: 42,Swati Goel,Swati,19,California,ISTJ,9.0,4th voted out,0.428571,Swati,Female,POC
765,US0625,42,18,Survivor: 42,Tori Meehan,Tori,24,Arizona,ENFJ,17.0,3rd jury member,0.809524,Tori,Female,White


In [100]:
# Set the descriptive columns aside so they can be attached to the projection later
bio = all_contestants.loc[
    :,
    ['full_name', 'season_name', 'state', 'age', 'result'],
]
# Strip identifier, name, day and result columns from the table used for modelling
all_contestants = all_contestants.drop(
    columns=[
        'castaway_id',
        'season_name',
        'full_name',
        'short_name',
        'castaway',
        'day',
        'result',
    ]
)

In [101]:
# Turn season into strings so it is treated as a category, not a number
all_contestants['season'] = all_contestants['season'].map(str)

In [102]:
# One-hot encode the string-valued columns; numeric columns pass through as they are
features = pd.get_dummies(data=all_contestants)

In [103]:
# Two-dimensional t-SNE embedding of the feature table; the seed is fixed so the
# layout is repeatable between runs.
embedder = TSNE(random_state=1416, n_components=2)
projected = embedder.fit_transform(features)



In [104]:
# Keep only the part of the season title after the first colon
# ("Survivor: Borneo" -> "Borneo"). Splitting once and taking the last piece
# leaves colon-less names unchanged instead of raising IndexError, which also
# makes the cell safe to re-run; strip() removes the space that follows the colon.
bio['season_name'] = bio['season_name'].str.split(':', n=1).str[-1].str.strip()

In [105]:
# `bio` still carries the non-contiguous row labels left by the earlier dropna(),
# while `projected` is a bare array. Building the coordinate frame with a default
# 0..n-1 index would make concat(axis=1) align coordinates with the wrong
# contestants and pad the unmatched labels with NaN, so reuse bio's index.
coords = pd.DataFrame(projected, index=bio.index)
pd.concat([bio, coords], axis=1).to_json('data/processed/tsne-results.json', orient='index')

NameError: name 'all_contestants' is not defined