# Space Titanic

#### Data Description

![SpaceTitanicImage](<./SpaceTitanicData.png>)

#### Libraries

In [59]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder

### Retrieve and View Data

In [60]:
# Read in the train and test data
train = pd.read_csv('./spaceship-titanic/train.csv')
test = pd.read_csv('./spaceship-titanic/test.csv')
sampleSubmission = pd.read_csv('./spaceship-titanic/sample_submission.csv')

#### Notes on Data
> Predicting if the passenger was simiply transported or not (boolean).
> This is because that is the only values in the train which isn't in the test

### Process Data

#### Break Down passenger ID

In [61]:
passengers = train["PassengerId"]

group = []
person = []

for passenger in passengers:
    if isinstance(passenger, str):
        group.append(passenger[0:4])
        person.append(passenger[6:7])
    else:
        group.append(-1)
        person.append(-1)

train['group'] = group
train['person'] = person

In [62]:
uniquePerson = train['person'].unique()
print(uniquePerson)

['1' '2' '3' '4' '5' '6' '7' '8']


In [63]:
uniqueGroup = train['group'].unique()
print(len(uniqueGroup))

6217


In [64]:
passengers = test["PassengerId"]

group = []
person = []

for passenger in passengers:
    if isinstance(passenger, str):
        group.append(int(passenger[0:4]))
        person.append(int(passenger[6:7]))
    else:
        group.append(-1)
        person.append(-1)

test['group'] = group
test['person'] = person

In [65]:
def onHotEncodePerson(attribute: str, df: pd.DataFrame):
    per = df[[attribute]]  # Extract the "person" column

    # Initialize OneHotEncoder
    per_encoder = OneHotEncoder()

    # Fit and transform the "person" column
    per_cat_1hot = per_encoder.fit_transform(per)

    # Convert to a DataFrame with proper column names
    per_encoded_df = pd.DataFrame(
        per_cat_1hot.toarray(),  # Convert sparse matrix to a dense array
        columns=per_encoder.get_feature_names_out([attribute])  # Get feature names
    )
    df.drop(columns=attribute)
    # Concatenate the new DataFrame with the original DataFrame
    return pd.concat([df, per_encoded_df], axis=1)

train = onHotEncodePerson("person", train)
test = onHotEncodePerson("person", test)

train.drop(columns="group")
test.drop(columns="group")

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,person,person_1,person_2,person_3,person_4,person_5,person_6,person_7,person_8
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,...,Nelly Carsoning,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,...,Lerome Peckers,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,...,Sabih Unhearfus,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,...,Meratz Caltilter,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,...,Brence Harperez,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,...,Jeron Peter,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,...,Matty Scheron,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,...,Jayrin Pore,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,...,Kitakan Conale,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
cabinTrain = train['Cabin']

deck = []
num = []
side = []


for cabin in cabinTrain:
    if isinstance(cabin, str):  # Ensure cabin is not NaN and is a valid string
        deck.append(cabin[0])   # Index 0 for deck
        num.append(cabin[2])    # Index 2 for num
        side.append(cabin[4])   # Index 4 for side
    else:
        # Handle cases where the value is NaN or invalid
        deck.append(None)
        num.append(None)
        side.append(None)

train["Deck"] = deck
train["Num"] = num
train["Side"] = side

train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,person_2,person_3,person_4,person_5,person_6,person_7,person_8,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,F,1,S


In [67]:
train['Side'], _ = pd.factorize(train['Side'])

In [68]:
cabinTest = test['Cabin']

deck = []
num = []
side = []


for cabin in cabinTest:
    if isinstance(cabin, str):  # Ensure cabin is not NaN and is a valid string
        deck.append(cabin[0])   # Index 0 for deck
        num.append(cabin[2])    # Index 2 for num
        side.append(cabin[4])   # Index 4 for side
    else:
        # Handle cases where the value is NaN or invalid
        deck.append(None)
        num.append(None)
        side.append(None)

test["Deck"] = deck
test["Num"] = num
test["Side"] = side

#### Remove Name

In [69]:
train = train.drop(columns=["Name"])
train = train.drop(columns="PassengerId")
test = test.drop(columns=["Name"])

train = train.drop(columns="Cabin")
test = test.drop(columns="Cabin")


#### Destination to Number

In [70]:
uniqueDestinations = train['Destination'].unique()
print(uniqueDestinations)
# Looks like there are some missing values that I need to account for

['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]


In [71]:
train['Destination'], _ = pd.factorize(train['Destination'])

In [72]:
test['Destination'], _ = pd.factorize(test['Destination'])

In [73]:
# Confirm there should be 4 different values
uniqueDestinationsNumbers = train['Destination'].unique()
print(uniqueDestinationsNumbers)

[ 0  1  2 -1]


In [74]:
# Deck Side
train['Deck'], _ = pd.factorize(train['Deck'])
train['Deck'], _ = pd.factorize(train['Deck'])

test['Side'], _ = pd.factorize(test['Side'])
test['Side'], _ = pd.factorize(test['Side'])

#### VIP to Number

In [75]:
train['VIP'], _ = pd.factorize(train['VIP'])
test['VIP'], _ = pd.factorize(test['VIP'])

#### CryoSleep to Number

In [76]:
train['CryoSleep'], _ = pd.factorize(train['CryoSleep'])
test['CryoSleep'], _ = pd.factorize(test['CryoSleep'])

#### HomePlanet to Number

In [77]:
train['HomePlanet'], _ = pd.factorize(train['HomePlanet'])
test['HomePlanet'], _ = pd.factorize(test['HomePlanet'])

# train = onHotEncodePerson("HomePlanet", train)
# test = onHotEncodePerson("HomePlanet", test)
# train = onHotEncodePerson("CryoSleep", train)
# test = onHotEncodePerson("CryoSleep", test)
# train = onHotEncodePerson("VIP", train)
# test = onHotEncodePerson("VIP", test)
# train = onHotEncodePerson("Destination", train)
# test = onHotEncodePerson("Destination", test)

In [78]:
test_ID = test["PassengerId"]

test = test.apply(pd.to_numeric, errors='coerce')
train = train.apply(pd.to_numeric, errors='coerce')

# Fill missing values with the integer average of their column
def fill_with_int_mean(col):
    if col.isnull().all():
        # Handle columns where all values are NaN
        return col.fillna(0)  # Replace NaN with a default value (e.g., 0)
    else:
        # Replace NaN with the integer mean
        return col.fillna(int(col.mean()))

test = test.apply(fill_with_int_mean)
train = train.apply(fill_with_int_mean)

In [79]:
train = train.applymap(lambda x: x.replace('/', '0') if isinstance(x, str) else x)
test = test.applymap(lambda x: x.replace('/', '0') if isinstance(x, str) else x)

# Convert all cells to numeric; non-numeric values become NaN
train = train.applymap(lambda x: pd.to_numeric(x, errors='coerce'))
# Convert all cells to numeric; non-numeric values become NaN
train.head()

  train = train.applymap(lambda x: x.replace('/', '0') if isinstance(x, str) else x)
  test = test.applymap(lambda x: x.replace('/', '0') if isinstance(x, str) else x)
  train = train.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,person_2,person_3,person_4,person_5,person_6,person_7,person_8,Deck,Num,Side
0,0,0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
1,1,0,0,24.0,0,109.0,9.0,25.0,549.0,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,1
2,0,0,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,1
3,0,0,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,1
4,1,0,0,16.0,0,303.0,70.0,151.0,565.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,1


#### Split Train Data Set

In [80]:
corr_matrix = train.corr()
corr_matrix['Transported'].sort_values(ascending=False)

Transported     1.000000
CryoSleep       0.424362
Destination     0.104817
person          0.066390
person_3        0.060768
person_2        0.048582
Num             0.046199
FoodCourt       0.046074
person_4        0.029559
group           0.021491
person_5        0.010575
ShoppingMall    0.010016
person_6        0.008031
person_7       -0.000529
Deck           -0.002091
person_8       -0.003257
VIP            -0.027802
Side           -0.028990
Age            -0.074245
person_1       -0.087695
HomePlanet     -0.094689
VRDeck         -0.204826
Spa            -0.218791
RoomService    -0.242046
Name: Transported, dtype: float64

In [81]:
# Shuffle the data and reset the index
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

Y = train['Transported']

X = train.drop(columns='Transported')

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Train

In [82]:
model = RandomForestClassifier(max_depth=1000, n_estimators=25000, min_samples_split=200, min_samples_leaf=200, random_state=42)
# model = RandomForestClassifier(random_state=42) # This works better than the one above, even though it has worse accuracy

# Train the model
model.fit(X_train, y_train)

### Run Model on test

In [83]:
test = test.applymap(lambda x: pd.to_numeric(x, errors='coerce'))

test = test.drop(columns="PassengerId")

predictions = model.predict(test)

  test = test.applymap(lambda x: pd.to_numeric(x, errors='coerce'))


### Format Answer / Create CSV

In [84]:
Answer = pd.DataFrame({
    "PassengerId": test_ID,
    "Transported": predictions
})

print(Answer.head())

  PassengerId  Transported
0     0013_01         True
1     0018_01         True
2     0019_01         True
3     0021_01         True
4     0023_01         True


In [85]:
Answer.to_csv("SpaceTitanic_Answer.csv", index=False)