In [96]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split

### Retrieve and View Data

In [97]:
# Read in the train and test data
train = pd.read_csv('./spaceship-titanic/train.csv')
test = pd.read_csv('./spaceship-titanic/test.csv')
sampleSubmission = pd.read_csv('./spaceship-titanic/sample_submission.csv')

#### FIll in missing

In [98]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [99]:
print(train.head())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [100]:
print(test.head())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0013_01      Earth      True  G/3/S  TRAPPIST-1e  27.0  False   
1     0018_01      Earth     False  F/4/S  TRAPPIST-1e  19.0  False   
2     0019_01     Europa      True  C/0/S  55 Cancri e  31.0  False   
3     0021_01     Europa     False  C/1/S  TRAPPIST-1e  38.0  False   
4     0023_01      Earth     False  F/5/S  TRAPPIST-1e  20.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck              Name  
0          0.0        0.0           0.0     0.0     0.0   Nelly Carsoning  
1          0.0        9.0           0.0  2823.0     0.0    Lerome Peckers  
2          0.0        0.0           0.0     0.0     0.0   Sabih Unhearfus  
3          0.0     6652.0           0.0   181.0   585.0  Meratz Caltilter  
4         10.0        0.0         635.0     0.0     0.0   Brence Harperez  


In [101]:
print(sampleSubmission.head())

  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01        False
3     0021_01        False
4     0023_01        False


#### Notes on Data
> Predicting if the passenger was simiply transported or not (boolean)
> This is because that is the only values in the train which isn't in the test

### Process Data

#### Notes
> Now we need to remove unneeded values and turn non-numeric values into numbers

##### Remove
- Name

##### Turn into Numbers
- Destination
- Cabin
- VIP
- CryoSleep
- HomePlanet

#### Remove Name

In [102]:
train = train.drop(columns=["Name"])
test = test.drop(columns=["Name"])

#### Destination to Number

In [103]:
uniqueDestinations = train['Destination'].unique()
print(uniqueDestinations)
# Looks like there are some missing values that I need to account for

['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' 0]


In [104]:
train['Destination'], _ = pd.factorize(train['Destination'])

In [105]:
test['Destination'], _ = pd.factorize(test['Destination'])

In [106]:
# Confirm there should be 4 different values
uniqueDestinationsNumbers = train['Destination'].unique()
print(uniqueDestinationsNumbers)

[0 1 2 3]


#### Cabin to Number

In [107]:
uniqueCabins = train['Cabin'].unique()
print(len(uniqueCabins))

6561


In [108]:
# Since there are so many different cabins and I don't know what to do yet, I will delete them. Get the best possible working for now.
train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')

#### VIP to Number

In [109]:
train['VIP'], _ = pd.factorize(train['VIP'])
test['VIP'], _ = pd.factorize(test['VIP'])

#### CryoSleep to Number

In [110]:
train['CryoSleep'], _ = pd.factorize(train['CryoSleep'])
test['CryoSleep'], _ = pd.factorize(test['CryoSleep'])

#### HomePlanet to Number

In [111]:
train['HomePlanet'], _ = pd.factorize(train['HomePlanet'])
test['HomePlanet'], _ = pd.factorize(test['HomePlanet'])

In [112]:
print(train.head())

  PassengerId  HomePlanet  CryoSleep  Destination   Age  VIP  RoomService  \
0     0001_01           0          0            0  39.0    0          0.0   
1     0002_01           1          0            0  24.0    0        109.0   
2     0003_01           0          0            0  58.0    1         43.0   
3     0003_02           0          0            0  33.0    0          0.0   
4     0004_01           1          0            0  16.0    0        303.0   

   FoodCourt  ShoppingMall     Spa  VRDeck  Transported  
0        0.0           0.0     0.0     0.0        False  
1        9.0          25.0   549.0    44.0         True  
2     3576.0           0.0  6715.0    49.0        False  
3     1283.0         371.0  3329.0   193.0        False  
4       70.0         151.0   565.0     2.0         True  


#### Split Train Data Set

In [113]:
# Shuffle the data and reset the index
train = train.sample(frac=1, random_state=42).reset_index(drop=True)


Y = train['Transported']

X = train.drop(columns='Transported')


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Train

In [114]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print(classification_report(y_test, y_pred))


### Run Model on test

In [117]:
new_data = test

predictions = model.predict(new_data)

print(predictions)


[ True False  True ...  True  True  True]


### Format Answer / Create CSV

In [122]:
Answer = test

Answer["Transported"] = predictions

columns_to_drop = Answer.columns[1:11]

Answer = Answer.drop(columns=columns_to_drop)

print(Answer.head())


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


In [123]:
Answer.to_csv("SpaceTitanic_Answer.csv", index=False)