In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset straight from the course repository.
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
print("First five rows of the data:")
print(spaceship.head())

# Structural overview: dimensions, column dtypes and per-column null counts.
print("\nData shape:", spaceship.shape)

print("\nData types:")
print(spaceship.dtypes)

print("\nMissing values per column:")
print(spaceship.isnull().sum())

# Drop every row that contains at least one missing value, in any column.
# NOTE(review): this also discards rows whose only gaps are in columns the
# model never uses (e.g. Name, Cabin). If retaining more rows matters, consider
# spaceship.dropna(subset=feature_columns + ['Transported']) -- left as-is here
# so that spaceship_clean keeps its "no NaN anywhere" guarantee.
spaceship_clean = spaceship.dropna()
print("\nData shape after dropping rows with missing values:", spaceship_clean.shape)

# Numerical columns used as model features.
feature_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
X = spaceship_clean[feature_columns]

# Target: Transported holds True/False per passenger, so this is a binary
# classification problem.
y = spaceship_clean['Transported']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifies by distance between feature vectors. The raw columns live on
# very different scales (Age is in the tens, the spending columns reach the
# thousands), so without standardisation the large-valued columns dominate the
# distance and Age is effectively ignored. Wrapping the classifier in a
# pipeline fits the scaler on the training split only and re-applies the same
# transformation automatically inside predict()/score(), which avoids leaking
# test-set statistics into training.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Fit scaler + classifier on the training data.
knn.fit(X_train, y_train)

# Evaluate on the held-out test set; score() returns mean accuracy.
accuracy = knn.score(X_test, y_test)
print("\nModel accuracy:", accuracy)


First five rows of the data:
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3      