In [1]:
import pandas as pd

# Load the sample submission file
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [3]:
# Load the training data
train_data = pd.read_csv('train.csv')
train_data.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Load the testing data
test_data = pd.read_csv('test.csv')
test_data.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


Start dataprocessing:

In [5]:
# Checking for missing values in the training data
train_data.isnull().sum()
# Checking for missing values in the test data
test_data.isnull().sum()


PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

Do imputations:

In [6]:
# List of categorical columns for mode imputation
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# List of numerical columns for median imputation
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Perform mode imputation on categorical columns
for col in categorical_cols:
    mode = train_data[col].mode()[0]
    train_data[col].fillna(mode, inplace=True)
    test_data[col].fillna(mode, inplace=True)

# Perform median imputation on numerical columns
for col in numerical_cols:
    median = train_data[col].median()
    train_data[col].fillna(median, inplace=True)
    test_data[col].fillna(median, inplace=True)
# Check if all missing values have been filled in the training data
train_data.isnull().sum()
# Check if all missing values have been filled in the test data
test_data.isnull().sum()
# Checking the number of unique categories in each categorical column
for col in categorical_cols:
    print(f"{col}: {train_data[col].nunique()} unique values")


HomePlanet: 3 unique values
CryoSleep: 2 unique values
Cabin: 6560 unique values
Destination: 3 unique values
VIP: 2 unique values


One hot encoding:

In [7]:
# Feature engineering: Extract the first letter from the Cabin column
train_data['CabinSection'] = train_data['Cabin'].apply(lambda x: x[0])
test_data['CabinSection'] = test_data['Cabin'].apply(lambda x: x[0])

# Update the list of categorical columns
categorical_cols = ['HomePlanet', 'CryoSleep', 'CabinSection', 'Destination', 'VIP']

# One-hot encoding of categorical columns
train_data_encoded = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# Print the first few rows of the encoded training data
train_data_encoded.head()


# Drop the 'Name' and 'Cabin' columns from the training and testing datasets
train_data_encoded.drop(['Name', 'Cabin'], axis=1, inplace=True)
test_data_encoded.drop(['Name', 'Cabin'], axis=1, inplace=True)

# Separate the target variable from the training data
X_train = train_data_encoded.drop('Transported', axis=1)
y_train = train_data_encoded['Transported']

# The test data is already separate
X_test = test_data_encoded

# Print the first few rows of the processed training data
X_train.head()


Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,CabinSection_B,CabinSection_C,CabinSection_D,CabinSection_E,CabinSection_F,CabinSection_G,CabinSection_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False,True,False,False,False,False,False,False,False,True,False
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,False,False,False,False,False,False,False,True,False,False,False,True,False
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,True,False,False,False,False,False,False,False,False,False,False,True,True
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,True,False,False,False,False,False,False,False,False,False,False,True,False
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,False,False,False,False,False,False,False,True,False,False,False,True,False


Modelling part:

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Perform 5-fold cross-validation to estimate the performance of the model
cv_scores = cross_val_score(rf, X_train.drop('PassengerId', axis=1), y_train, cv=5, scoring='accuracy')

# Print the mean cross-validation score
cv_scores.mean()


0.7898323243057959

Fit model:

In [12]:
# Fit the model to the entire training data
rf.fit(X_train.drop('PassengerId', axis=1), y_train)

# Make predictions on the test data
predictions = rf.predict(X_test.drop('PassengerId', axis=1))

# Create the submission dataframe
submission = pd.DataFrame({
    'PassengerId': X_test['PassengerId'],
    'Transported': predictions
})

# Print the first few rows of the submission dataframe
submission.head()


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


Save submission file:

In [13]:
# Save the submission dataframe to a CSV file
submission.to_csv('spaceship_titanic_submission.csv', index=False)
