In [12]:
# Begin by importing all required libraries
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from itertools import combinations
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import warnings


In [2]:
# Read data
train_df = pd.read_csv('csv_files/train.csv')

# Preview data
print('Raw data format:')
display(train_df.shape)

train_df.dropna(inplace=True)

train_df.drop(columns='Name', inplace=True)

# Split 'Cabin' into 'Deck' and 'Side'
train_df['Deck'] = train_df['Cabin'].str.split('/').str[0]
train_df['Side'] = train_df['Cabin'].str.split('/').str[2]

# Split the PassengerId column
train_df[['Group', 'Member']] = train_df['PassengerId'].str.split('_', expand=True)


train_df.drop(columns=['Cabin', 'PassengerId'], inplace=True) 

display(train_df.shape)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# List of columns to be label encoded
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Apply LabelEncoder to each categorical column
for column in categorical_columns:
    train_df[column] = label_encoder.fit_transform(train_df[column])


train_df.head()

Raw data format:


(8693, 14)

(6606, 15)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side,Group,Member
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,0,1,1
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True,5,1,2,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,0,1,3,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,0,1,3,2
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True,5,1,4,1


In [3]:
# Assuming 'df' is your DataFrame and 'features' is the list of feature column names
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side', 'Group', 'Member']

# Convert 'Transported' to a numeric column
train_df['Transported'] = train_df['Transported'].astype(int)
train_df['Group'] = pd.to_numeric(train_df['Group'])
train_df['Member'] = pd.to_numeric(train_df['Member'])

# Best score and feature set
best_score = 0
best_features = []

train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side,Group,Member
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,1
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1,5,1,2,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,1,3,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,1,3,2
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1,5,1,4,1


In [5]:
warnings.filterwarnings("ignore", category=UserWarning, module="lightgbm")

# Iterate over all possible non-empty combinations of features
for r in range(1, len(features) + 1):
    for combo in combinations(features, r):
        # Split the data
        X_train, X_valid, y_train, y_valid = train_test_split(train_df[list(combo)], train_df['Transported'], test_size=0.2, random_state=42)

        # Initialize and train the LightGBM model with verbosity turned off
        model = LGBMClassifier(verbose=-1)
        model.fit(X_train, y_train)

        # Make predictions and calculate the accuracy
        preds = model.predict(X_valid)
        score = accuracy_score(y_valid, preds)

        # Print the accuracy score for the current combination
        print(f'Combination: {combo}, Accuracy: {score}')

        # Update best score and features if current score is better
        if score > best_score:
            best_score = score
            best_features = combo

print(f'Best score: {best_score}')
print(f'Best features: {best_features}')

Combination: ('HomePlanet',), Accuracy: 0.575642965204236
Combination: ('CryoSleep',), Accuracy: 0.7223903177004538
Combination: ('Destination',), Accuracy: 0.540090771558245
Combination: ('Age',), Accuracy: 0.5741301059001512
Combination: ('VIP',), Accuracy: 0.5136157337367625
Combination: ('RoomService',), Accuracy: 0.6565809379727685
Combination: ('FoodCourt',), Accuracy: 0.6369137670196672
Combination: ('ShoppingMall',), Accuracy: 0.6232980332829047
Combination: ('Spa',), Accuracy: 0.6550680786686838
Combination: ('VRDeck',), Accuracy: 0.6649016641452344
Combination: ('Deck',), Accuracy: 0.6051437216338881
Combination: ('Side',), Accuracy: 0.5491679273827534
Combination: ('Group',), Accuracy: 0.5234493192133132
Combination: ('Member',), Accuracy: 0.5340393343419062
Combination: ('HomePlanet', 'CryoSleep'), Accuracy: 0.7223903177004538
Combination: ('HomePlanet', 'Destination'), Accuracy: 0.5816944024205749
Combination: ('HomePlanet', 'Age'), Accuracy: 0.6187594553706506
Combination

In [29]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Best features identified from your analysis
best_features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'FoodCourt', 
                 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side', 'Group']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[best_features],  # Use only the best features for training
    train_df['Transported'],  # This is the target variable
    test_size=0.2,  # Size of the validation set
    random_state=42  # Ensures reproducibility of your train-test split
)

# Initialize the LightGBM model
model = LGBMClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
preds = model.predict(X_valid)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_valid, preds)

print(f'Validation Accuracy: {accuracy}')


[LightGBM] [Info] Number of positive: 2658, number of negative: 2626
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 5284, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503028 -> initscore=0.012112
[LightGBM] [Info] Start training from score 0.012112
Validation Accuracy: 0.829803328290469


In [30]:
# Read data
test_df = pd.read_csv('csv_files/test.csv')

unchanged = pd.read_csv('csv_files/test.csv')

# train_df.dropna(inplace=True)

test_df.drop(columns='Name', inplace=True)

# Split 'Cabin' into 'Deck' and 'Side'
test_df['Deck'] = test_df['Cabin'].str.split('/').str[0]
test_df['Side'] = test_df['Cabin'].str.split('/').str[2]

# Split the PassengerId column
test_df[['Group', 'Member']] = test_df['PassengerId'].str.split('_', expand=True)

test_df.drop(columns=['Cabin', 'PassengerId'], inplace=True) 

display(test_df.shape)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# List of columns to be label encoded
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Apply LabelEncoder to each categorical column
for column in categorical_columns:
    test_df[column] = label_encoder.fit_transform(test_df[column])

test_df['Group'] = pd.to_numeric(train_df['Group'])


test_df.head()

test_df.isnull().sum()

(4277, 14)

HomePlanet         0
CryoSleep          0
Destination        0
Age               91
VIP                0
RoomService       82
FoodCourt        106
ShoppingMall      98
Spa              101
VRDeck            80
Deck               0
Side               0
Group           1040
Member             0
dtype: int64

In [31]:
# Imputers
median_imputer = SimpleImputer(strategy='median')

# List of numerical and categorical columns that need imputation
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Imputation
test_df[numerical_cols] = median_imputer.fit_transform(test_df[numerical_cols])

test_df.isnull().sum()

# Select best features from test_df for prediction
X_test = test_df[best_features]

X_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side,Group
0,0,1,2,27.0,0.0,0.0,0.0,0.0,6,1,1.0
1,0,0,2,19.0,9.0,0.0,2823.0,0.0,5,1,2.0
2,1,1,0,31.0,0.0,0.0,0.0,0.0,2,1,3.0
3,1,0,2,38.0,6652.0,0.0,181.0,585.0,2,1,3.0
4,0,0,2,20.0,0.0,635.0,0.0,0.0,5,1,4.0


In [32]:
# Assuming test_df has been preprocessed in the same way as train_df



# Make predictions on the test set
test_predictions = model.predict(X_test)

# Convert predictions to boolean (True/False) as required
test_predictions_bool = test_predictions.astype(bool)

# Create a DataFrame with 'PassengerId' and 'Transported'
output_df = pd.DataFrame({
    'PassengerId': unchanged['PassengerId'],
    'Transported': test_predictions_bool
})

# Display the formatted output
print(output_df.head())

# If you need to save this output to a CSV file
output_df.to_csv('predictions.csv', index=False)


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
