In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [3]:
train_data = pd.read_csv('spaceship_train.csv')
test_data = pd.read_csv('spaceship_test.csv')

In [13]:
def preprocess_data(df, is_train = True):
    if 'Cabin' in df.columns:
        df[['Deck','Num', 'Side']] = df['Cabin'].str.split('/',expand = True)
        df.drop(columns = ['Cabin'], inplace = True)
        
    for col in ['HomePlanet', 'Destination','Deck', 'Side']:
        df[col].fillna(df[col].mode()[0], inplace = True)
    for col in ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
        df[col].fillna(df[col].median(), inplace = True)
    df['CryoSleep'].fillna(False, inplace = True)
    df['VIP'].fillna(False, inplace = True)
    
    label_cols = ['HomePlanet', 'Destination','Deck', 'Side']
    for col in label_cols:
        df[col] = LabelEncoder().fit_transform(df[col])
    
    df['Num'] = pd.to_numeric(df['Num'], errors = 'coerce')
    df['Num'].fillna(df['Num'].median(),inplace = True)
    
    if is_train:
        df.drop(columns = ['Name','PassengerId'], inplace = True)
    else:
        df.drop(columns = ['Name'], inplace = True)
    
    return df

In [14]:
train_data1 = preprocess_data(train_data.copy())
test_data1 = preprocess_data(test_data.copy(), is_train = False)

In [15]:
X = train_data1.drop(columns = ['Transported'])
y = train_data1['Transported'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 42)


In [27]:
rf_model = RandomForestClassifier(random_state = 42, n_estimators = 500)
gb_model = GradientBoostingClassifier(random_state = 42, n_estimators = 500)
xgb_model = XGBClassifier(random_state = 42, n_estimators = 500, use_label_encoder = False,eval_metric = 'logloss')

In [28]:
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [29]:
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)

In [30]:
rf_accuracy = accuracy_score(y_test, rf_pred)
gb_accuracy = accuracy_score(y_test, gb_pred)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

In [31]:
print("Random Forest Accuracy:", rf_accuracy)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("XGradient Boosting Accuracy:", xgb_accuracy)

Random Forest Accuracy: 0.799079754601227
Gradient Boosting Accuracy: 0.7914110429447853
XGradient Boosting Accuracy: 0.7944785276073619


In [35]:
best_model = max([(rf_model, rf_accuracy), (gb_model, gb_accuracy), (xgb_model, xgb_accuracy)], key=lambda x: x[1])[0]
test_preds = best_model.predict(test_data1.drop(columns=['PassengerId']))


In [36]:
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': test_preds.astype(bool)})
submission.to_csv('submission1.csv', index=False)