## 

##  Load data

In [None]:
import pandas as pd

# 1. 加载数据
train_data = pd.read_csv('/Users/laurie/Library/CloudStorage/OneDrive-HKUST(Guangzhou)/hkust-gz/23fall/DSAA6000G/SPACESHIP-TITANIC/data/train.csv')
print(train_data.dtypes)
print(train_data.head())


test_data = pd.read_csv('/Users/laurie/Library/CloudStorage/OneDrive-HKUST(Guangzhou)/hkust-gz/23fall/DSAA6000G/SPACESHIP-TITANIC/data/test.csv')
print(test_data.dtypes)
print(test_data.head())

## Step 1: Data Cleaning and Preparation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Handling missing values
train_data.fillna(train_data.mean(), inplace=True)
test_data.fillna(test_data.mean(), inplace=True)

# Handling categorical features
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

## Step 2: Feature Engineering

In [None]:
# Function for feature engineering
def feature_engineering(data, scaler=None):
    # Handle missing values for numeric columns
    numeric_cols = data.select_dtypes(include=['number']).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Handle categorical features with one-hot encoding
    data_encoded = pd.get_dummies(data, columns=['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name'], drop_first=True)

    # Convert boolean to numeric
    bool_columns = data_encoded.select_dtypes(include=bool).columns
    data_encoded[bool_columns] = data_encoded[bool_columns].astype(int)

    # Scale numeric features
    if scaler is None:
        scaler = MinMaxScaler()
        data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.fit_transform(
            data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])
    else:
        data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.transform(
            data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])
    
    return data_encoded, scaler

In [None]:
# Perform feature engineering
train_data_encoded, scaler = feature_engineering(train_data)

## Step 3: Splitting the dataset into train and validation sets

In [None]:
X = train_data.drop(['Transported'], axis=1)
y = train_data['Transported']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 4: Model Building and Training

In [None]:
# Function to train and evaluate a model
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    return model, val_accuracy

In [None]:
# RandomForest Model
clf_rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [5],
    'max_depth': [5],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}
grid_search_rf = GridSearchCV(clf_rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)
best_rf_clf = grid_search_rf.best_estimator_

# Decision Tree with Hyperopt
def hyperoptimize_decision_tree(X_train, y_train):
    space = {
        'max_depth': hp.choice('max_depth', range(1, 20)),
        'min_samples_split': hp.uniform('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 5))
    }

    def objective(params):
        clf = DecisionTreeClassifier(**params)
        score = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5).mean()
        return -score

    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=Trials())
    return DecisionTreeClassifier(**best)

best_dt_clf = hyperoptimize_decision_tree(X_train, y_train)
best_dt_clf.fit(X_train, y_train)

# Naive Bayes Model
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# XGBoost Model
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)


## Step 5: Prediction and Model Evaluation

In [None]:
# Evaluate RandomForest
y_val_pred_rf = best_rf_clf.predict(X_val)
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print(f'RandomForest Validation Accuracy: {val_accuracy_rf}')

# Evaluate Decision Tree
y_val_pred_dt = best_dt_clf.predict(X_val)
val_accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
print(f'Decision Tree Validation Accuracy


## Step 6: Generating predictions on the test set and saving as a submission file

In [None]:
test_predictions = model.predict(test_data)
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': test_predictions})
submission.to_csv('submission.csv', index=False)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load data
data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
print(data.dtypes)
print(data.head())

# Feature Engineering Steps

# Handling missing values
# Handling missing values for numeric columns
numeric_cols = data.select_dtypes(include=['number']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Handling categorical features with one-hot encoding
data_encoded = pd.get_dummies(data, columns=['PassengerId','HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP','Name'], drop_first=True)
print(data_encoded.dtypes)
print(data_encoded.head())

# Convert boolean to numeric
bool_columns = data_encoded.select_dtypes(include=bool).columns
data_encoded[bool_columns] = data_encoded[bool_columns].astype(int)


# Scale numeric features
scaler = MinMaxScaler()
data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.fit_transform(
    data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])


print(data_encoded.dtypes)
print(data_encoded.head())


# Splitting the dataset into training and testing sets
X = data_encoded.drop('Transported', axis=1)  # Features
y = data_encoded['Transported']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Initializing classifier 
clf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [5],
    'max_depth': [7],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Cross-validation with the best hyperparameters
best_clf = grid_search.best_estimator_
cv_scores = cross_val_score(best_clf, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated Accuracy with HPO: {cv_scores.mean()}')

# Training the classifier with the best hyperparameters
best_clf.fit(X_train, y_train)

# Making predictions on the test set
y_pred = best_clf.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on Test Set: {accuracy}')

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juann

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Load training data
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

print(train_data.dtypes)
print(train_data.head(10))

# Feature Engineering Steps
# Handle missing values for numeric columns
numeric_cols = train_data.select_dtypes(include=['number']).columns
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())

# Handle categorical features with one-hot encoding
train_data_encoded = pd.get_dummies(train_data, columns=['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name'], drop_first=True)

# Convert boolean to numeric
bool_columns = train_data_encoded.select_dtypes(include=bool).columns
train_data_encoded[bool_columns] = train_data_encoded[bool_columns].astype(int)

# Scale numeric features
scaler = MinMaxScaler()
train_data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.fit_transform(
    train_data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])

print(train_data_encoded.dtypes)
print(train_data_encoded.head(10))

# Splitting the dataset into training and validation sets
X = train_data_encoded.drop('Transported', axis=1)  # Features
y = train_data_encoded['Transported']  # Target variable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing RandomForest classifier
clf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [5],
    'max_depth': [5],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Training the classifier with the best hyperparameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

# Making predictions on the validation set
y_val_pred = best_clf.predict(X_val)

# Calculating accuracy on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy on Validation Set: {val_accuracy}')


# Decision Tree with Hyperopt for hyperparameter tuning
space = {
    'max_depth': hp.choice('max_depth', range(1, 20)),
    'min_samples_split': hp.uniform('min_samples_split', 0.1, 1.0),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 5))
}

def objective(params):
    clf = DecisionTreeClassifier(**params)
    score = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5).mean()
    return -score

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=Trials())
best_clf = DecisionTreeClassifier(**best)
best_clf.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = best_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Decision Tree Validation Accuracy: {val_accuracy}')

# Naive Bayes Model
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
y_val_pred_nb = nb_clf.predict(X_val)
nb_val_accuracy = accuracy_score(y_val, y_val_pred_nb)
print(f'Naive Bayes Validation Accuracy: {nb_val_accuracy}')

# XGBoost for comparison
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
y_val_pred_xgb = xgb_clf.predict(X_val)
xgb_val_accuracy = accuracy_score(y_val, y_val_pred_xgb)
print(f'XGBoost Validation Accuracy: {xgb_val_accuracy}')



# Load test data
test_data = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Preprocess the test data

# Fill missing values for numeric columns in test data
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

# Apply one-hot encoding to categorical features in test data
# Ensure that test data has the same dummy variables as the train data
test_data_encoded = pd.get_dummies(test_data, columns=['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name'], drop_first=True)
test_data_encoded = test_data_encoded.reindex(columns=train_data_encoded.columns, fill_value=0)

# Convert boolean to numeric in test data
bool_columns_test = test_data_encoded.select_dtypes(include=bool).columns
test_data_encoded[bool_columns_test] = test_data_encoded[bool_columns_test].astype(int)

# Scale numeric features in test data
# Note: Use the same scaler object used for the train data
test_data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']] = scaler.transform(
    test_data_encoded[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']])

# Ensure that the test data has the same features as the training data
test_data_processed = test_data_encoded[X_train.columns]

# Making predictions on the test set
test_predictions = best_clf.predict(test_data_processed)

# Creating the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions.astype(bool)
})
submission.to_csv('submission.csv', index=False)

print(submission.dtypes)
print(submission.head(10))


PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object
  PassengerId HomePlanet CryoSleep  Cabin    Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P    TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S    TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S    TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S    TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S    TRAPPIST-1e  16.0  False   
5     0005_01      Earth     False  F/0/P  PSO J318.5-22  44.0  False   
6     0006_01      Earth     False  F/2/S    TRAPPIST-1e  26.0  False   
7     0006_02      Earth      True  G/0/S    TRAPPIST-1e  28.0  Fa

NameError: name 'hp' is not defined