In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Read the raw training and test CSVs
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Keep the test-set IDs (needed for the submission file) and the training
# labels (re-attached after preprocessing) before the frames are merged
test_passenger_ids = test_data['PassengerId']
train_target = train_data['Transported']

# Stack the label-free training rows on top of the test rows so that both
# sets go through exactly the same feature engineering
combined_data = pd.concat(
    (train_data.drop(columns=['Transported']), test_data),
    ignore_index=True,
)

# Feature Engineering function
def feature_engineering(data, scaler=None):
    """Turn raw passenger records into an all-numeric feature table.

    Steps, in order:
      1. Fill missing values in every numeric column with that column's mean.
      2. Split ``Cabin`` on ``/`` and keep the first part as ``Deck`` and the
         third part as ``Side``; ``Cabin`` itself is then dropped (the middle
         part, the cabin number, is not used as a feature).
      3. One-hot encode ``HomePlanet``, ``CryoSleep``, ``Destination``,
         ``VIP``, ``Deck`` and ``Side`` with ``drop_first=True``.
      4. Cast boolean columns to ``int``.
      5. Rescale ``Age`` and the five spending columns with ``scaler``.
      6. Drop the identifier columns ``PassengerId`` and ``Name``.

    The input frame is NOT modified; all work happens on a copy, so the
    function can safely be called again on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``PassengerId``, ``Name``, ``Cabin``,
        ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``, ``Age``,
        ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and
        ``VRDeck``.
    scaler : object, optional
        Any object exposing ``fit_transform(X)``. Defaults to a fresh
        ``MinMaxScaler()``, which is what this function always used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same row order and index as ``data``.

    Notes
    -----
    Imputation means and scaling statistics are computed from whatever rows
    are passed in. Missing categorical values get no indicator column of
    their own (``get_dummies`` default), so such rows are 0 in every
    indicator of that feature.
    """
    scaled_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

    # Work on a copy so the caller's frame keeps its Cabin column and its NaNs
    data = data.copy()

    # Handle missing values for numeric columns
    numeric_cols = data.select_dtypes(include=['number']).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Process Cabin column: split once, keep deck (part 0) and side (part 2)
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Side'] = cabin_parts[2]
    data = data.drop(columns=['Cabin'])

    # Handle categorical features with one-hot encoding
    data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Convert boolean to numeric
    bool_columns = data_encoded.select_dtypes(include=bool).columns
    data_encoded[bool_columns] = data_encoded[bool_columns].astype(int)

    # Scale numeric features
    if scaler is None:
        scaler = MinMaxScaler()
    data_encoded[scaled_cols] = scaler.fit_transform(data_encoded[scaled_cols])

    # Drop identifier columns that carry no predictive signal here
    return data_encoded.drop(columns=['PassengerId', 'Name'])

# Apply feature engineering to combined data
combined_data_processed = feature_engineering(combined_data)

# Split back into train and test sets by position: the combined frame was
# built as [train rows, test rows], so the first len(train_data) rows are
# the training set and the remainder is the test set
n_train = len(train_data)
train_data_processed = combined_data_processed.iloc[:n_train].copy()
test_data_processed = combined_data_processed.iloc[n_train:].copy()
assert len(test_data_processed) == len(test_passenger_ids), \
    "test rows lost or gained during preprocessing"

# Add the target variable back to the train set
train_data_processed['Transported'] = train_target

# Model training and evaluation
X = train_data_processed.drop('Transported', axis=1)
y = train_data_processed['Transported']

# Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree model with default parameters
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out validation rows
y_pred = model.predict(X_val)
print(f'Accuracy Score with Decision Tree: {accuracy_score(y_val, y_pred)}')

# Make predictions on the test set
test_predictions = model.predict(test_data_processed)

# Prepare submission file: one row per test passenger, ID plus predicted label
submission = pd.DataFrame({'PassengerId': test_passenger_ids, 'Transported': test_predictions})
submission.to_csv('submission.csv', index=False)
print(submission.dtypes)
print(submission.head(5))

Accuracy Score with Decision Tree: 0.7320299022426682
PassengerId    object
Transported      bool
dtype: object
  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import numpy as np

# Load data
# Paths are relative to the notebook's working directory so the notebook is
# not tied to one user's machine.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Save the target variable and passenger IDs from test set for later use
train_target = train_data['Transported']
test_passenger_ids = test_data['PassengerId']

# Combine train and test data (training rows first, labels removed) so both
# sets go through the same feature engineering
combined_data = pd.concat([train_data.drop('Transported', axis=1), test_data], ignore_index=True)

# Feature Engineering function
def feature_engineering(data, scaler=None):
    """Turn raw passenger records into an all-numeric feature table.

    Steps, in order:
      1. Fill missing values in every numeric column with that column's mean.
      2. Split ``Cabin`` on ``/`` and keep the first part as ``Deck`` and the
         third part as ``Side``; ``Cabin`` itself is then dropped (the middle
         part, the cabin number, is not used as a feature).
      3. One-hot encode ``HomePlanet``, ``CryoSleep``, ``Destination``,
         ``VIP``, ``Deck`` and ``Side`` with ``drop_first=True``.
      4. Cast boolean columns to ``int``.
      5. Rescale ``Age`` and the five spending columns with ``scaler``.
      6. Drop the identifier columns ``PassengerId`` and ``Name``.

    The input frame is NOT modified; all work happens on a copy, so the
    function can safely be called again on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``PassengerId``, ``Name``, ``Cabin``,
        ``HomePlanet``, ``CryoSleep``, ``Destination``, ``VIP``, ``Age``,
        ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and
        ``VRDeck``.
    scaler : object, optional
        Any object exposing ``fit_transform(X)``. Defaults to a fresh
        ``MinMaxScaler()``, which is what this function always used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same row order and index as ``data``.

    Notes
    -----
    Imputation means and scaling statistics are computed from whatever rows
    are passed in. Missing categorical values get no indicator column of
    their own (``get_dummies`` default), so such rows are 0 in every
    indicator of that feature.
    """
    scaled_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

    # Work on a copy so the caller's frame keeps its Cabin column and its NaNs
    data = data.copy()

    # Handle missing values for numeric columns
    numeric_cols = data.select_dtypes(include=['number']).columns
    data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Process Cabin column: split once, keep deck (part 0) and side (part 2)
    cabin_parts = data['Cabin'].str.split('/', expand=True)
    data['Deck'] = cabin_parts[0]
    data['Side'] = cabin_parts[2]
    data = data.drop(columns=['Cabin'])

    # Handle categorical features with one-hot encoding
    data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Convert boolean to numeric
    bool_columns = data_encoded.select_dtypes(include=bool).columns
    data_encoded[bool_columns] = data_encoded[bool_columns].astype(int)

    # Scale numeric features
    if scaler is None:
        scaler = MinMaxScaler()
    data_encoded[scaled_cols] = scaler.fit_transform(data_encoded[scaled_cols])

    # Drop identifier columns that carry no predictive signal here
    return data_encoded.drop(columns=['PassengerId', 'Name'])

# Apply feature engineering to combined data
combined_data_processed = feature_engineering(combined_data)

# Split back into train and test sets by position: the combined frame was
# built as [train rows, test rows], so the first len(train_data) rows are
# the training set and the remainder is the test set
n_train = len(train_data)
train_data_processed = combined_data_processed.iloc[:n_train].copy()
test_data_processed = combined_data_processed.iloc[n_train:].copy()
assert len(test_data_processed) == len(test_passenger_ids), \
    "test rows lost or gained during preprocessing"

# Add the target variable back to the train set
train_data_processed['Transported'] = train_target

# Model training and evaluation
X = train_data_processed.drop('Transported', axis=1)
y = train_data_processed['Transported']

# Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree model with default parameters
model = DecisionTreeClassifier(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model on the held-out validation rows
y_pred = model.predict(X_val)
print(f'Accuracy Score with Decision Tree: {accuracy_score(y_val, y_pred)}')

# Make predictions on the test set
test_predictions = model.predict(test_data_processed)

# Prepare submission file: one row per test passenger, ID plus predicted label
submission = pd.DataFrame({'PassengerId': test_passenger_ids, 'Transported': test_predictions})
submission.to_csv('submission.csv', index=False)
print(submission.dtypes)
print(submission.head(5))

Accuracy Score with Decision Tree: 0.7320299022426682
PassengerId    object
Transported      bool
dtype: object
  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True


  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
  array.dtypes.apply(is_sparse).any()):
