# In [29]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_roc_curve
from sklearn import svm

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
import xgboost as xgb



# Read the training and test tables from their CSV files
train_csv_path = "/Users/cmorr/Downloads/train.csv"
test_csv_path = "/Users/cmorr/Downloads/test.csv"
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Data preprocessing and feature engineering
def preprocess_data(df):
    """Clean, impute, and encode a raw passenger table.

    The frame is modified in place and the same object is returned, so
    callers may either rebind the result or keep using their reference.

    Processing steps, in order:

    1. Split ``PassengerId`` on ``_`` into ``Group`` / ``PassengerNo`` and
       ``Cabin`` on ``/`` into ``Deck`` / ``CabinNo`` / ``Side``.
    2. Fill missing numerical values with the column median.
    3. Fill missing categorical values with the column mode, then
       label-encode each categorical column.
    4. Standardize the numerical columns.
    5. Convert the boolean flags to 0/1, treating missing values as False.
    6. Drop ``PassengerId``, ``Cabin``, ``CabinNo`` and ``PassengerNo``.

    Args:
        df: DataFrame holding at least the columns ``PassengerId``,
            ``Cabin``, ``Name``, ``HomePlanet``, ``Destination``,
            ``CryoSleep``, ``VIP``, ``Age``, ``RoomService``, ``FoodCourt``,
            ``ShoppingMall``, ``Spa`` and ``VRDeck``. Any other columns are
            passed through untouched.

    Returns:
        The same DataFrame, transformed.

    Raises:
        KeyError: If one of the required columns is missing.
        IndexError: If a categorical column contains no non-missing value
            (there is no mode to impute with).
    """
    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    categorical_features = ['HomePlanet', 'Destination', 'Group', 'Deck', 'Side', 'Name']
    boolean_features = ['CryoSleep', 'VIP']

    # Extract group and passenger number from PassengerId
    df[['Group', 'PassengerNo']] = df['PassengerId'].str.split('_', expand=True)

    # Extract deck, cabin number, and side from Cabin
    df[['Deck', 'CabinNo', 'Side']] = df['Cabin'].str.split('/', expand=True)

    # Impute missing numerical values with the median of the column
    for feature in numerical_features:
        df[feature] = df[feature].fillna(df[feature].median())

    # Impute missing categorical values with the mode, then label encode.
    # NOTE(review): the encoder (and the scaler below) is re-fit on whatever
    # frame is passed in, so separate calls for train and test data are not
    # guaranteed to share integer codes or scaling -- confirm this is
    # acceptable for the downstream model.
    for feature in categorical_features:
        filled = df[feature].fillna(df[feature].mode().iloc[0])
        df[feature] = LabelEncoder().fit_transform(filled)

    # Standardize numerical features
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

    # Encode boolean features as integers (missing -> False -> 0).
    # The result is assigned back to the column instead of calling
    # ``fillna(..., inplace=True)`` on the selection: that chained in-place
    # form does not update ``df`` when pandas Copy-on-Write is active, which
    # would leave NaN in the column and break the integer cast.
    for feature in boolean_features:
        df[feature] = df[feature].astype('boolean').fillna(False).astype(int)

    # Drop columns that are no longer needed
    df.drop(['PassengerId', 'Cabin', 'CabinNo', 'PassengerNo'], axis=1, inplace=True)

    return df

# Run the same preprocessing over the training table, then the test table
train_data, test_data = (
    preprocess_data(frame) for frame in (train_data, test_data)
)

# Separate the target from the features and hold out 20% of the rows for
# validation, keeping the class balance of the target in both parts
y = train_data['Transported']
X = train_data.drop(columns='Transported')
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter search space: each key maps to the list of candidate values
# to try for that setting
params = dict(
    n_estimators=[100, 300, 500],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.3, 0.5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

#model = xgb.XGBClassifier(random_state=42, objective='binary:logistic', n_jobs=-1, eval_metric='auc')
#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#grid = GridSearchCV(estimator=model, param_grid=params, scoring='f1', cv=skf, verbose=2, n_jobs=-1)

#grid.fit(X_train, y_train)
#best_params = grid.best_params_

# Train the final model with the best parameters
#final_model = xgb.XGBClassifier(**best_params, random_state=42, objective='binary:logistic', n_jobs=-1, eval_metric='auc')
#final_model.fit(X_train, y_train)


# Create a logistic regression model
#model = LogisticRegression()

# Fit the model to the training data
#model.fit(X_val, y_val)

# Predict the target variable for the test data
#y_pred = model.predict(X)

#plot_roc_curve(model, X_val, y_val)

# Evaluate the model's performance
#from sklearn.metrics import accuracy_score, confusion_matrix
#print('Accuracy:', accuracy_score(y, y_pred))
#print('Confusion matrix:', confusion_matrix(y, y_pred))

# Support vector classifier: fit on the training split, then report the
# fraction of validation rows it labels correctly.
# NOTE(review): gamma is passed alongside a linear kernel -- check the SVC
# documentation for whether it has any effect with this kernel.
SVM_model = svm.SVC(C=3, kernel='linear', gamma='auto').fit(X_train, y_train)

val_predictions = SVM_model.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print("Accuracy:", accuracy)










