<a href="https://colab.research.google.com/github/MrBigBrane/Machine-Learning/blob/main/Spaceship_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [13]:
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")

## Preprocessing

In [29]:
X_train_data = train_data.drop("Transported", axis=1)
y_train_data = train_data["Transported"]

X_train_data.drop("Name", axis=1, inplace=True)

X_test_data = test_data.drop("Name", axis=1)
X_test_copy = X_test_data.copy()

In [15]:
# Checking how much of data is null (train)
for col in X_train_data.columns:
    if (X_train_data[col].isnull().sum()/X_train_data.shape[0])*100>50:
        print(col, (X_train_data[col].isnull().sum()/X_train_data.shape[0])*100)
    else:
        print(col, (X_train_data[col].isnull().sum()/X_train_data.shape[0])*100)

PassengerId 0.0
HomePlanet 2.312205222592891
CryoSleep 2.4962613597147127
Cabin 2.289198205452663
Destination 2.093638559760727
Age 2.0591280340503855
VIP 2.3352122397331185
RoomService 2.082135051190613
FoodCourt 2.105142068330841
ShoppingMall 2.392729782583688
Spa 2.105142068330841
VRDeck 2.1626596111814105


In [16]:
# Checking how much of data is null (train)
for col in X_test_data.columns:
    if (X_train_data[col].isnull().sum()/X_train_data.shape[0])*100>50:
        print(col, (X_test_data[col].isnull().sum()/X_test_data.shape[0])*100)
    else:
        print(col, (X_test_data[col].isnull().sum()/X_test_data.shape[0])*100)

PassengerId 0.0
HomePlanet 2.034136076689268
CryoSleep 2.1744213233574934
Cabin 2.338087444470423
Destination 2.1510404489127897
Age 2.127659574468085
VIP 2.1744213233574934
RoomService 1.917231704465747
FoodCourt 2.4783726911386488
ShoppingMall 2.2913256955810146
Spa 2.3614683189151275
VRDeck 1.8704699555763384


In [17]:
# Get column names that have object/string data
string_cols = X_test_data.select_dtypes(include=['object']).columns

# Get column names that have numerical data
num_cols = X_test_data.select_dtypes(include=['int64', 'float64']).columns

# Filling n/a values in columns with numerical data with median of each column
X_train_data[num_cols] = X_train_data[num_cols].fillna(X_train_data[num_cols].median())
X_test_data[num_cols] = X_test_data[num_cols].fillna(X_train_data[num_cols].median())

# Filling n/a values in columns with letter/string data with the most common value of each column
X_train_data[string_cols] = X_train_data[string_cols].fillna(X_train_data[string_cols].mode().iloc[0])
X_test_data[string_cols] = X_test_data[string_cols].fillna(X_train_data[string_cols].mode().iloc[0])

  X_train_data[string_cols] = X_train_data[string_cols].fillna(X_train_data[string_cols].mode().iloc[0])
  X_test_data[string_cols] = X_test_data[string_cols].fillna(X_train_data[string_cols].mode().iloc[0])


In [18]:
# Get columns that are boolean
bool_cols = X_test_data.select_dtypes(include=['bool']).columns

print(bool_cols)

# Convert boolean columns to integers
X_train_data[bool_cols] = X_train_data[bool_cols].astype(int)
X_test_data[bool_cols] = X_test_data[bool_cols].astype(int)

Index(['CryoSleep', 'VIP'], dtype='object')


In [19]:
X_train_data["PassengerGroup"] = X_train_data['PassengerId'].str[:4].astype(int)
X_train_data["PassengerNumber"] = X_train_data['PassengerId'].str[5:].astype(int)
X_train_data.drop("PassengerId", axis=1, inplace=True)

X_test_data["PassengerGroup"] = X_test_data['PassengerId'].str[:4].astype(int)
X_test_data["PassengerNumber"] = X_test_data['PassengerId'].str[5:].astype(int)
X_test_data.drop("PassengerId", axis=1, inplace=True)

# Update string cols to the non-boolean columns
string_cols = X_train_data.select_dtypes(include=['object']).columns

## One Hot Encoding

In [21]:
# One hot encode only the string value columns
X_train_encoded = pd.get_dummies(X_train_data, columns=string_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_data, columns=string_cols, drop_first=True)

common_string_cols = list(set(X_train_encoded) & set(X_test_encoded.columns))
X_train_encoded = X_train_encoded[common_string_cols]
X_test_encoded = X_test_encoded[common_string_cols]

non_binary_columns_test = [
    col for col in X_test_encoded.columns if X_train_encoded[col].nunique() != 2
]
binary_columns_test = [
    col for col in X_test_encoded.columns if X_train_encoded[col].nunique() == 2
]

# Cast boolean ohe to int
X_train_encoded[binary_columns_test] = X_train_encoded[binary_columns_test].astype(int)
X_test_encoded[binary_columns_test] = X_test_encoded[binary_columns_test].astype(int)

## Data Scaling/Splitting

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X_train_encoded, y_train_data, test_size=0.2)


# Scale non-binary data columns with min max scaler (worked best)
scaler = MinMaxScaler()
X_train[non_binary_columns_test] = scaler.fit_transform(X_train[non_binary_columns_test])

X_test[non_binary_columns_test] = scaler.transform(X_test[non_binary_columns_test])

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification

# LinearSVC model
model = LinearSVC(max_iter=10000)

# Grid of parameters
param_grid = {
    'C': [10, 100, 150, 250, 500],
    'loss': ['squared_hinge'],
    'penalty': ['l2'],
    'dual': [False]  # dual=False is required for l1 penalty
}

# Grid search with cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

In [None]:
best_params = grid_search.best_params_

final_model = LinearSVC(
    C=best_params['C'],
    loss=best_params['loss'],
    penalty=best_params['penalty'],
    dual=best_params['dual']
)

# Train the model
final_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

y_pred = final_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

## Random Forest

In [None]:
pip install tqdm

In [131]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=25,               # Number of parameter settings sampled
    scoring='accuracy',      # Metric to optimize
    cv=5,                    # 5-fold cross-validation
    verbose=1,               # Show progress
    random_state=42,         # For reproducibility
    n_jobs=-1                # Use all available CPU cores
)

# Fit the RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

best_params = random_search.best_params_

print("Best Parameters:", random_search.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits




KeyboardInterrupt: 

In [122]:
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

TypeError: RandomForestClassifier.__init__() got an unexpected keyword argument 'C'

## XGBoost

In [132]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.3.1

Found existing installation: scikit-learn 1.6.0
Uninstalling scikit-learn-1.6.0:
  Successfully uninstalled scikit-learn-1.6.0
Collecting scikit-learn==1.3.1
  Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.3.1


In [24]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.3]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}


In [25]:
best_params = grid_search.best_params_
best_xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss', random_state=42)
best_xgb.fit(X_train, y_train)
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.8102357676825762


In [28]:
submission_pred = best_xgb.predict(X_test_encoded)

print(submission_pred)

[1 0 1 ... 1 0 1]


In [30]:
ans=pd.DataFrame({'PassengerId':X_test_copy['PassengerId'],'Transported':submission_pred})
ans.to_csv('submission.csv',index=False)