### Logistic Regression: Parameter Selection and Submission

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train a logistic-regression pipeline on train.csv, tune it with a
# cross-validated grid search, report hold-out metrics, and predict labels for
# test.csv. Produces `grid_search` (fitted search) and `test_predictions`.

# One seed shared by the train/validation split and the classifier. The
# 'liblinear' and 'saga' solvers shuffle the data internally, so without a
# fixed random_state the selected parameters and scores can differ between
# runs of this cell.
RANDOM_STATE = 42

# Load the datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop columns that are not used as features. `test_data` is kept intact
# (it still carries PassengerId, which the submission cell reads), so the
# reduced test frame gets its own name.
dropped_cols = ['Name', 'PassengerId', 'Cabin']
train_data = train_data.drop(columns=dropped_cols)
test_data_1 = test_data.drop(columns=dropped_cols)

# Separate features and target
features = train_data.drop(columns=['Transported'])
labels = train_data['Transported']

# Identify categorical and numerical columns by dtype.
# NOTE: a column whose dtype is neither object nor int64/float64 is not routed
# to any transformer below and therefore does not reach the classifier.
categorical_cols = features.select_dtypes(include=['object']).columns
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for numerical data: fill gaps with the median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent value,
# then one-hot encode. Categories unseen during fit are ignored at predict
# time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Create a pipeline that includes preprocessing and the logistic regression
# model. Keeping preprocessing inside the pipeline means the imputers, scaler
# and encoder are re-fitted on each cross-validation training fold only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
])

# Parameters of the model to be optimized by GridSearchCV
param_grid = {
    'classifier__C': [0.1, 1, 10],  # Regularization parameter
    'classifier__solver': ['liblinear', 'saga']  # Algorithm to use in the optimization problem
}

# Setup the grid search with 10-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy')

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Run grid search on the training portion only; the validation rows stay
# unseen so they can serve as a hold-out check afterwards.
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the model with the best parameters on the validation set
y_pred = grid_search.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f'Validation Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

# Preprocess and predict on the test set. The fitted search applies the same
# preprocessing pipeline before classifying.
# NOTE: the estimator used here was fitted on X_train (80% of the labelled
# data), not on the full training set.
test_predictions = grid_search.predict(test_data_1)

Best parameters: {'classifier__C': 10, 'classifier__solver': 'liblinear'}
Best cross-validation score: 0.79
Validation Accuracy: 0.7757331799884991
Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.74      0.77       861
        True       0.76      0.81      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [3]:
# Show the labels predicted for the test set (the repr abbreviates long arrays)
test_predictions

array([ True, False,  True, ...,  True,  True,  True])

In [6]:
# Destination of the submission CSV
submission_file_path = 'submission3.csv'

# Pair every test PassengerId with its predicted label and make sure the
# label column is stored with a boolean dtype.
submission = (
    test_data[['PassengerId']]
    .assign(Transported=test_predictions)
    .astype({'Transported': bool})
)

# Write the table without the index column and confirm where it went
submission.to_csv(submission_file_path, index=False)
print(f'Submission file saved to {submission_file_path}')

Submission file saved to submission3.csv


In [7]:
# Display the submission table: one row per test PassengerId with its predicted Transported value
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
