In [20]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [21]:
# Retrieve Training and Testing Data 
df_train = pd.read_csv('data/trainClean.csv')
df_test = pd.read_csv('data/testClean.csv')

# Setup features and target
X_train = df_train.drop(columns='Transported')
y_train = df_train['Transported']

# Preprocessing - Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Define the logistic regression model
logistic_regression = LogisticRegression()

In [22]:
# Define hyperparameters for tuning
param_grid = {
    'penalty': ['l1', 'l2', 'None'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': [100, 200, 300, 400, 500]
}

# Instantiate GridSearchCV tuner
grid_search = GridSearchCV(
    logistic_regression,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

In [23]:
# Perform the hyperparameter search
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2'}


350 fits failed out of a total of 525.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
175 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\tjala\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\tjala\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\tjala\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.

In [24]:
# Build the logistic regression model using the best hyperparameters
best_model = LogisticRegression(
    penalty=best_params['penalty'],
    C=best_params['C'],
    max_iter=best_params['max_iter']
)

# Train the best model using all training data
best_model.fit(X_train_scaled, y_train)

# Prepare test data for predictions
X_test_scaled = scaler.transform(df_test)

# Predict using the best model on test data
predictions = best_model.predict(X_test_scaled)
pred = (predictions).astype(int)

In [25]:
#CSV containing predictions
tfMap = { 0: False, 1: True }

kaggleCSV = pd.DataFrame(columns=['PassengerId', 'Transported'])
kaggleCSV['PassengerId'] = df_test['PassengerId']
kaggleCSV['Transported'] = [tfMap[i] for i in pred]

kaggleCSV.to_csv('results/logistic_results.csv', index=False)