In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training dataset
train_file_path = 'train.csv'
train_data = pd.read_csv(train_file_path)

# Load the test dataset. The raw frame is kept untouched in `test_data`
# because its 'PassengerId' column is needed to build the submission file.
test_file_path = 'test.csv'
test_data = pd.read_csv(test_file_path)

# Columns that are not fed to the model. The same list is applied to both
# datasets so the train and test feature sets cannot drift apart.
columns_to_drop = ['Name', 'PassengerId', 'Cabin']
train_data = train_data.drop(columns=columns_to_drop)
test_data_1 = test_data.drop(columns=columns_to_drop)

# Assuming 'Transported' is the target variable
target = 'Transported'
features = train_data.drop(columns=[target])
labels = train_data[target]

# Identify categorical and numerical columns by dtype
categorical_cols = features.select_dtypes(include=['object']).columns
numerical_cols = features.select_dtypes(include=['int64', 'float64']).columns

# Numerical columns: fill missing values with the median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fitting are ignored at
# transform time instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Create a pipeline that chains preprocessing with the random forest classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=40, n_estimators=100))
])

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Fit the whole pipeline on the raw training split. Pipeline.fit runs
# fit_transform on the preprocessor and then fits the classifier on the result,
# so the preprocessing is learned from the training split only. X_train and
# X_val are left as the untransformed DataFrames, which keeps this cell safe
# to re-run and keeps each name bound to one kind of object.
pipeline.fit(X_train, y_train)

# Make predictions on the validation split; the pipeline applies the fitted
# preprocessing before the classifier predicts.
y_pred = pipeline.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)


# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


# Make predictions on the test data; preprocessing is applied by the pipeline
test_predictions = pipeline.predict(test_data_1)

Accuracy: 0.7786083956296722
Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.76      0.77       861
        True       0.77      0.80      0.78       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [11]:
# Show the labels predicted for the test data as this cell's output.
test_predictions

array([ True, False,  True, ...,  True,  True, False])

In [12]:
# Destination of the submission CSV
submission_file_path = 'submission.csv'

# Build the submission table in one chain: take the passenger IDs from the
# raw test frame, attach the predicted labels, and make sure the label column
# is stored as boolean (a no-op if it already is).
submission = (
    test_data[['PassengerId']]
    .assign(Transported=test_predictions)
    .astype({'Transported': bool})
)

# Write the table without the DataFrame index and report where it went
submission.to_csv(submission_file_path, index=False)
print(f'Submission file saved to {submission_file_path}')

Submission file saved to submission.csv


In [13]:
# Show the submission table (PassengerId, Transported) as this cell's output.
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
