In [35]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the cleaned datasets
train_df = pd.read_csv('cleaned_train.csv')
test_df = pd.read_csv('cleaned_test.csv')

In [36]:
# Check for missing values in the test dataset
print(X_test.isnull().sum())

Pclass           0
Sex_male         0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
Title_Royalty    0
Age              0
Fare             1
FamilySize       0
Has_Cabin        0
Embarked_Q       0
Embarked_S       0
dtype: int64


In [37]:
# Define features and target
features = ['Pclass', 'Sex_male', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Age', 'Fare', 'FamilySize', 'Has_Cabin', 'Embarked_Q', 'Embarked_S']
target = 'Survived'

# Extract features and target from the training data
X_train_full = train_df[features]
y_train_full = train_df[target]

# Extract features from the test data (no target)
X_test = test_df[features]

In [38]:
# Define preprocessing for numeric and categorical features
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex_male', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Embarked_Q', 'Embarked_S']

# Create a ColumnTransformer to preprocess both numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)
    ]
)

# Apply preprocessing to both training and test data
X_train_processed = preprocessor.fit_transform(X_train_full)
X_test_processed = preprocessor.transform(X_test)

In [39]:
# Split the training data into training and validation subsets
X_train, X_val, y_train, y_val = train_test_split(X_train_processed, y_train_full, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)

Training set size: (712, 12)
Validation set size: (179, 12)


In [40]:
# Initialize and train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)
print("Validation Accuracy score:", accuracy_score(y_val, y_val_pred))
print("Validation Classification report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy score: 0.8324022346368715
Validation Classification report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.85       105
           1       0.78      0.82      0.80        74

    accuracy                           0.83       179
   macro avg       0.83      0.83      0.83       179
weighted avg       0.83      0.83      0.83       179



In [41]:
from sklearn.impute import SimpleImputer

# Define preprocessing for numeric and categorical features
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Pclass', 'Sex_male', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Embarked_Q', 'Embarked_S']

# Create a ColumnTransformer with imputation and scaling
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Impute missing numeric values
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing categorical values
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Apply preprocessing to both training and test data
X_train_processed = preprocessor.fit_transform(X_train_full)
X_test_processed = preprocessor.transform(X_test)

In [42]:
# Split the training data into training and validation subsets
X_train, X_val, y_train, y_val = train_test_split(X_train_processed, y_train_full, test_size=0.2, random_state=42)

# Initialize and train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)
print("Validation Accuracy score:", accuracy_score(y_val, y_val_pred))
print("Validation Classification report:\n", classification_report(y_val, y_val_pred))

# Make predictions on the test set
y_test_pred = model.predict(X_test_processed)

# Create a DataFrame to hold predictions for submission
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': y_test_pred})

# Save the predictions to a CSV file
submission_df.to_csv('submission.csv', index=False)

Validation Accuracy score: 0.8268156424581006
Validation Classification report:
               precision    recall  f1-score   support

           0       0.87      0.83      0.85       105
           1       0.77      0.82      0.80        74

    accuracy                           0.83       179
   macro avg       0.82      0.83      0.82       179
weighted avg       0.83      0.83      0.83       179



The dataset includes categorical data like passenger class, gender (coded as `Sex_male`), and titles (`Title_Miss`, `Title_Mr`, `Title_Mrs`, etc.). These categories were turned into numerical values so they can be used in the model. The dataset also has a mix of ranges. Categorical features are either 0 or 1. Numeric features like age and fare had different ranges but were standardized to make all values similar, helping the model work better.