In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the Titanic training set from a local CSV and print its first five rows.
# NOTE(review): the message says "cleaned", but this cell only loads the file;
# no cleaning step is performed here.
train_csv = r"C:\Python 2k25\datasets\titanic\train.csv"
df = pd.read_csv(train_csv)
preview = df.head(5)
print("Dataset has been loaded and cleaned! \n", preview)

Dataset has been loaded and cleaned! 
    PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0        

In [None]:
# Quick look at the 'Embarked' column: its dtype and the distinct values it holds.
# These are printed explicitly because a notebook cell only displays its last
# expression; `unique` must be *called* to return the values.
print(df['Embarked'].dtype)
print(df['Embarked'].unique())

# Feature matrix: every column except the target and the four columns dropped
# here (PassengerId, Name, Ticket, Cabin). Target vector: the 'Survived' column.
X = df.drop(columns=['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'])
y = df['Survived']

# Per-column count of missing values, shown as the cell output.
# (`isnull` is an alias of `isna`, so a single call is enough.)
df.isna().sum()

In [None]:
# Split the feature columns by dtype so each group gets its own preprocessing.
num_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_features = list(X.select_dtypes(include=['object', 'category']).columns)
print('Numerical: ', num_features)
print('Catagory: ', cat_features)

# Numeric columns: fill missing values with the column median.
# A scaling step is left disabled here: ('scaler', StandardScaler())
scaled_num = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (dropping the first level of each feature).
scaled_cat = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group to its preprocessing pipeline.
preprocess = ColumnTransformer([
    ('num', scaled_num, num_features),
    ('cat', scaled_cat, cat_features),
])

# Full estimator: preprocessing followed by a random forest classifier.
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
pipe = Pipeline(steps=[
    ('pre', preprocess),
    ('model', rf_model),
])

In [None]:


# Hyper-parameter grid for the pipeline's 'model' step (hence the 'model__' prefix).
# 5 * 6 * 3 * 3 * 3 = 810 combinations, each evaluated on 5 folds (4050 fits).
parameters = [dict(
    model__n_estimators=[300, 500, 800, 1000, 1200],
    model__max_depth=[None, 8, 10, 12, 15, 20],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=['sqrt', 'log2', None],  # None is tried as well
)]

# Shuffled, stratified 5-fold split with a fixed seed.
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='accuracy',
    cv=cv_folds,
)
grid.fit(X, y)
print('Model has been trained successfully!')

# Report the winning pipeline, its parameters, and its mean cross-validated accuracy.
for summary in (grid.best_estimator_, grid.best_params_, grid.best_score_):
    print(summary)

In [None]:
# Persist the best pipeline found by the grid search (preprocessing + model)
# so it can be reloaded later without re-running the search.
joblib.dump(grid.best_estimator_, 'titanic_model_pipeline.pkl')

In [None]:
# Load the test set and drop the same non-feature columns that were removed
# from the training data (there is no 'Survived' column to drop here).
test_csv = r"C:\Python 2k25\datasets\titanic\test.csv"
df_test = pd.read_csv(test_csv)
X_test = df_test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Predict with the best pipeline from the grid search.
best_pipeline = grid.best_estimator_
predictions = best_pipeline.predict(X_test)

# Two-column submission file: PassengerId and the predicted Survived label.
submission = df_test[['PassengerId']].assign(Survived=predictions)
submission.to_csv('submission.csv', index=False)

In [None]:
        # NOTE: The grid-search cell takes 9-10 hours to run and save the best model. To skip it,
        # comment out the grid-search cell, then uncomment and run the code below instead: it defines
        # the pipeline directly with the best parameters found (no grid search) and goes straight to
        # saving the model. After saving the model, you can comment this code out again and uncomment
        # the grid-search cell to show that the grid search produces these same results, so the two
        # models will not mismatch or contradict each other.



# # 2. Create features and target
# # Drop columns that are not useful for prediction
# X = df.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
# y = df['Survived']

# # 3. Define numeric and categorical features
# num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
# cat_features = X.select_dtypes(include=['object']).columns.tolist()

# # 4. Preprocessing pipelines
# numeric_pipeline = Pipeline([
#     ('impute', SimpleImputer(strategy='median'))  # No StandardScaler needed for RF
# ])

# categorical_pipeline = Pipeline([
#     ('impute', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
# ])

# # 5. ColumnTransformer to apply preprocessing separately
# preprocess = ColumnTransformer([
#     ('num', numeric_pipeline, num_features),
#     ('cat', categorical_pipeline, cat_features)
# ])

# # 6. Final pipeline with the best parameters from your GridSearchCV
# final_pipe = Pipeline([
#     ('pre', preprocess),
#     ('model', RandomForestClassifier(
#         n_estimators=300,
#         max_depth=None,
#         max_features=None,
#         min_samples_leaf=1,
#         min_samples_split=10,
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

# # 7. Fit the pipeline on the full training data
# final_pipe.fit(X, y)

# print("Model trained successfully on full data!")

# # 8. Save the complete pipeline (preprocessing + model)
# import joblib
# joblib.dump(final_pipe, 'titanic_model_pipeline.pkl')

# print("Model saved as 'titanic_model_pipeline.pkl'")