In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three competition CSVs in one pass; the names are bound in the
# same order as the paths below.
# NOTE: gender_submission_df is loaded here but not referenced by any other
# cell visible in this notebook.
train_df, test_df, gender_submission_df = map(pd.read_csv, (
    '/kaggle/input/titanic/train.csv',
    '/kaggle/input/titanic/test.csv',
    '/kaggle/input/titanic/gender_submission.csv',
))

# Preview the first rows of the train and test frames.
for heading, frame in (
    ("Train Dataset Overview:", train_df),
    ("Test Dataset Overview:", test_df),
):
    print(heading)
    display(frame.head())


In [None]:
# Count missing values per column in each dataset.
# These two Series were previously referenced without being defined in any
# visible cell, which raised NameError on a fresh kernel.
missing_train = train_df.isnull().sum()
missing_test = test_df.isnull().sum()

# Create a summary of missing values for train and test datasets.
# The two Series are aligned on column name, so a column that exists in only
# one frame shows NaN for the other. The index (column names) is turned into a
# regular 'Feature' column for readability.
missing_summary = (
    pd.DataFrame({
        'Missing Values (Train)': missing_train,
        'Missing Values (Test)': missing_test,
    })
    .sort_values(by='Missing Values (Train)', ascending=False)
    .rename_axis('Feature')
    .reset_index()
)

# Display the missing values summary
print("Missing Values Summary:")
display(missing_summary)


In [None]:
# Replace the `Cabin` column with a 0/1 `HasCabin` indicator in both frames.
# `pop` removes `Cabin` in place and returns its values, so the flag is derived
# and the original column dropped in a single step.
for frame in (train_df, test_df):
    frame['HasCabin'] = frame.pop('Cabin').notna().astype(int)

# Show the first rows of each frame after the change.
print("Updated Train Dataset:")
display(train_df.head())

print("Updated Test Dataset:")
display(test_df.head())


In [None]:
# Step 1: Define a function to fill missing age values based on Pclass and Sex
def fill_missing_age(df, reference_df=None):
    """Fill missing 'Age' values with the median age of each (Pclass, Sex) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Pclass', 'Sex' and 'Age' columns. Its 'Age' column is
        updated in place.
    reference_df : pandas.DataFrame, optional
        Frame the medians are computed from. Defaults to ``df`` itself, which
        is the original behaviour. Pass the training frame when imputing the
        test frame so that no test-set statistics are used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Notes
    -----
    When a group has no observed age at all (its median is NaN), or the group
    is absent from ``reference_df``, the overall median age of the reference
    frame is used instead, so such rows are no longer left unfilled.
    """
    group_cols = ['Pclass', 'Sex']
    source = df if reference_df is None else reference_df

    # Median age per (Pclass, Sex) group, looked up for every row of `df`.
    group_medians = source.groupby(group_cols)['Age'].median().rename('_group_median')
    row_medians = df[group_cols].join(group_medians, on=group_cols)['_group_median']

    # Fall back to the overall median where the group median is unavailable.
    row_medians = row_medians.fillna(source['Age'].median())

    # Only touch the rows whose age is actually missing.
    missing = df['Age'].isna()
    df.loc[missing, 'Age'] = row_medians[missing].to_numpy()
    return df

# Impute ages in both frames, rebinding each name to the frame the helper returns.
train_df, test_df = fill_missing_age(train_df), fill_missing_age(test_df)

# Report how many ages are still missing in each frame.
train_age_gaps = train_df['Age'].isna().sum()
test_age_gaps = test_df['Age'].isna().sum()
print("Missing values in 'Age' (Train):", train_age_gaps)
print("Missing values in 'Age' (Test):", test_age_gaps)


In [None]:
# Step 1: Fill missing values in 'Embarked' with the mode
# The mode is computed from the training data and reused for the test data so
# both frames are imputed with the same value.
embarked_mode = train_df['Embarked'].mode()[0]

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not update the DataFrame under Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)
test_df['Embarked'] = test_df['Embarked'].fillna(embarked_mode)

# Step 2: Verify the changes
print("Missing values in 'Embarked' (Train):", train_df['Embarked'].isnull().sum())
print("Missing values in 'Embarked' (Test):", test_df['Embarked'].isnull().sum())


In [None]:
# Step 1: Fill missing value in 'Fare' with the median
# The median is taken from the training data so that the test set is imputed
# with training statistics only, not its own.
fare_median = train_df['Fare'].median()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not update the DataFrame under Copy-on-Write.
test_df['Fare'] = test_df['Fare'].fillna(fare_median)

# Step 2: Verify the changes
print("Missing values in 'Fare' (Test):", test_df['Fare'].isnull().sum())


In [None]:
# Binary-encode 'Sex' using one shared lookup table for both frames.
sex_codes = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_codes)
test_df['Sex'] = test_df['Sex'].map(sex_codes)

# One-hot encode 'Embarked' into 'Embarked_<value>' indicator columns.
train_df = pd.get_dummies(train_df, columns=['Embarked'], prefix='Embarked')
test_df = pd.get_dummies(test_df, columns=['Embarked'], prefix='Embarked')

# Align the test frame to the training columns (minus the target): reindex adds
# any column the test frame lacks filled with 0, drops columns the training
# frame does not have, and applies the training column order.
feature_columns = train_df.columns.drop('Survived')
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

# Show the first rows of each encoded frame.
print("Encoded Train Dataset:")
display(train_df.head())

print("Encoded Test Dataset:")
display(test_df.head())


In [None]:
# Step 1: Extract titles from 'Name'
# The pattern captures the text between the first comma and the following
# period, so multi-word titles such as "the Countess" are captured whole.
title_pattern = r',\s*([^\.]+)\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

# Step 2: Group rare titles together
# "the Countess" is listed explicitly: that is the string the pattern above
# yields, and without this key the title matched neither this mapping nor
# `title_order` and was encoded as NaN.
title_mapping = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady": "Royalty", "Countess": "Royalty", "the Countess": "Royalty",
    "Sir": "Royalty", "Jonkheer": "Royalty", "Don": "Royalty", "Dona": "Royalty",
    "Capt": "Officer", "Col": "Officer", "Major": "Officer", "Dr": "Officer", "Rev": "Officer"
}
train_df['Title'] = train_df['Title'].replace(title_mapping)
test_df['Title'] = test_df['Title'].replace(title_mapping)

# Step 3: Encode titles as numeric
# Any title missing from `title_order` becomes NaN after `.map`.
title_order = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Royalty": 5, "Officer": 6}
train_df['Title'] = train_df['Title'].map(title_order)
test_df['Title'] = test_df['Title'].map(title_order)

# Step 4: Drop 'Name' column
train_df.drop(columns=['Name'], inplace=True)
test_df.drop(columns=['Name'], inplace=True)

# Verify changes
print("Updated Train Dataset:")
display(train_df.head())

print("Updated Test Dataset:")
display(test_df.head())


In [None]:
# Derive the family features on both frames in one pass:
#   FamilySize = SibSp + Parch + 1 (the passenger themselves)
#   IsAlone    = 1 when FamilySize is exactly 1, otherwise 0
for frame in (train_df, test_df):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)
    frame['IsAlone'] = frame['FamilySize'].eq(1).astype(int)

# Show the first rows of each frame with the new columns.
print("Updated Train Dataset with FamilySize and IsAlone:")
display(train_df.head())

print("Updated Test Dataset with FamilySize and IsAlone:")
display(test_df.head())


In [None]:
from sklearn.model_selection import train_test_split

# Build the feature matrix / target and hold out a validation set.
# NOTE(review): no earlier visible cell defines X_train / X_val / y_train /
# y_val, so they are created here. The dropped columns mirror the ones removed
# from the test frame before prediction; the 80/20 stratified split and the
# seed are assumptions - adjust them if a different split was intended.
X = train_df.drop(columns=['Survived', 'PassengerId', 'Ticket'])
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check for missing values in X_train and X_val
missing_in_train = X_train.isnull().sum()
print("Missing Values in Training Features:")
print(missing_in_train[missing_in_train > 0])

missing_in_val = X_val.isnull().sum()
print("\nMissing Values in Validation Features:")
print(missing_in_val[missing_in_val > 0])

# Fill missing values with the *training* medians in both splits, so the
# validation set is imputed exactly as unseen data would be. The result is
# assigned back rather than filled in place because the splits are slices of
# the original frame.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)

# Recheck for missing values after filling
print("\nMissing Values in Training Features (Post-Fill):")
print(X_train.isnull().sum().sum())

print("\nMissing Values in Validation Features (Post-Fill):")
print(X_val.isnull().sum().sum())


In [None]:
# These names were used below without being imported in any visible cell,
# which raised NameError on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Train Logistic Regression model
# max_iter is raised above the library default to give the solver more
# iterations to converge.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Step 2: Evaluate the model on validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print("Model Accuracy on Validation Set:", accuracy)
print("\nClassification Report:\n", classification_report(y_val, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))


In [None]:
# Bring in the random forest estimator.
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree forest; class_weight='balanced' reweights the classes by their
# frequency in y_train. `fit` returns the estimator, so it is bound directly.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
).fit(X_train, y_train)

# Score the forest on the validation split.
y_pred_rf = rf_model.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_report = classification_report(y_val, y_pred_rf)
rf_confusion = confusion_matrix(y_val, y_pred_rf)

print("Random Forest Model Accuracy on Validation Set:", rf_accuracy)
print("\nClassification Report:\n", rf_report)
print("\nConfusion Matrix:\n", rf_confusion)


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Random Forest
# 'auto' is intentionally absent from max_features: it was removed in
# scikit-learn 1.3 (every candidate using it fails to fit) and, for
# classifiers, it was equivalent to 'sqrt', so the search space is unchanged.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Initialize Random Forest with class weights
# NOTE: this rebinds `rf_model`, replacing the fitted baseline forest.
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Perform Grid Search with cross-validation
# 3-fold CV on the training split only; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model performance
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

# Evaluate on validation set
y_pred_tuned = best_model.predict(X_val)
accuracy_tuned = accuracy_score(y_val, y_pred_tuned)

print("Tuned Model Accuracy on Validation Set:", accuracy_tuned)
print("\nClassification Report:\n", classification_report(y_val, y_pred_tuned))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred_tuned))


In [None]:
# Step 1: Prepare the test dataset for prediction
# 'Survived' is dropped defensively (errors='ignore') so the cell can be re-run
# even if a prediction column was added to test_df earlier in the session.
X_test = test_df.drop(columns=['PassengerId', 'Ticket', 'Survived'], errors='ignore')

# Use exactly the columns, in the same order, that the model was trained on.
X_test = X_test[X_train.columns]

# Step 2: Predict survival on the test dataset using the tuned model
# Step 3: Prepare submission file
# The predictions go into a separate frame instead of being written into
# test_df, which keeps test_df a pure feature table and this cell idempotent.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': best_model.predict(X_test),
})
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully!")
