In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Complete feature engineering function
def engineer_features(df, is_train=True):
    df = df.copy()
    
    # 1. Title extraction
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Countess': 'Rare', 'Ms': 'Miss', 'Lady': 'Rare',
        'Jonkheer': 'Rare', 'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs',
        'Capt': 'Rare', 'Sir': 'Rare'
    }
    df['Title'] = df['Title'].map(title_mapping).fillna('Rare')
    
    # 2. Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # 3. Age features
    df['Age'] = df.groupby('Title')['Age'].transform(lambda x: x.fillna(x.median()))
    df['Age'].fillna(df['Age'].median(), inplace=True)
    
    # 4. Fare features
    df['Fare'] = df.groupby('Pclass')['Fare'].transform(lambda x: x.fillna(x.median()))
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    df['LogFare'] = np.log1p(df['Fare'])
    
    # 5. Cabin features
    df['HasCabin'] = df['Cabin'].notna().astype(int)
    df['Deck'] = df['Cabin'].str[0].fillna('U')
    
    # 6. Embarked
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    
    # 7. Interaction features
    df['SexClass'] = df['Sex'] + '_' + df['Pclass'].astype(str)
    df['Age*Class'] = df['Age'] * df['Pclass']
    
    # Drop unnecessary columns
    drop_cols = ['Name', 'Ticket', 'Cabin', 'PassengerId']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)
    
    return df

# Apply feature engineering
train_processed = engineer_features(train, is_train=True)
test_processed = engineer_features(test, is_train=False)

# Encode categorical variables
categorical_cols = ['Sex', 'Embarked', 'Title', 'Deck', 'SexClass']
encoders = {}

for col in categorical_cols:
    if col in train_processed.columns:
        le = LabelEncoder()
        train_processed[col] = le.fit_transform(train_processed[col].astype(str))
        if col in test_processed.columns:
            # Handle unseen categories in test set
            test_processed[col] = test_processed[col].astype(str).map(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )
        encoders[col] = le

# Prepare features
X = train_processed.drop('Survived', axis=1)
y = train_processed['Survived']
X_test = test_processed

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

### ADVANCED MODELING ###

# 1. Individual models with tuned parameters
models = {
    'lr': LogisticRegression(C=1.0, max_iter=1000, random_state=42),
    'dt': DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42),
    'rf': RandomForestClassifier(n_estimators=200, max_depth=8, min_samples_split=10, random_state=42),
    'gbm': GradientBoostingClassifier(n_estimators=200, max_depth=4, learning_rate=0.05, random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    scores = cross_val_score(model, X_scaled, y, cv=10, scoring='accuracy')
    results[name] = {
        'mean': scores.mean(),
        'std': scores.std()
    }
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

# 2. Create ensemble
ensemble = VotingClassifier([
    ('rf', models['rf']),
    ('gbm', models['gbm']),
    ('lr', models['lr'])
], voting='soft')

ensemble_scores = cross_val_score(ensemble, X_scaled, y, cv=10)
print(f"\nEnsemble: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std()*2:.4f})")

# 3. Train best model on full data
best_model = ensemble
best_model.fit(X_scaled, y)

# 4. Make predictions
predictions = best_model.predict(X_test_scaled)

# 5. Create submission
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})
submission.to_csv('titanic_submission.csv', index=False)
print("\nSubmission saved to 'titanic_submission.csv'")

# 6. Quick validation checks
print(f"\nSubmission statistics:")
print(f"Survival rate: {submission['Survived'].mean():.2%}")
print(f"Total predictions: {len(submission)}")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic/train.csv'