In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the Titanic train/test CSVs. The test PassengerId column is kept aside
# because clean() drops it and the submission file needs it again.
test = pd.read_csv("titanic/test.csv")
data = pd.read_csv("titanic/train.csv")
test_ids = test.loc[:, "PassengerId"]

# Preview the first rows of the training frame.
data.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
# Define the clean function with feature engineering
def _cabin_letter_code(cabin):
    """Encode one cabin value as text.

    Returns ``'0'`` for the missing-value placeholder ``'0'`` (or an empty
    value); otherwise the 1-based alphabet position of the first character,
    upper-cased, as a string (``'A...'`` -> ``'1'``, ``'C85'`` -> ``'3'``).
    """
    if not cabin or cabin == '0':
        return '0'
    return str(ord(cabin[0].upper()) - ord('A') + 1)


def clean(data):
    """Return a cleaned, feature-engineered copy of a passenger DataFrame.

    Steps (each column-specific step is skipped when the column is absent):

    * drop ``Ticket``, ``Name`` and ``PassengerId``;
    * fill missing ``SibSp``, ``Parch``, ``Fare`` and ``Age`` values with the
      mean of that column *in the frame passed in*;
    * fill missing ``Embarked`` values with ``"U"``;
    * replace ``Cabin`` with a string code: ``"0"`` when missing, otherwise the
      1-based alphabet position of the cabin's first character;
    * add ``FamilySize = SibSp + Parch + 1`` and ``IsAlone`` (1 when
      ``FamilySize == 1``, else 0).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. Must contain ``SibSp`` and ``Parch``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is never modified.

    Raises
    ------
    KeyError
        If ``SibSp`` or ``Parch`` is missing (needed for ``FamilySize``).
    """
    # Work on an independent copy so the caller's frame is never touched, even
    # when none of the columns below exist and nothing is actually dropped.
    data = data.drop(
        columns=["Ticket", "Name", "PassengerId"], errors="ignore"
    ).copy()

    # Fill missing values in specified columns with the mean.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that chained form does not update the frame under
    # pandas Copy-on-Write and is deprecated in pandas 2.x.
    for col in ("SibSp", "Parch", "Fare", "Age"):
        if col in data.columns:
            data[col] = data[col].fillna(data[col].mean())

    # Fill missing values in 'Embarked' with 'U' for unknown
    if 'Embarked' in data.columns:
        data['Embarked'] = data['Embarked'].fillna("U")

    # Clean the 'Cabin' data. The codes stay strings, as before, so a later
    # pd.get_dummies call still treats the column as categorical.
    if 'Cabin' in data.columns:
        data['Cabin'] = data['Cabin'].fillna('0').apply(_cabin_letter_code)

    # Feature Engineering: Create new features
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

    return data

In [4]:
# Run the same cleaning / feature-engineering step on the training frame
# and then on the test frame.
cleaned_data, cleaned_test = map(clean, (data, test))

In [5]:
# One-hot encode the remaining non-numeric columns, dropping the first level
# of each. The two frames are encoded independently, so their resulting
# column sets are not guaranteed to match.
cleaned_data, cleaned_test = (
    pd.get_dummies(frame, drop_first=True)
    for frame in (cleaned_data, cleaned_test)
)

In [6]:
# Give the test frame an all-zero column for every training column it lacks
# (that set also contains 'Survived' whenever the test frame has no such column).
missing_cols = set(cleaned_data.columns).difference(set(cleaned_test.columns))
cleaned_test = cleaned_test.assign(**dict.fromkeys(missing_cols, 0))

In [7]:
# Split the training frame into the target vector and the feature matrix
y = cleaned_data["Survived"]
X = cleaned_data.drop(columns="Survived")

# Remove any 'Survived' column from the test frame, then select its columns
# in exactly the order of the training features
cleaned_test = cleaned_test.drop(columns="Survived", errors="ignore")
cleaned_test = cleaned_test[X.columns]

In [8]:
# Standardize the features: fit the scaler on the training matrix only, then
# apply that same fitted transform to both the training and the test features.
# Note that X and cleaned_test are rebound to the scaler's output here.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
cleaned_test = scaler.transform(cleaned_test)

In [9]:
# Tune a random forest with an exhaustive grid search:
# 3 * 4 * 3 * 3 = 108 parameter combinations, each scored by accuracy
# with 5-fold cross-validation, using all available cores.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters found: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.8305818843763731


In [10]:
# Predict on the scaled test features with the best estimator found by the
# grid search
best_rf = grid_search.best_estimator_
test_predictions = best_rf.predict(cleaned_test)

# Build the two-column submission frame: the passenger ids saved at load time
# alongside the predicted labels
submission = pd.DataFrame(dict(PassengerId=test_ids, Survived=test_predictions))

# Write the submission to CSV without the DataFrame index
submission.to_csv("submission-better.csv", index=False)
