# Titanic-Test

In [1]:
%pwd

'D:\\OneDrive - Kyushu University\\Course01_DataScienceForBa\\KaggleTitanicCode'

In [2]:
cd ..

D:\OneDrive - Kyushu University\Course01_DataScienceForBa


## Import Packages

In [3]:
import numpy as np
import os
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

## Load Data

In [4]:
# Read the training set; the path is relative to the current working directory.
all_df = pd.read_csv("Data/train.csv")
# Preview the first 20 rows.
all_df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [None]:
# Inspect the (rows, columns) dimensions of the loaded frame.
all_df.shape

In [None]:
# List the available column names before selecting features.
all_df.columns

## Preprocess

In [None]:
# Keep the target plus the raw feature columns used below.
# .copy() makes this an independent frame, so the column assignments in the
# following cells modify it directly instead of writing into a slice of the
# loaded frame (which raises pandas' SettingWithCopyWarning).
all_df = all_df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked']].copy()

In [None]:
# Preview the frame after the column selection.
all_df.head()

In [None]:
# Number of space-separated entries in Cabin (spaces + 1); stays NaN when Cabin is missing.
all_df['Cabin_Count'] = all_df['Cabin'].str.count(' ').add(1)
# First character of the cabin string (mapped to a number in a later cell).
all_df['Cabin_level'] = all_df['Cabin'].str.get(0)

In [None]:
def extract_first_number(cabin_string):
    """Return the first run of digits in ``cabin_string`` as an int.

    Returns None when the value is missing (NaN/None) or contains no digits.
    """
    if pd.isna(cabin_string):
        return None
    first_digits = re.search(r'\d+', cabin_string)
    if first_digits is None:
        return None
    return int(first_digits.group())

# Element-wise: first cabin number per passenger (missing when there is none).
all_df['First_Number'] = all_df['Cabin'].map(extract_first_number)

In [None]:
# Frequency of each cabin-level letter (missing values are excluded by value_counts).
all_df['Cabin_level'].value_counts()

In [None]:
# Binary-encode sex; values outside the dict are left unchanged by replace.
all_df['Sex'] = all_df['Sex'].replace(to_replace={'male': 0, 'female': 1})

# Encode the cabin-level letter as an integer code; letters outside the dict become NaN.
cabin_mapping = {
    'A': 1, 'B': 2, 'C': 3, 'D': 4,
    'E': 5, 'F': 6, 'G': 7, 'T': 8,
}
all_df['Cabin_level'] = all_df['Cabin_level'].map(cabin_mapping)

In [None]:
# Integer codes for the embarkation port letters.
Embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
# Values not present in the mapping (including missing ones) become NaN.
all_df['Embarked'] = all_df['Embarked'].map(Embarked_mapping)

In [None]:
# Preview the frame after encoding the categorical columns.
all_df.head()

In [None]:
# List the columns now available, including the engineered cabin features.
all_df.columns

In [None]:
# Final modelling columns: the target plus numeric features (raw 'Cabin' text is dropped).
all_df = all_df.loc[:, [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'Cabin_Count', 'Cabin_level', 'First_Number',
]]

In [None]:
# Replace every remaining missing value with 0 so the estimators receive no NaNs.
all_df = all_df.fillna(value=0)

In [None]:
# Preview the fully numeric, NaN-free modelling frame.
all_df.head()

In [None]:
# Target vector and feature matrix (everything except the target column).
y = all_df['Survived']
X = all_df.drop(columns=['Survived'])

## Split X and y

In [None]:
# Hold out 10% of the rows for a quick baseline check (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=326,
)

In [None]:
# Baseline random forest with 100 trees and a fixed seed.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=326,
)
rf_classifier.fit(X_train, y_train)

In [None]:
# Score the baseline forest on the held-out rows.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

## 10 fold CV

In [None]:
# Hyper-parameter grid for the random forest: 4 * 5 * 5 * 2 * 5 * 3 = 3000 candidates.
param_grid = {
    'n_estimators': list(range(25, 101, 25)),    # 25, 50, 75, 100 trees
    'max_features': list(range(6, 11)),          # 6..10 features considered per split
    'max_depth': list(range(10, 19, 2)),         # 10, 12, ..., 18 levels
    'criterion': ['gini', 'entropy'],            # split-quality measure
    'min_samples_split': list(range(2, 11, 2)),  # 2, 4, ..., 10
    'max_samples': [0.7, 0.8, 0.9],              # values passed as RandomForestClassifier(max_samples=...)
}


In [None]:

# Base estimator whose hyper-parameters are tuned by the grid search.
rf_classifier = RandomForestClassifier(random_state=42)
# Ten stratified, shuffled folds - the fold count this section's "10 fold CV"
# heading states; the fixed seed keeps the splits repeatable.
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=326)
# Exhaustive search over param_grid, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=stratified_kfold,
                           scoring='accuracy', verbose=1, n_jobs=-1)


In [None]:
# Run the search on the full X, y.
# NOTE(review): the hold-out split created later is drawn from these same rows,
# so its accuracy is measured on data that influenced hyper-parameter selection.
grid_search.fit(X, y)

In [None]:
# Report the winning configuration and its mean cross-validated accuracy.
for label, value in (
    ("Best parameters:", grid_search.best_params_),
    ("Best cross-validation score (best accuracy):", grid_search.best_score_),
):
    print(label, value)

In [None]:
# Ten best candidates by mean test score, best first.
(
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(10)
)

In [None]:
# Full search results as a frame, then the ten rows with the highest mean test score.
results = pd.DataFrame(grid_search.cv_results_)
top_models = results.nlargest(n=10, columns='mean_test_score')

In [None]:
# Build one random forest per top-ranked candidate for the stacking ensemble.
# Each forest is configured from the exact `params` dict GridSearchCV recorded
# for that candidate, so the list stays in sync with param_grid and no value
# has to be read back out of the per-parameter `param_*` columns.
# The cv_results_ row label is used both in the estimator name and as its seed.
base_models = [
    (
        'rf_{}'.format(candidate_idx),
        RandomForestClassifier(**candidate_params, random_state=candidate_idx),
    )
    for candidate_idx, candidate_params in top_models['params'].items()
]


In [None]:
# Meta-learner that combines the base forests' outputs.
final_estimator = LogisticRegression()

# Stacked ensemble of the tuned forests; cv=10 is handed to StackingClassifier.
stacking_classifier = StackingClassifier(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=10,
)

In [None]:
# Fresh 80/20 hold-out split (replaces the earlier 90/10 variables), then fit the ensemble.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
stacking_classifier.fit(X_train, y_train)

In [None]:
# Evaluate the stacked model on the held-out split.
y_pred = stacking_classifier.predict(X_test)

# accuracy_score is already imported in the notebook's import cell, so it is
# not re-imported here.
accuracy = accuracy_score(y_test, y_pred)
print("Stacking model accuracy:", accuracy)


# Build a fresh ensemble and fit it on every labelled row; this is the model
# used for the final predictions.
final_estimator = LogisticRegression()
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=final_estimator, cv=10)
stacking_classifier.fit(X, y)

## Prediction

In [None]:
# Load the unlabeled test set. Prefer a local copy next to Data/train.csv and
# fall back to the Kaggle-hosted path, so the notebook runs in either environment.
# Only `test_df` is bound here, which leaves the training frame `all_df` intact.
test_path = "Data/test.csv" if os.path.exists("Data/test.csv") else "/kaggle/input/titanic/test.csv"
test_df = pd.read_csv(test_path)

In [None]:
# Keep the passenger ids for the submission file. .copy() makes Results an
# independent frame, so adding the prediction column later does not write into
# a slice of test_df (which raises pandas' SettingWithCopyWarning).
Results = test_df[['PassengerId']].copy()

In [None]:
# Keep the raw feature columns (same set as for training, minus the target).
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not write into a slice of the loaded test frame.
test_df = test_df[['Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare', 'Cabin', 'Embarked']].copy()

In [None]:
# Preview the test frame after the column selection.
test_df.head()

In [None]:
# Number of space-separated entries in Cabin (spaces + 1); stays NaN when Cabin is missing.
test_df['Cabin_Count'] = test_df['Cabin'].str.count(' ').add(1)
# First character of the cabin string (mapped to a number in a later cell).
test_df['Cabin_level'] = test_df['Cabin'].str.get(0)

In [None]:
# Reuse extract_first_number from the training preprocessing section instead of
# re-defining an identical copy here, so train and test share one implementation.
test_df['First_Number'] = test_df['Cabin'].apply(extract_first_number)

In [None]:
# Binary-encode sex; values outside the dict are left unchanged by replace.
test_df['Sex'] = test_df['Sex'].replace(to_replace={'male': 0, 'female': 1})

# Encode the cabin-level letter as an integer code; letters outside the dict become NaN.
cabin_mapping = {
    'A': 1, 'B': 2, 'C': 3, 'D': 4,
    'E': 5, 'F': 6, 'G': 7, 'T': 8,
}
test_df['Cabin_level'] = test_df['Cabin_level'].map(cabin_mapping)

In [None]:
# Integer codes for the embarkation port letters (same codes as used for training).
Embarked_mapping = {'S': 1, 'C': 2, 'Q': 3}
# Values not present in the mapping (including missing ones) become NaN.
test_df['Embarked'] = test_df['Embarked'].map(Embarked_mapping)

In [None]:
# Preview the test frame after encoding the categorical columns.
test_df.head()

In [None]:
# List the columns now available, including the engineered cabin features.
test_df.columns

In [None]:
# Final feature columns, in the same order as the training feature matrix X.
test_df = test_df.loc[:, [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'Cabin_Count', 'Cabin_level', 'First_Number',
]]

In [None]:
# Replace every remaining missing value with 0, matching the training preprocessing.
test_df = test_df.fillna(value=0)

In [None]:
# Predict with the stacking model that was refit on the full training data;
# test_df's columns follow the same order as the training features X.
pred_y = stacking_classifier.predict(test_df)

In [None]:
# Attach the predicted labels next to PassengerId for the submission file.
Results['Survived'] = pred_y

In [None]:
# Display the assembled submission frame for a final visual check.
Results

In [None]:
# Write the submission file to the current working directory, without the index column.
Results.to_csv('submission.csv', index=False)