# Module import

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Check the train data

In [2]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
test = pd.read_csv("/kaggle/input/titanic/test.csv")
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Convert ticket numbers to numeric values

In [4]:
# 関数を定義して、空白で分割し、数値のみの要素に置き換える
def extract_numbers(cell):
    # 空白で分割
    parts = cell.split()
    # 数値のみを抽出
    numbers = [part for part in parts if part.isdigit()]
    # 数値のみがあればそれを結合して返す、なければ空文字を返す
    return ' '.join(numbers) if numbers else ''

# 特定の列を処理
train['Ticket'] = train['Ticket'].apply(extract_numbers)
train['Ticket'] = pd.to_numeric(train['Ticket'], errors='coerce').astype('Int64')
test['Ticket'] = test['Ticket'].apply(extract_numbers)
test['Ticket'] = pd.to_numeric(test['Ticket'], errors='coerce').astype('Int64')

train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


# Extract honorifics from a name

In [5]:
import re

# List of titles
honorifics = ["Mr", "Mrs", "Miss", "Master"]

# Create a column for each title
for honorific in honorifics:
    # Check if the title is present using regular expression
    train[honorific] = train['Name'].apply(lambda name: bool(re.search(rf"\b{honorific}\b", name)))
    test[honorific] = test['Name'].apply(lambda name: bool(re.search(rf"\b{honorific}\b", name)))

print(train)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

# Fill the NaN data

In [6]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           4
Fare             0
Cabin          687
Embarked         2
Mr               0
Mrs              0
Miss             0
Master           0
dtype: int64

In [7]:
# 欠損値をすべて最頻値で埋める
train["Age"] = train["Age"].fillna(train["Age"].mode()[0])
train["Cabin"] = train["Cabin"].fillna(train["Cabin"].mode()[0])
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])

test["Age"] = test["Age"].fillna(test["Age"].mode()[0])
test["Fare"] = test["Fare"].fillna(test["Fare"].mode()[0])
test["Cabin"] = test["Cabin"].fillna(test["Cabin"].mode()[0])

# Dummy variables

In [8]:
train = pd.get_dummies(train, columns=['Sex','Embarked'])
test = pd.get_dummies(test, columns=['Sex','Embarked'])
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Mr,Mrs,Miss,Master,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,21171,7.25,B96 B98,True,False,False,False,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,17599,71.2833,C85,False,True,False,False,True,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,3101282,7.925,B96 B98,False,False,True,False,True,False,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,False,True,False,False,True,False,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,B96 B98,True,False,False,False,False,True,False,False,True


In [9]:
import lightgbm as lgb

# LightGBMの分類器を作成
clf = lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# トレーニングデータの作成
X_train = train[["Pclass", "Age", "SibSp", "Parch", "Ticket", "Fare", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S"]]
y_train = train["Survived"]

# テストデータの作成
X_test = test[["Pclass", "Age", "SibSp", "Parch", "Ticket", "Fare", "Sex_female", "Sex_male", "Embarked_C", "Embarked_Q", "Embarked_S"]]

# モデルのトレーニング
clf.fit(X_train, y_train)

# 予測結果をy_testに格納
y_test = clf.predict(X_test)

# 予測結果の確認
print(y_test)

[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002548 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 481
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288
[0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0
 0 0 1 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 0 1
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0

# Output the answer

In [10]:
# PassengerId を取得
PassengerId = np.array(test["PassengerId"]).astype(int)
 
# y_test と PassengerId を結合
answer = pd.DataFrame(y_test, PassengerId, columns = ["Survived"])
 
# titanic_answer_01.csv としてエクスポート
answer.to_csv("titanic_answer_01.csv", index_label = ["PassengerId"])