<a href="https://colab.research.google.com/github/Kou-python/MyUniProjects/blob/master/Titanic_compe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ライブラリ、マウント関連

In [None]:
# If you are unsure what to do after running this cell, see the separately distributed PDFs:
# "How to mount Drive in Google Colab" and the explanation of the folder structure on Google Drive.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
# Suppress every warning. NOTE(review): this also hides pandas deprecation /
# FutureWarning messages raised by the preprocessing cells below.
warnings.filterwarnings('ignore')

In [None]:
# Path of the directory that holds the data to load; change it as needed.
# If you are unsure how to load the data, see the separately distributed PDFs:
# "How to mount Drive in Google Colab" and the explanation of the folder structure on Google Drive.
path = "/content/drive/My Drive/Colab Notebooks/GCI_2023_Winter/Competitions/competition_1/"

# 前処理

## 欠損値処理

In [None]:
# Load the raw train / test CSV files.
df = pd.read_csv(path + 'data/train.csv')
df_test = pd.read_csv(path + 'data/test.csv')

# Stack train and test vertically so each fill value is the median over both splits.
age = pd.concat([df['Age'], df_test['Age']])
fare = pd.concat([df['Fare'], df_test['Fare']])
age_median = age.median()
fare_median = fare.median()

# Age: assign the filled column back instead of calling fillna(inplace=True) on
# df['Age']; the chained inplace form stops updating the frame under pandas
# Copy-on-Write.
df['Age'] = df['Age'].fillna(age_median)
df_test['Age'] = df_test['Age'].fillna(age_median)

# Fare: same treatment.
df['Fare'] = df['Fare'].fillna(fare_median)
df_test['Fare'] = df_test['Fare'].fillna(fare_median)

# Report the remaining missing values per column.
print(df.isnull().sum())
print(df_test.isnull().sum())

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


## Age rfc
後回し（特徴量作成後）

In [None]:
# Disabled experiment (postponed until after feature engineering): predict Age from the other columns.
# NOTE(review): Age looks continuous, so a classifier may not be the intended estimator — confirm before re-enabling.
# # .iloc[row, column]; X: explanatory variables, y: target variable
# # Input variables (every column except Age)
# X = df.drop('Age', axis=1).values
# # Target variable (values of the Age column)
# y = df["Age"].values

# # Split into training and validation data
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

# rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=3, n_estimators=100, n_jobs=-1, random_state=42)
# rfc.fit(X_train, y_train)
# print('Train Score: {}'.format(round(rfc.score(X_train, y_train), 3)))
# print(' Test Score: {}'.format(round(rfc.score(X_valid, y_valid), 3)))

# 前処理＆特徴量作成

In [None]:
# Reload the raw data so this cell always starts from the original CSV files.
df = pd.read_csv(path + 'data/train.csv')
df_test = pd.read_csv(path + 'data/test.csv')

# Stack train and test vertically so each fill value is the median over both splits.
age = pd.concat([df['Age'], df_test['Age']])
fare = pd.concat([df['Fare'], df_test['Fare']])
age_median = age.median()
fare_median = fare.median()

# Age: assign the filled column back instead of calling fillna(inplace=True) on
# df['Age']; the chained inplace form stops updating the frame under pandas
# Copy-on-Write.
df['Age'] = df['Age'].fillna(age_median)
df_test['Age'] = df_test['Age'].fillna(age_median)

# Fare: same treatment.
df['Fare'] = df['Fare'].fillna(fare_median)
df_test['Fare'] = df_test['Fare'].fillna(fare_median)

# Report the remaining missing values per column.
print(df.isnull().sum())
print(df_test.isnull().sum())

# Embarked (one-hot encoding). Missing ports are filled with "S"; the filled
# column is assigned back because chained fillna(inplace=True) stops updating
# the frame under pandas Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna("S")
df_test['Embarked'] = df_test['Embarked'].fillna("S")

# Encode train and test together so both frames receive the same dummy columns.
embarked = pd.concat([df['Embarked'], df_test['Embarked']])
embarked_ohe = pd.get_dummies(embarked)

# Split the stacked dummies back at the actual train length rather than a
# hard-coded row count, so the cell keeps working if the training file changes.
n_train = len(df)
embarked_ohe_train = embarked_ohe.iloc[:n_train]
embarked_ohe_test = embarked_ohe.iloc[n_train:]

# Append the dummy columns and remove the original categorical column.
df = pd.concat([df, embarked_ohe_train], axis=1)
df_test = pd.concat([df_test, embarked_ohe_test], axis=1)
df.drop('Embarked', axis=1, inplace=True)
df_test.drop('Embarked', axis=1, inplace=True)

# Apply the same column encodings to the training frame and the test frame.
for frame in (df, df_test):
    # Sex: male -> 0, female -> 1.
    frame.replace({'Sex': {'male': 0, 'female': 1}}, inplace=True)

    # Cabin: mark missing cabins with the placeholder "N".
    frame["Cabin"] = frame["Cabin"].replace(np.nan, "N")

    # Name: 1 when the name contains "Mrs", otherwise 0.
    frame["Name"] = np.where(frame["Name"].str.contains("Mrs"), 1, 0)

    # Ticket: removed from the frame.
    frame.drop('Ticket', axis=1, inplace=True)

PassengerId      0
Perished         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64


In [None]:
# Cabin initial -> integer code: A=0, B=1, C=2, D=3, N=4, F=5, E=6, G=7, T=8.
# "N" is the placeholder the preprocessing cell writes for a missing cabin.
cabins_dict = {initial: code for code, initial in enumerate("ABCDNFEGT")}

for frame in (df, df_test):
    # Keep only the first character of each cabin value, then swap it for its code.
    initials = frame['Cabin'].map(lambda cabin: str(cabin)[0])
    frame["Cabin"] = initials.replace(cabins_dict)

In [None]:
# Preview the first 100 rows of the preprocessed training frame.
df.head(100)

Unnamed: 0,PassengerId,Perished,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,C,Q,S
0,1,1,3,0,0,22.0,1,0,7.2500,4,0,0,1
1,2,0,1,1,1,38.0,1,0,71.2833,2,1,0,0
2,3,0,3,0,1,26.0,0,0,7.9250,4,0,0,1
3,4,0,1,1,1,35.0,1,0,53.1000,2,0,0,1
4,5,1,3,0,0,35.0,0,0,8.0500,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,1,3,0,0,28.0,0,0,8.0500,4,0,0,1
96,97,1,1,0,0,71.0,0,0,34.6542,0,1,0,0
97,98,0,1,0,0,23.0,0,1,63.3583,3,1,0,0
98,99,0,2,1,1,34.0,0,1,23.0000,4,0,0,1


In [None]:
# Work on copies so the preprocessed frames are left untouched.
df_fe = df.copy()
df_fe_test = df_test.copy()

# Select features by name instead of by column position: every column except
# the identifier and the target. Reusing the same list for the test frame
# guarantees both matrices have identical columns in identical order, and a
# missing column raises a KeyError instead of silently misaligning features.
feature_cols = [col for col in df_fe.columns if col not in ('PassengerId', 'Perished')]

X_fe = df_fe[feature_cols].values
y_fe = df_fe['Perished'].values

X_fe_test = df_fe_test[feature_cols].values

# Hold out 30% of the training rows for validation (fixed seed for reproducibility).
X_fe_train, X_fe_valid, y_fe_train, y_fe_valid = train_test_split(X_fe, y_fe, test_size=0.3, random_state=42)

rfc_fe = RandomForestClassifier(max_depth=9, min_samples_leaf=3, n_estimators=100, n_jobs=-1, random_state=42)
rfc_fe.fit(X_fe_train, y_fe_train)

# Accuracy on the fitted split and on the held-out validation split.
print('Train Score: {}'.format(round(rfc_fe.score(X_fe_train, y_fe_train), 3)))
print(' Test Score: {}'.format(round(rfc_fe.score(X_fe_valid, y_fe_valid), 3)))

Train Score: 0.888
 Test Score: 0.802


In [None]:
# Disabled hyperparameter search, kept for reference: 5-fold grid search over
# max_depth (3-10) and min_samples_leaf (1-4) on the full training features.
# The saved output of this cell reports max_depth=9, min_samples_leaf=3 as best.
# param_grid = {'max_depth': range(3,11),
#               'min_samples_leaf': range(1,5)}

# rfc_gs = GridSearchCV(RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42), param_grid, cv=5)
# rfc_gs.fit(X_fe, y_fe)

# print('Best Parameters: {}'.format(rfc_gs.best_params_))
# print('CV Score: {}'.format(round(rfc_gs.best_score_, 3)))

Best Parameters: {'max_depth': 9, 'min_samples_leaf': 3}
CV Score: 0.829


In [None]:
# Load the submission file; its Perished column is overwritten with the model's predictions.
submission = pd.read_csv(path + 'gender_submission.csv')

# Predict class labels directly from the fitted classifier. The previous code
# called rfc_fe.argmax(axis=1), which only worked when rfc_fe had been rebound
# to a predict_proba array by a now commented-out statement; on a fresh kernel
# rfc_fe is the RandomForestClassifier itself and has no argmax().
pred = rfc_fe.predict(X_fe_test)
submission["Perished"] = pred

In [None]:
# When working in Google Drive / Google Colaboratory:
# write the submission to a CSV file without the index column.
submission.to_csv('submission.csv',index=False)

# Hand the written file to Colab's download helper.
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>