In [1]:
import pandas as pd

In [2]:
# データの読み込み
train_df = pd.read_csv('train.csv')
train_original_df = pd.read_csv('train.csv')
# データの基本情報
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
print(train_df.info)

<bound method DataFrame.info of      PassengerId HomePlanet CryoSleep     Cabin    Destination   Age    VIP  \
0        0001_01     Europa     False     B/0/P    TRAPPIST-1e  39.0  False   
1        0002_01      Earth     False     F/0/S    TRAPPIST-1e  24.0  False   
2        0003_01     Europa     False     A/0/S    TRAPPIST-1e  58.0   True   
3        0003_02     Europa     False     A/0/S    TRAPPIST-1e  33.0  False   
4        0004_01      Earth     False     F/1/S    TRAPPIST-1e  16.0  False   
...          ...        ...       ...       ...            ...   ...    ...   
8688     9276_01     Europa     False    A/98/P    55 Cancri e  41.0   True   
8689     9278_01      Earth      True  G/1499/S  PSO J318.5-22  18.0  False   
8690     9279_01      Earth     False  G/1500/S    TRAPPIST-1e  26.0  False   
8691     9280_01     Europa     False   E/608/S    55 Cancri e  32.0  False   
8692     9280_02     Europa     False   E/608/S    TRAPPIST-1e  44.0  False   

      RoomService  

In [4]:
# 主な特徴量の意味
# PassengerId : 乗客ID（グループIDと個別IDを含む）
# HomePlanet : 出発した惑星（例：Earth, Marsなど）
# CryoSleep : 冷凍睡眠中だったか（True/False）
# Cabin : キャビン番号（デッキ/番号/側）
# Destination : 目的地（例：TRAPPIST-1eなど）
# Age : 年齢
# VIP : VIPかどうか（True/False）
# Transported : 輸送されたかどうか（目的変数）

In [5]:
# 欠損値の合計確認
missing_values =train_df.isnull().sum()
print('欠損値一覧')
print(missing_values[missing_values > 0])

欠損値一覧
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
dtype: int64


In [6]:
# カテゴリー別のユニーク値をざっくり見る
cat_cols =['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
for col in cat_cols:
  print(f"\n🧠 {col} のユニーク値")
  print(train_df[col].value_counts(dropna=False))


🧠 HomePlanet のユニーク値
HomePlanet
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: count, dtype: int64

🧠 CryoSleep のユニーク値
CryoSleep
False    5439
True     3037
NaN       217
Name: count, dtype: int64

🧠 Cabin のユニーク値
Cabin
NaN         199
G/734/S       8
G/1368/P      7
G/109/P       7
C/21/P        7
           ... 
G/545/S       1
G/543/S       1
B/106/P       1
G/542/S       1
F/702/P       1
Name: count, Length: 6561, dtype: int64

🧠 Destination のユニーク値
Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: count, dtype: int64

🧠 VIP のユニーク値
VIP
False    8291
NaN       203
True      199
Name: count, dtype: int64


In [7]:
# Cabin を分解
train_df[['Deck', 'CabinNum', 'Side']] = train_df['Cabin'].str.split('/', expand=True)
train_df[['Deck', 'CabinNum', 'Side']] = train_df[['Deck', 'CabinNum', 'Side']].fillna('Unknown')

In [8]:
# カテゴリー・ブール系は最頻値(mode)で補完
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
  train_df[col].fillna(train_df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mode()[0], inplace=True)
  train_df[col].fillna(train_df[col].mode()[0], inplace=True)


In [9]:
# Age は中央値で補完
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)


In [10]:
# Cabinは不要
train_df.drop(columns='Cabin', inplace=True)

In [11]:
#name も不要
train_df.drop(columns='Name', inplace=True)

In [12]:
# 利用額がNaNの人は「0円」として補完
cols_money = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_df[cols_money] = train_df[cols_money].fillna(0)

In [13]:
train_df.isnull().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,0
CryoSleep,0
Destination,0
Age,0
VIP,0
RoomService,0
FoodCourt,0
ShoppingMall,0
Spa,0


In [14]:
# True/False を 1/0 に変換
train_df['CryoSleep'] = train_df['CryoSleep'].astype(int)
train_df['VIP'] = train_df['VIP'].astype(int)

In [15]:
# OneHotエンコーディング（drop_first=Trueでダミー変数落とす）
cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']
train_df = pd.get_dummies(train_df, columns=cat_cols, drop_first=True)

In [16]:
train_df.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Side_S,Side_Unknown
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,...,True,False,False,False,False,False,False,False,False,False
1,0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,...,False,False,False,False,True,False,False,False,True,False
2,0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,...,False,False,False,False,False,False,False,False,True,False
3,0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,...,False,False,False,False,False,False,False,False,True,False
4,0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,True,...,False,False,False,False,True,False,False,False,True,False


In [17]:
# 読み込み
test_df = pd.read_csv('test.csv')

# test_dfの基本情報
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [18]:
# Cabin を分解
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df[['Deck', 'CabinNum', 'Side']] = test_df[['Deck', 'CabinNum', 'Side']].fillna('Unknown')

In [19]:
# カテゴリ系補完
for col in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']:
  test_df[col].fillna(train_original_df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(train_original_df[col].mode()[0], inplace=True)
  test_df[col].fillna(train_original_df[col].mode()[0], inplace=True)


In [20]:
test_df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Deck', 'CabinNum', 'Side'],
      dtype='object')

In [21]:
# Ageは中央値
test_df['Age'].fillna(train_original_df['Age'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(train_original_df['Age'].median(), inplace=True)


In [22]:
# 利用額系は0で補完
cols_money = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_df[cols_money] = test_df[cols_money].fillna(0)

In [23]:
# 不要列削除
test_df.drop(columns=['Cabin', 'Name'], inplace=True)

In [24]:
# True/False を 1/0
test_df['CryoSleep'] = test_df['CryoSleep'].astype(int)
test_df['VIP'] = test_df['VIP'].astype(int)

In [25]:
# OneHotエンコーディング（trainと同じカテゴリに合わせる！）
test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True)

In [26]:
# trainにあるカラムにtestを合わせる（不足があったら補完）
missing_cols = set(train_df.columns) - set(test_df.columns)
missing_cols.discard('Transported')
for col in missing_cols:
  test_df[col] = 0 #存在しない列は0で補完

In [27]:
# trainに存在しない列を削除（念のため）
test_df =test_df[train_df.drop(columns='Transported').columns]

In [28]:
# CabinNum を数値に変換してみよう
train_df['CabinNum'] = pd.to_numeric(train_df['CabinNum'], errors='coerce').fillna(-1)
test_df['CabinNum'] = pd.to_numeric(test_df['CabinNum'], errors='coerce').fillna(-1)

In [29]:
# PassengerId を数値に変換してみよう
train_df['PassengerId'] = pd.to_numeric(train_df['PassengerId'], errors='coerce').fillna(-1)
test_df['PassengerId'] = pd.to_numeric(test_df['PassengerId'], errors='coerce').fillna(-1)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [31]:
# 目的変数
Y = train_df['Transported'].astype(int)

# 説明変数(Transported 以外)
X = train_df.drop(columns='Transported')

In [32]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)

In [34]:
train_df.dtypes

Unnamed: 0,0
PassengerId,float64
CryoSleep,int64
Age,float64
VIP,int64
RoomService,float64
FoodCourt,float64
ShoppingMall,float64
Spa,float64
VRDeck,float64
Transported,bool


In [33]:
# モデル学習
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# 精度評価
Y_pred = model.predict(X_valid)
accuracy = accuracy_score(Y_valid, Y_pred)
print(f'Accuracy: {accuracy}')
print("📊 Classification Report:\n", classification_report(Y_valid, Y_pred))

Accuracy: 0.7809085681426107
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.74      0.77       861
           1       0.76      0.83      0.79       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



In [40]:
# 予測
test_pred = model.predict(test_df)

# sample_submission の読み込み
sub_df = pd.read_csv('sample_submission.csv')

# 予測結果を代入（True/False の bool 型に戻す）
sub_df['Transported'] = test_pred.astype(bool)

# ファイル出力
sub_df.to_csv('submission.csv', index=False)