In [15]:
import numpy as np
import pandas as pd
from itertools import combinations

In [16]:
# Read the source CSV into `df`; the cells below clean and binarize this frame.
df = pd.read_csv('../data/train_and_test2.csv')

In [17]:
# Preview the first 10 rows as loaded.
df.head(10)

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.25,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.925,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.05,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
5,6,28.0,8.4583,0,0,0,0,0,0,0,...,0,0,0,3,0,0,1.0,0,0,0
6,7,54.0,51.8625,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,0
7,8,2.0,21.075,0,3,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
8,9,27.0,11.1333,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
9,10,14.0,30.0708,1,1,0,0,0,0,0,...,0,0,0,2,0,0,0.0,0,0,1


In [18]:
# Drop the passenger identifier column (silently skipped if it is absent).
df = df.drop(columns="Passengerid", errors="ignore")

# Keep only the columns that contain at least one value different from 0,
# i.e. discard columns whose values are all 0.
has_nonzero_value = df.ne(0).any(axis=0)
df = df.loc[:, has_nonzero_value]


In [19]:
# Preview the first 10 rows after the column clean-up.
df.head(10)

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked,2urvived
0,22.0,7.25,0,1,0,3,2.0,0
1,38.0,71.2833,1,1,0,1,0.0,1
2,26.0,7.925,1,0,0,3,2.0,1
3,35.0,53.1,1,1,0,1,2.0,1
4,35.0,8.05,0,0,0,3,2.0,0
5,28.0,8.4583,0,0,0,3,1.0,0
6,54.0,51.8625,0,0,0,1,2.0,0
7,2.0,21.075,0,3,1,3,2.0,0
8,27.0,11.1333,1,0,2,3,2.0,1
9,14.0,30.0708,1,1,0,2,0.0,1


In [20]:
# Select the columns that are not already binary.
df_numeric = df.loc[
    :,
    ['Age', 'Fare', 'sibsp', 'Parch', 'Pclass', 'Embarked'],
]

# Mean of each selected column.
column_means = df_numeric.mean()

# Print the heading as plain text, then show the means with the notebook's rich display.
print("各数値列の平均値：")
display(column_means)



各数値列の平均値：


Age         29.503186
Fare        33.281086
sibsp        0.498854
Parch        0.385027
Pclass       2.294882
Embarked     1.492731
dtype: float64

In [21]:
# Reuse the column names that were selected for df_numeric.
numeric_cols = df_numeric.columns

# Overwrite each of those columns of `df` with a 0/1 flag: 1 when the value is at or
# above the column mean, 0 otherwise. A missing value compares as False, so it becomes 0.
for col in numeric_cols:
    mean_val = df[col].mean()
    at_or_above_mean = df[col].ge(mean_val)
    df[col] = at_or_above_mean.astype(int)



In [22]:
# Preview the first rows after binarization.
df.head()

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked,2urvived
0,0,0,0,1,0,1,1,0
1,1,1,1,1,0,0,0,1
2,0,0,1,0,0,1,1,1
3,1,1,1,1,0,0,1,1
4,1,0,0,0,0,1,1,0


In [23]:
# Drop the survival label ("2urvived" in this dataset's header) to get the feature-only frame.
# errors='ignore' means a missing column is skipped without raising.
train_df = df.drop(columns=["2urvived"], errors='ignore')


In [24]:
# Persist the labelled frame and the feature-only frame as CSV files.
# With the default index=True, the row index is written as an unnamed first column.
df.to_csv('row_data.csv')
train_df.to_csv('train_data.csv')

In [25]:
def find_2_principal_points_sampled(df: pd.DataFrame, sample_size=200, seed=42) -> pd.DataFrame:
    """
    Exhaustively search a random subsample for the best pair of principal points.

    A subsample of at most ``sample_size`` rows is drawn from ``df``. Every pair of
    sampled rows is scored by the sum, over all sampled rows, of the Hamming distance
    to the nearer of the two candidates. The pair with the lowest score wins; ties go
    to the first pair in ``itertools.combinations`` order.

    The subsample is drawn with ``random_state=seed``, so the result is reproducible
    and the global NumPy random state is left untouched.

    Parameters:
        df (pd.DataFrame): rows to choose from (intended for 0/1 features).
        sample_size (int): maximum number of rows to subsample before the search.
        seed (int): random state used for the subsample.

    Returns:
        pd.DataFrame: the two selected rows, with the index reset to 0 and 1.

    Raises:
        ValueError: if fewer than two rows are available to form a pair.
    """
    n_rows = min(sample_size, len(df))
    if n_rows < 2:
        raise ValueError(
            f"need at least 2 rows to pick a pair of principal points, got {n_rows}"
        )

    df_sampled = df.sample(n=n_rows, random_state=seed).reset_index(drop=True)
    X = df_sampled.to_numpy()
    N = len(X)

    # dist[a, b] = Hamming distance between sampled rows a and b. Computing it once
    # avoids recomputing every row-to-candidate distance for every candidate pair.
    dist = np.stack([(X != row).sum(axis=1) for row in X])

    min_cost = float('inf')
    best_pair = (0, 1)
    for i in range(N - 1):
        # Row k holds the cost of the pair (i, i + 1 + k): each sampled row
        # contributes its distance to the nearer of the two candidates.
        pair_costs = np.minimum(dist[i], dist[i + 1:]).sum(axis=1)
        k = int(np.argmin(pair_costs))  # first minimum, so the earliest j wins ties
        if pair_costs[k] < min_cost:    # strict, so the earliest i wins ties
            min_cost = pair_costs[k]
            best_pair = (i, i + 1 + k)

    return df_sampled.iloc[list(best_pair)].reset_index(drop=True)


In [26]:
# Compute the principal points with the sampled exhaustive search
principal_df = find_2_principal_points_sampled(train_df)

# Display the result and save it (without the index column)
display(principal_df)
principal_df.to_csv("principal_data.csv", index=False)


Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked
0,0,0,0,0,0,1,1
1,1,1,1,0,0,0,0


In [27]:
def find_2_principal_points_greedy(df: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    """
    Pick two principal points from 0/1 binary data with a greedy heuristic.

    The first point is a row chosen at random. The second point is the row that
    minimizes the total cost when paired with the first, where each row contributes
    its Hamming distance to the nearer of the two points. Ties go to the lowest row
    position.

    The random draw comes from a local ``RandomState(seed)``. This yields the same
    first row as seeding the global generator, but leaves the global NumPy random
    state untouched.

    Parameters:
        df (pd.DataFrame): DataFrame of 0/1 features.
        seed (int): seed for the random choice of the first point.

    Returns:
        pd.DataFrame: the two selected rows (random first point, then its best
        partner), with the index reset to 0 and 1.

    Raises:
        ValueError: if ``df`` has fewer than two rows.
    """
    X = df.to_numpy()
    N = len(X)
    if N < 2:
        raise ValueError(
            f"need at least 2 rows to pick a pair of principal points, got {N}"
        )

    # Step 1: choose the first point at random.
    idx1 = int(np.random.RandomState(seed).randint(0, N))

    # Distance from every row to the first point; it does not depend on the
    # candidate, so compute it once instead of once per (candidate, row).
    dist_to_first = (X != X[idx1]).sum(axis=1)

    # Step 2: choose the partner that minimizes the total cost.
    min_total_cost = float('inf')
    idx2 = None
    for j in range(N):
        if j == idx1:
            continue
        dist_to_candidate = (X != X[j]).sum(axis=1)
        total_cost = np.minimum(dist_to_first, dist_to_candidate).sum()
        if total_cost < min_total_cost:  # strict, so the earliest candidate wins ties
            min_total_cost = total_cost
            idx2 = j

    return df.iloc[[idx1, idx2]].reset_index(drop=True)


In [28]:
# Compute the principal points with the greedy search
# (this rebinds principal_df, replacing the sampled-search result in memory)
principal_df = find_2_principal_points_greedy(train_df)

# Display the result and save it (without the index column)
display(principal_df)
principal_df.to_csv("principal_greedy_data.csv", index=False)

Unnamed: 0,Age,Fare,Sex,sibsp,Parch,Pclass,Embarked
0,0,0,0,0,0,1,1
1,1,1,1,1,0,0,1


In [29]:
# Evaluation of the principal points stored in principal_data.csv

# Full table including the survival label (last column); columns whose name
# starts with "Unnamed" are discarded on load.
df_all = pd.read_csv("row_data.csv").loc[
    :, lambda frame: ~frame.columns.str.contains("^Unnamed")
]

# Principal points, with columns put in the same order as the feature columns of df_all.
df_principal = pd.read_csv("principal_data.csv")[df_all.columns[:-1]]

pp1, pp2 = (df_principal.iloc[k].to_numpy() for k in range(2))


def assign_to_group(x, pp1, pp2):
    """Return 0 when x differs from pp1 in no more positions than from pp2, else 1."""
    mismatches_pp1 = np.sum(x != pp1)
    mismatches_pp2 = np.sum(x != pp2)
    if mismatches_pp1 <= mismatches_pp2:
        return 0
    return 1


features = df_all.iloc[:, :-1]       # feature columns only
survived = df_all.iloc[:, -1]        # survival label column
X = features.to_numpy()
assigned = np.array([assign_to_group(row, pp1, pp2) for row in X])


# 4. Pair each row's assigned group with its survival label
df_compare = pd.DataFrame({"AssignedGroup": assigned, "Survived": survived})


# 5. Survival rate and row count per assigned group
group_summary = (
    df_compare.groupby("AssignedGroup")["Survived"]
    .agg(["mean", "count"])
    .rename(columns={"mean": "SurvivalRate", "count": "Count"})
    .reset_index()
)

# Show the result
print("=== 代表点による分類と生存率 ===")
print(group_summary)

# (Optional) CSV export
group_summary.to_csv("principal_group_survival_summary.csv", index=False)


=== 代表点による分類と生存率 ===
   AssignedGroup  SurvivalRate  Count
0              0      0.193089    984
1              1      0.467692    325


In [30]:
# Evaluation of the principal points stored in principal_greedy_data.csv

# Full table including the survival label (last column); columns whose name
# starts with "Unnamed" are discarded on load.
df_all = pd.read_csv("row_data.csv")
df_all = df_all.loc[:, ~df_all.columns.str.contains("^Unnamed")]

# Principal points, with columns put in the same order as the feature columns of df_all.
df_principal = pd.read_csv("principal_greedy_data.csv")
df_principal = df_principal.loc[:, df_all.columns[:-1]]

pp1 = df_principal.iloc[0].values
pp2 = df_principal.iloc[1].values

features = df_all.iloc[:, :-1]       # feature columns only
survived = df_all.iloc[:, -1]        # survival label column
X = features.values

# Assign every row to its nearer principal point in one broadcast comparison.
# This applies the rule `0 if d1 <= d2 else 1` to all rows at once, so no
# per-row helper is (re)defined here and ties still go to group 0.
d1 = (X != pp1).sum(axis=1)          # Hamming distance of each row to pp1
d2 = (X != pp2).sum(axis=1)          # Hamming distance of each row to pp2
assigned = np.where(d1 <= d2, 0, 1)


# 4. Pair each row's assigned group with its survival label
df_compare = pd.DataFrame({
    "AssignedGroup": assigned,
    "Survived": survived
})


# 5. Survival rate and row count per assigned group
group_summary = (
    df_compare.groupby("AssignedGroup")["Survived"]
    .agg(["mean", "count"])
    .rename(columns={"mean": "SurvivalRate", "count": "Count"})
    .reset_index()
)

# Show the result
print("=== 代表点による分類と生存率 ===")
print(group_summary)

# (Optional) CSV export
group_summary.to_csv("principal_greedy_group_survival_summary.csv", index=False)


=== 代表点による分類と生存率 ===
   AssignedGroup  SurvivalRate  Count
0              0      0.186512    949
1              1      0.458333    360
