In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Load the training and test tables from CSV (train first, then test).
train_data_senkei, test_data_senkei = (
    pd.read_csv(csv_path)
    for csv_path in ('selected_train_set.csv', 'selected_test_set.csv')
)

# Target vector: the 'Survived' column of each table.
y_train_senkei = train_data_senkei['Survived']
y_test_senkei = test_data_senkei['Survived']

# Feature matrix: every column except the identifier and the target.
X_train_senkei = train_data_senkei.drop(columns=['PassengerId', 'Survived'])
X_test_senkei = test_data_senkei.drop(columns=['PassengerId', 'Survived'])

# Standardize the features. The scaling statistics are learned from the
# training set only and then applied unchanged to the test set, so no test
# information leaks into the preprocessing.
scaler_senkei = StandardScaler().fit(X_train_senkei)
X_train_senkei = scaler_senkei.transform(X_train_senkei)
X_test_senkei = scaler_senkei.transform(X_test_senkei)

# Tune the regularization strength C of a linear-kernel SVM with 10-fold
# cross-validation, selecting the candidate with the best mean accuracy.
# probability=True is required so the fitted model exposes predict_proba.
# NOTE(review): the features were standardized on the full training set before
# this search, so each CV fold sees scaling statistics that include its own
# validation rows; wrapping scaler + SVC in a Pipeline would avoid that.
param_grid_linear_senkei = dict(C=[0.01, 0.1, 1, 10, 100])
grid_search_linear_senkei = GridSearchCV(
    estimator=SVC(kernel='linear', probability=True, random_state=42),
    param_grid=param_grid_linear_senkei,
    scoring='accuracy',
    cv=10,
)
grid_search_linear_senkei.fit(X_train_senkei, y_train_senkei)

# The estimator refit with the best C found by the search.
best_model_senkei = grid_search_linear_senkei.best_estimator_

# Evaluate the linear-kernel SVMs on the held-out test set.
#
# The training data, hyperparameters and random_state are all fixed, so
# re-fitting and re-scoring in a loop yields the same number on every pass.
# Each model is therefore fitted/evaluated exactly once; the 10-entry accuracy
# list is kept only so that its name, length and mean stay as before.
N_EVAL_RUNS_SENKEI = 10

# Tuned model (best C from the grid search): this is the model whose
# predictions and accuracy the following cells report.
y_pred_senkei = best_model_senkei.predict(X_test_senkei)
accuracy_senkei = accuracy_score(y_test_senkei, y_pred_senkei)
linear_accuracies_senkei = [accuracy_senkei] * N_EVAL_RUNS_SENKEI

# Class-weighted variant (class_weight='balanced' compensates for class
# imbalance). It is scored with its OWN predictions so that the comparison
# against the tuned model is meaningful.
svm_linear_senkei = SVC(kernel='linear', probability=True, random_state=42, class_weight='balanced')
svm_linear_senkei.fit(X_train_senkei, y_train_senkei)
y_pred_balanced_senkei = svm_linear_senkei.predict(X_test_senkei)
accuracy_balanced_senkei = accuracy_score(y_test_senkei, y_pred_balanced_senkei)

# Report the mean accuracy and the per-class metrics of the tuned model,
# followed by the accuracy of the class-weighted variant.
print("線形カーネルSVMの10回の平均精度:", np.mean(linear_accuracies_senkei))
print("線形カーネルSVMの詳細な評価結果:\n", classification_report(y_test_senkei, y_pred_senkei))
print("class_weight='balanced' の線形カーネルSVMのテスト精度:", accuracy_balanced_senkei)


In [None]:
import matplotlib.pyplot as plt
# Feature importance: for a linear-kernel SVM the magnitude of each learned
# coefficient can be read as the importance of the corresponding feature.
importances_senkei = best_model_senkei.coef_[0]

# Pair each coefficient magnitude with its column name (taken from the raw
# training table with the identifier and target removed, i.e. the same columns
# that were fed to the scaler) and sort from most to least important.
feature_importance_df_senkei = pd.DataFrame({
    'Feature': train_data_senkei.drop(columns=['PassengerId', 'Survived']).columns,
    'Importance': np.abs(importances_senkei),  # sign is irrelevant for ranking
}).sort_values(by='Importance', ascending=False)

# Keep the 13 most important features for the chart.
top_13_importance_df_senkei = feature_importance_df_senkei.head(13)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 8))
plt.barh(top_13_importance_df_senkei['Feature'], top_13_importance_df_senkei['Importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Top 13 Feature Importances (SVM Linear Kernel)')
plt.gca().invert_yaxis()
plt.show()

# Test-set accuracy of the tuned model. The predictions computed here are the
# ones that get scored and reported, so this cell does not depend on whatever
# value other cells last left in y_pred_senkei / accuracy_senkei.
y_pred_linear_senkei = best_model_senkei.predict(X_test_senkei)
accuracy_linear_senkei = accuracy_score(y_test_senkei, y_pred_linear_senkei)
print("線形カーネルSVMのテスト精度:", accuracy_linear_senkei)
print("線形カーネルSVMの詳細な評価結果:\n", classification_report(y_test_senkei, y_pred_linear_senkei))

In [None]:
from sklearn.metrics import roc_curve, auc,confusion_matrix

# Predicted probability of the second class (column 1 of predict_proba) for
# every test row. Assumes the labels are 0/1 so that column 1 is the positive
# class -- TODO confirm against best_model_senkei.classes_.
y_scores_senkei = best_model_senkei.predict_proba(X_test_senkei)[:, 1]

# Compute the ROC points with scikit-learn instead of a hand-written threshold
# loop. drop_intermediate=False keeps one point per distinct score, and
# roc_curve additionally prepends a threshold above every score so the curve
# includes the "nothing predicted positive" endpoint (specificity 1,
# sensitivity 0). Using roc_curve also means this cell no longer overwrites
# the model predictions stored in y_pred_senkei.
fpr_desc_senkei, tpr_desc_senkei, thresholds_desc_senkei = roc_curve(
    y_test_senkei, y_scores_senkei, drop_intermediate=False
)

# roc_curve returns thresholds in decreasing order; reverse everything so the
# stored sequences run from the lowest threshold to the highest, i.e. with
# specificity increasing from 0 to 1.
thresholds_senkei = thresholds_desc_senkei[::-1]
tpr_list_senkei = tpr_desc_senkei[::-1].tolist()                    # sensitivity (TPR)
specificity_list_senkei = (1.0 - fpr_desc_senkei[::-1]).tolist()    # specificity = 1 - FPR

# Area under the curve. Integrating sensitivity over specificity gives the same
# area as integrating it over FPR, because the two x-axes are mirror images.
roc_auc_senkei = auc(np.array(specificity_list_senkei), tpr_list_senkei)

# Draw the ROC curve with specificity on a reversed x-axis (1 -> 0).
plt.figure(figsize=(8, 6))
plt.plot(np.array(specificity_list_senkei), tpr_list_senkei, label=f"ROC Curve (AUC = {roc_auc_senkei:.2f})", color="blue")

plt.plot([1, 0], [0, 1], color="red", linestyle="--")  # chance-level diagonal
plt.xlim([1.0, 0.0])  # x-axis runs from 1 down to 0
plt.xlabel("Specificity ")
plt.ylabel("Sensitivity")
plt.title("ROC Curve (Specificity vs Sensitivity)")
plt.legend(loc="lower left")
plt.grid()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Hard class predictions of the tuned model on the test set.
y_pred_senkei = best_model_senkei.predict(X_test_senkei)

# Confusion matrix. The label order is pinned to [0, 1] so the result is always
# 2x2 (even if one class is missing from both y_test and the predictions) and
# its rows/columns line up with the "Negative"/"Positive" display labels.
# Assumes the target is encoded as 0/1 -- TODO confirm.
cm = confusion_matrix(y_test_senkei, y_pred_senkei, labels=[0, 1])

# Plot the confusion matrix.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# Derive sensitivity and specificity from the matrix cells. A metric whose
# denominator is zero (no positives / no negatives in the test set) is
# undefined and reported as NaN instead of triggering a divide-by-zero warning.
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float("nan")  # true positive rate
specificity = tn / (tn + fp) if (tn + fp) > 0 else float("nan")  # true negative rate

print(f"Sensitivity: {sensitivity:.2f}")
print(f"Specificity: {specificity:.2f}")