In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# データの読み込み
train_df = pd.read_csv('/home/haruki/kaggle/mnist/data/train.csv')

# 特徴量とラベルの分割
X = train_df.drop(columns=['label']).values  # 画像データ
y = train_df['label'].values  # ラベル

# データの前処理（スケーリング）
X = X / 255.0  # ピクセルの値を0~1の範囲に正規化

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# GDAモデル
class GDA:
    def __init__(self, lambda_reg=1e-3, epsilon=1e-10):
        self.lambda_reg = lambda_reg  # 正則化パラメータ
        self.epsilon = epsilon  # 行列式のゼロ除算を避けるための定数
        
    def fit(self, X, y):
        classes = np.unique(y)
        n, d = X.shape
        
        # クラスごとの平均と共分散行列の計算
        self.mean = {}
        self.cov = np.zeros((d, d))
        self.prior = {}
        
        for c in classes:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.prior[c] = X_c.shape[0] / n
            self.cov += np.cov(X_c, rowvar=False) * (X_c.shape[0] / n)
        
        # 共分散行列に正則化項を追加
        self.cov += self.lambda_reg * np.identity(d)
    
    def predict(self, X):
        n, d = X.shape
        y_pred = np.zeros(n)
        
        # クラスごとの予測計算
        prob = np.zeros((len(np.unique(y_pred)), n))  # クラスごとの確率
        
        for c in self.mean:
            mean = self.mean[c]
            prior = self.prior[c]
            cov_inv = np.linalg.inv(self.cov)  # 共分散行列の逆行列
            
            diff = X - mean
            exponent = -0.5 * np.sum(diff @ cov_inv * diff, axis=1)
            
            # 行列式に対するゼロ除算対策
            cov_det = np.linalg.det(self.cov)
            if cov_det < self.epsilon:
                cov_det = self.epsilon
            
            log_prob = exponent - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(cov_det) + np.log(prior)
            
            if c == 0:
                prob = log_prob
            else:
                prob = np.vstack((prob, log_prob))
        
        # 最大の確率を持つクラスを予測として選択
        y_pred = np.argmax(prob, axis=0)
        
        return y_pred

class LDA:
    def __init__(self, lambda_reg=1e-3, epsilon=1e-10):
        self.lambda_reg = lambda_reg  # 正則化パラメータ
        self.epsilon = epsilon  # 行列式のゼロ除算を避けるための定数
        self.mean = {}
        self.prior = {}
        self.cov = None
    
    def fit(self, X, y):
        classes = np.unique(y)
        n, d = X.shape
        
        # クラスごとの平均と共分散行列の計算
        self.mean = {}
        self.prior = {}
        self.cov = np.zeros((d, d))
        
        for c in classes:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.prior[c] = X_c.shape[0] / n
            self.cov += np.cov(X_c, rowvar=False) * (X_c.shape[0] / n)
        
        # 共分散行列に正則化項を追加
        self.cov += self.lambda_reg * np.identity(d)  # 正則化
        cov_det = np.linalg.det(self.cov)
        
        # 行列式がゼロに近い場合、epsilonを加える
        if cov_det < self.epsilon:
            self.cov += self.epsilon * np.identity(d)
    
    def predict(self, X):
        n, d = X.shape
        y_pred = np.zeros(n)
        
        # クラスごとの予測計算
        prob = np.zeros((len(np.unique(y_pred)), n))  # クラスごとの確率
        
        for c in self.mean:
            mean = self.mean[c]
            prior = self.prior[c]
            cov_inv = np.linalg.inv(self.cov)  # 共分散行列の逆行列
            
            diff = X - mean
            exponent = -0.5 * np.sum(diff @ cov_inv * diff, axis=1)
            
            # クラスの対数確率
            log_prob = exponent - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(np.linalg.det(self.cov)) + np.log(prior)
            
            if c == 0:
                prob = log_prob
            else:
                prob = np.vstack((prob, log_prob))
        
        # 最大の確率を持つクラスを予測として選択
        y_pred = np.argmax(prob, axis=0)
        
        return y_pred

# GDAとLDAの訓練と評価のコード（既に実行したものと同様）
gda = GDA(lambda_reg=1e-3, epsilon=1e-10)
gda.fit(X_train_scaled, y_train)
y_pred_gda = gda.predict(X_test_scaled)
accuracy_gda = accuracy_score(y_test, y_pred_gda)

lda = LDA(lambda_reg=1e-3, epsilon=1e-10)  # 正則化パラメータとepsilonを指定
lda.fit(X_train_scaled, y_train)
y_pred_lda = lda.predict(X_test_scaled)
accuracy_lda = accuracy_score(y_test, y_pred_lda)

# 結果の表示
print(f'GDA Accuracy: {accuracy_gda * 100:.2f}%')
print(f'LDA Accuracy: {accuracy_lda * 100:.2f}%')

  log_prob = exponent - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(np.linalg.det(self.cov)) + np.log(prior)


GDA Accuracy: 86.64%
LDA Accuracy: 9.71%
