In [114]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [115]:
#教師データの作成
#テーブルデータの取得
df=pd.read_csv('.././data/preprocessed_data/df1.csv',index_col=0)
#ラベルの取得
label_df=df['dengue']
df=df.drop('dengue',axis=1)

In [116]:
#正規化
numerical_columns =df.columns.to_list()
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
data_preprocessed = preprocessor.fit_transform(df)

In [117]:
# 統合データフレーム
features_df = pd.DataFrame(data_preprocessed)
features_df.columns = features_df.columns.astype(str)   #カラム名をstr型に

In [118]:
features_df

Unnamed: 0,0,1,2
0,-0.422272,-1.461331,-0.496247
1,1.567248,-0.027529,-0.496247
2,0.812010,-1.461331,2.015125
3,0.351605,-0.027529,-0.496247
4,-1.176823,-0.027529,-0.496247
...,...,...,...
4995,1.383145,-1.461331,-0.496247
4996,-1.303332,-0.027529,-0.496247
4997,-0.583517,1.406273,-0.496247
4998,1.101202,-0.027529,-0.496247


In [119]:
#クラスタリング
kmeans = KMeans(n_clusters=2, random_state=0).fit(features_df)
features_df['kmeans_dist'] = kmeans.transform(features_df).min(axis=1)
features_df['kmeans_cluster'] = kmeans.labels_

dbscan = DBSCAN(eps=0.5, min_samples=2).fit(features_df)
features_df['dbscan_cluster'] = dbscan.labels_

In [120]:
#次元削減
pca = PCA(n_components=2, random_state=0)
pca_features = pca.fit_transform(features_df)
features_df['pca_1'] = pca_features[:, 0]
features_df['pca_2'] = pca_features[:, 1]

In [121]:
#異常検知
isolation_forest = IsolationForest(contamination=0.1, random_state=0)
features_df['anomaly_score'] = isolation_forest.fit_predict(features_df)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#自己符号化器
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 10),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

input_dim = features_df.shape[1]
autoencoder = Autoencoder(input_dim).to(device) 
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

# データをテンソルに変換し、GPUに転送
data_tensor = torch.tensor(features_df.values, dtype=torch.float32).to(device)

# 訓練
num_epochs = 500
for epoch in range(num_epochs):
    output = autoencoder(data_tensor)
    loss = criterion(output, data_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# エンコード層の出力を取得
with torch.no_grad():
    encoded_features = autoencoder.encoder(data_tensor).cpu().numpy()  # 出力をCPUに戻してnumpy配列に変換

# DataFrameにエンコードされた特徴量を追加
encoded_df = pd.DataFrame(encoded_features, columns=[f'autoencoder_{i}' for i in range(encoded_features.shape[1])])
features_df = pd.concat([features_df, encoded_df], axis=1)

Epoch [20/500], Loss: 1.1508
Epoch [40/500], Loss: 0.9683
Epoch [60/500], Loss: 0.9145
Epoch [80/500], Loss: 0.8881
Epoch [100/500], Loss: 0.8772
Epoch [120/500], Loss: 0.8712
Epoch [140/500], Loss: 0.8668
Epoch [160/500], Loss: 0.8642
Epoch [180/500], Loss: 0.8622
Epoch [200/500], Loss: 0.8604
Epoch [220/500], Loss: 0.8584
Epoch [240/500], Loss: 0.8566
Epoch [260/500], Loss: 0.8551
Epoch [280/500], Loss: 0.8540
Epoch [300/500], Loss: 0.8532
Epoch [320/500], Loss: 0.8527
Epoch [340/500], Loss: 0.8523
Epoch [360/500], Loss: 0.8521
Epoch [380/500], Loss: 0.8519
Epoch [400/500], Loss: 0.8517
Epoch [420/500], Loss: 0.8516
Epoch [440/500], Loss: 0.8515
Epoch [460/500], Loss: 0.8514
Epoch [480/500], Loss: 0.8513
Epoch [500/500], Loss: 0.8512


In [123]:
#距離・類似度ベース
cos_sim_matrix = cosine_similarity(features_df)
nearest_distances = cosine_similarity(features_df).mean(axis=1)
features_df['nearest_cosine_similarity'] = nearest_distances

In [124]:
features_df.head()

Unnamed: 0,0,1,2,kmeans_dist,kmeans_cluster,dbscan_cluster,pca_1,pca_2,anomaly_score,autoencoder_0,autoencoder_1,autoencoder_2,autoencoder_3,autoencoder_4,autoencoder_5,autoencoder_6,autoencoder_7,autoencoder_8,autoencoder_9,nearest_cosine_similarity
0,-0.422272,-1.461331,-0.496247,1.517383,1,0,-2.240731,-0.852707,1,6.461226,6.834682,0.0,0.0,0.0,0.0,2.335813,0.0,2.484711,2.440178,0.605654
1,1.567248,-0.027529,-0.496247,1.574743,1,1,-0.887794,0.472632,1,1.769042,4.704268,0.0,1.619279,0.0,0.0,0.0,0.534693,5.124841,6.663355,0.706813
2,0.81201,-1.461331,2.015125,1.664053,0,2,0.624591,-2.424086,-1,2.763214,2.181794,0.0,0.0,5.023939,0.0,4.176839,0.0,10.944354,0.0,0.473169
3,0.351605,-0.027529,-0.496247,0.359811,1,1,-0.916906,0.346757,1,2.661433,3.024153,0.0,0.279477,0.0,0.0,1.05192,0.511317,1.009563,6.334764,0.715041
4,-1.176823,-0.027529,-0.496247,1.169821,1,1,-0.936163,0.164864,1,4.634793,2.749388,0.0,0.0,0.0,0.0,3.866509,3.133092,1.199722,5.059844,0.731147
