In [166]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [167]:
#教師データの作成
#テーブルデータの取得
df=pd.read_csv('.././data/preprocessed_data/df1.csv',index_col=0)
#ラベルの取得
label_df=df['dengue']
df=df.drop('dengue',axis=1)

In [168]:
#正規化
numerical_columns =df.columns.to_list()
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
data_preprocessed = preprocessor.fit_transform(df)

In [169]:
# 統合データフレーム
features_df = pd.DataFrame(data_preprocessed)
features_df.columns = features_df.columns.astype(str)   #カラム名をstr型に

In [170]:
features_df

Unnamed: 0,0,1,2
0,-0.422272,-1.461331,-0.496247
1,1.567248,-0.027529,-0.496247
2,0.812010,-1.461331,2.015125
3,0.351605,-0.027529,-0.496247
4,-1.176823,-0.027529,-0.496247
...,...,...,...
4995,1.383145,-1.461331,-0.496247
4996,-1.303332,-0.027529,-0.496247
4997,-0.583517,1.406273,-0.496247
4998,1.101202,-0.027529,-0.496247


In [171]:
#クラスタリング
kmeans = KMeans(n_clusters=2, random_state=0).fit(features_df)
features_df['kmeans_dist'] = kmeans.transform(features_df).min(axis=1)
features_df['kmeans_cluster'] = kmeans.labels_

dbscan = DBSCAN(eps=0.5, min_samples=2).fit(features_df)
features_df['dbscan_cluster'] = dbscan.labels_

In [172]:
#次元削減
pca = PCA(n_components=2, random_state=0)
pca_features = pca.fit_transform(features_df)
features_df['pca_1'] = pca_features[:, 0]
features_df['pca_2'] = pca_features[:, 1]

In [173]:
#異常検知
isolation_forest = IsolationForest(contamination=0.1, random_state=0)
features_df['anomaly_score'] = isolation_forest.fit_predict(features_df)

In [174]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#自己符号化器
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 10),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(10, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

input_dim = features_df.shape[1]
autoencoder = Autoencoder(input_dim).to(device) 
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.01)

# データをテンソルに変換し、GPUに転送
data_tensor = torch.tensor(features_df.values, dtype=torch.float32).to(device)

# 訓練
num_epochs = 500
for epoch in range(num_epochs):
    output = autoencoder(data_tensor)
    loss = criterion(output, data_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# エンコード層の出力を取得
with torch.no_grad():
    encoded_features = autoencoder.encoder(data_tensor).cpu().numpy()  # 出力をCPUに戻してnumpy配列に変換

# DataFrameにエンコードされた特徴量を追加
encoded_df = pd.DataFrame(encoded_features, columns=[f'autoencoder_{i}' for i in range(encoded_features.shape[1])])
features_df = pd.concat([features_df, encoded_df], axis=1)

Epoch [20/500], Loss: 1.0812
Epoch [40/500], Loss: 0.9618
Epoch [60/500], Loss: 0.9017
Epoch [80/500], Loss: 0.8810
Epoch [100/500], Loss: 0.8746
Epoch [120/500], Loss: 0.8707
Epoch [140/500], Loss: 0.8678
Epoch [160/500], Loss: 0.8660
Epoch [180/500], Loss: 0.8648
Epoch [200/500], Loss: 0.8641
Epoch [220/500], Loss: 0.8636
Epoch [240/500], Loss: 0.8629
Epoch [260/500], Loss: 0.8608
Epoch [280/500], Loss: 0.8575
Epoch [300/500], Loss: 0.8556
Epoch [320/500], Loss: 0.8548
Epoch [340/500], Loss: 0.8543
Epoch [360/500], Loss: 0.8541
Epoch [380/500], Loss: 0.8539
Epoch [400/500], Loss: 0.8538
Epoch [420/500], Loss: 0.8537
Epoch [440/500], Loss: 0.8536
Epoch [460/500], Loss: 0.8535
Epoch [480/500], Loss: 0.8534
Epoch [500/500], Loss: 0.8533


In [175]:
#距離・類似度ベース
cos_sim_matrix = cosine_similarity(features_df)
nearest_distances = cosine_similarity(features_df).mean(axis=1)
features_df['nearest_cosine_similarity'] = nearest_distances

In [176]:
features_df.head()

Unnamed: 0,0,1,2,kmeans_dist,kmeans_cluster,dbscan_cluster,pca_1,pca_2,anomaly_score,autoencoder_0,autoencoder_1,autoencoder_2,autoencoder_3,autoencoder_4,autoencoder_5,autoencoder_6,autoencoder_7,autoencoder_8,autoencoder_9,nearest_cosine_similarity
0,-0.422272,-1.461331,-0.496247,1.517383,1,0,-2.240731,-0.852707,1,7.080877,1.671074,1.185319,5.906095,4.547328,0.0,2.119775,3.707088,1.856361,0.0,0.629762
1,1.567248,-0.027529,-0.496247,1.574743,1,1,-0.887794,0.472632,1,1.889979,0.0,2.67327,4.149865,1.346533,0.0,3.481316,4.186854,5.978469,1.927516,0.695363
2,0.81201,-1.461331,2.015125,1.664053,0,2,0.624591,-2.424086,-1,3.101645,8.757604,0.0,0.888755,3.108828,2.355237,9.631032,0.0,0.384984,2.762076,0.417387
3,0.351605,-0.027529,-0.496247,0.359811,1,1,-0.916906,0.346757,1,2.379273,0.0,3.071661,3.052297,2.600427,0.0,0.768008,3.391227,3.424747,0.345565,0.718419
4,-1.176823,-0.027529,-0.496247,1.169821,1,1,-0.936163,0.164864,1,4.632235,0.280644,3.961126,2.341447,3.819757,2.371269,0.0,3.871446,1.679346,1.061355,0.716612


In [177]:
learnin_data=pd.concat([features_df,label_df],axis=1)
learnin_data.head()

Unnamed: 0,0,1,2,kmeans_dist,kmeans_cluster,dbscan_cluster,pca_1,pca_2,anomaly_score,autoencoder_0,...,autoencoder_2,autoencoder_3,autoencoder_4,autoencoder_5,autoencoder_6,autoencoder_7,autoencoder_8,autoencoder_9,nearest_cosine_similarity,dengue
0,-0.422272,-1.461331,-0.496247,1.517383,1,0,-2.240731,-0.852707,1,7.080877,...,1.185319,5.906095,4.547328,0.0,2.119775,3.707088,1.856361,0.0,0.629762,0
1,1.567248,-0.027529,-0.496247,1.574743,1,1,-0.887794,0.472632,1,1.889979,...,2.67327,4.149865,1.346533,0.0,3.481316,4.186854,5.978469,1.927516,0.695363,0
2,0.81201,-1.461331,2.015125,1.664053,0,2,0.624591,-2.424086,-1,3.101645,...,0.0,0.888755,3.108828,2.355237,9.631032,0.0,0.384984,2.762076,0.417387,0
3,0.351605,-0.027529,-0.496247,0.359811,1,1,-0.916906,0.346757,1,2.379273,...,3.071661,3.052297,2.600427,0.0,0.768008,3.391227,3.424747,0.345565,0.718419,1
4,-1.176823,-0.027529,-0.496247,1.169821,1,1,-0.936163,0.164864,1,4.632235,...,3.961126,2.341447,3.819757,2.371269,0.0,3.871446,1.679346,1.061355,0.716612,1


In [178]:
learnin_data.to_csv('.././data/leatning_data.csv')