In [9]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
dir_path='../data/preprocessed_data'


In [13]:
def feature_engineering(data_path):
    # ディレクトリ内のデータの読み込み
    df=pd.read_csv(data_path, index_col=0)
    label_df=df['dengue']
    df =df.drop('dengue', axis=1)

    # 正規化
    numerical_columns = df.columns.to_list()
    numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
    data_preprocessed = preprocessor.fit_transform(df)

    # 統合データフレーム
    features_df = pd.DataFrame()
    features_df.columns = features_df.columns.astype(str)   #カラム名をstr型に
    features_df=pd.DataFrame()

    # クラスタリング
    kmeans = KMeans(n_clusters=2, random_state=0).fit(data_preprocessed)
    features_df['kmeans_dist'] = kmeans.transform(data_preprocessed).min(axis=1)
    features_df['kmeans_cluster'] = kmeans.labels_.astype(str)

    dbscan = DBSCAN(eps=0.5, min_samples=2).fit(data_preprocessed)
    features_df['dbscan_cluster'] = dbscan.labels_.astype(str)

    # 次元削減
    pca = PCA(n_components=4, random_state=0)
    pca_features = pca.fit_transform(data_preprocessed)
    features_df['pca_1'] = pca_features[:, 0]
    features_df['pca_2'] = pca_features[:, 1]
    features_df['pca_3'] = pca_features[:, 2]
    features_df['pca_4'] = pca_features[:, 3]

    # 異常検知
    isolation_forest = IsolationForest(contamination=0.1, random_state=0)
    features_df['anomaly_score'] = isolation_forest.fit_predict(data_preprocessed)

    # 距離・類似度ベース
    cos_sim_matrix = cosine_similarity(data_preprocessed)
    nearest_distances = cos_sim_matrix.mean(axis=1)
    features_df['nearest_cosine_similarity'] = nearest_distances

    # ラベルと結合
    learning_df=pd.concat([features_df, label_df], axis=1)
        
    return learning_df

In [14]:
learnin_df=feature_engineering('../data/preprocessed_data/data1.csv')

In [15]:
learnin_df.head()

Unnamed: 0,kmeans_dist,kmeans_cluster,dbscan_cluster,pca_1,pca_2,pca_3,pca_4,anomaly_score,nearest_cosine_similarity,dengue
0,1.759021,0,0,0.869669,0.206458,-0.506251,0.291727,1.0,0.493044,1.0
1,1.809409,0,-1,0.428689,-0.457575,0.511395,-0.463499,1.0,0.478332,0.0
2,1.844575,0,1,0.019978,0.662354,0.22509,0.576998,1.0,0.508765,0.0
3,1.838458,0,-1,0.20758,-0.470573,0.719875,0.548034,1.0,0.489903,0.0
4,1.781878,0,-1,0.561811,-0.552607,0.187506,-0.619805,-1.0,0.422482,1.0


In [8]:
learnin_df.to_csv('../data/learning_data.csv')