In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def feature_extract(df):
    # 説明変数とラベルの分離
    label_df = df['dengue']
    df = df.drop('dengue', axis=1)

    # もとのdfの標準化
    numerical_columns = df.columns.tolist()
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
    df_normalized = preprocessor.fit_transform(df)
    df = pd.DataFrame(df_normalized, columns=numerical_columns)  # カラム名を維持

    # 特徴量の格納用DataFrame
    features_df = pd.DataFrame()

    # クラスタリング
    kmeans = KMeans(n_clusters=5, random_state=0).fit(df)
    features_df['kmeans_dist'] = kmeans.transform(df).min(axis=1)
    features_df['kmeans_cluster'] = kmeans.labels_
    

    dbscan = DBSCAN(eps=0.1, min_samples=1).fit(df)
    features_df['dbscan_cluster'] = dbscan.labels_

    # 次元削減
    pca = PCA(n_components=2, random_state=0)
    pca_features = pca.fit_transform(df)
    features_df['pca_1'] = pca_features[:, 0]
    features_df['pca_2'] = pca_features[:, 1]
    
    
    # 異常検知
    isolation_forest = IsolationForest(contamination=0.1, random_state=0)
    features_df['anomaly_score'] = isolation_forest.fit_predict(df)

    # 距離・類似度ベース
    cos_sim_matrix = cosine_similarity(df)
    nearest_distances = cos_sim_matrix.mean(axis=1)
    features_df['nearest_cosine_similarity'] = nearest_distances
    
    #学習用dfの正規化
    numerical_columns = features_df.columns.tolist()
    numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
    df_normalized = preprocessor.fit_transform(features_df)
    features_df = pd.DataFrame(df_normalized, columns=numerical_columns)  # カラム名を維持

    # 学習用データフレームの統合
    learning_df = pd.concat([features_df, label_df.reset_index(drop=True)], axis=1)


    return learning_df
    
    

In [3]:
def feature_extract2(df):
    # 説明変数とラベルの分離
    label_df = df['dengue']
    df = df.drop('dengue', axis=1)

    # もとのdfの標準化
    numerical_columns = df.columns.tolist()
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
    df_normalized = preprocessor.fit_transform(df)
    df_normal = pd.DataFrame(df_normalized, columns=numerical_columns)  # カラム名を維持

    # 特徴量の格納用DataFrame
    features_df = pd.DataFrame()

    # クラスタリング
    kmeans = KMeans(n_clusters=4, random_state=0).fit(df_normal)
    features_df['kmeans_dist'] = kmeans.transform(df_normal).min(axis=1)
    features_df['kmeans_cluster'] = kmeans.labels_
    

    dbscan = DBSCAN(eps=0.1, min_samples=1).fit(df_normal)
    features_df['dbscan_cluster'] = dbscan.labels_

    # 次元削減
    pca = PCA(n_components=4, random_state=0)
    pca_features = pca.fit_transform(df_normal)
    features_df['pca_1'] = pca_features[:, 0]
    features_df['pca_2'] = pca_features[:, 1]
    features_df['pca_3'] = pca_features[:, 2]
    features_df['pca_4'] = pca_features[:, 3]
    
    # 異常検知
    isolation_forest = IsolationForest(contamination=0.1, random_state=0)
    features_df['anomaly_score'] = isolation_forest.fit_predict(df_normal)

    # 距離・類似度ベース
    cos_sim_matrix = cosine_similarity(df_normal)
    nearest_distances = cos_sim_matrix.mean(axis=1)
    features_df['nearest_cosine_similarity'] = nearest_distances
    
    #学習用dfの正規化
    numerical_columns = features_df.columns.tolist()
    numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_columns)])
    df_normalized = preprocessor.fit_transform(features_df)
    features_df = pd.DataFrame(df_normalized, columns=numerical_columns)  # カラム名を維持

    # 学習用データフレームの統合
    learning_df = pd.concat([df,features_df, label_df.reset_index(drop=True)], axis=1)


    return learning_df
    
    

In [4]:
df1=pd.read_csv('../data/preprocessed_data/df1.csv',index_col=0)
df2=pd.read_csv('../data/preprocessed_data/df2.csv',index_col=0)
df3=pd.read_csv('../data/preprocessed_data/df3.csv',index_col=0)
df4=pd.read_csv('../data/preprocessed_data/df4.csv',index_col=0)
df5=pd.read_csv('../data/preprocessed_data/df5.csv',index_col=0)




In [5]:
learning_df1=feature_extract(df1)
learning_df2=feature_extract(df2)


In [6]:
learning_df=pd.concat([learning_df1,learning_df2],axis=0)
learning_df.to_csv('../data/learning_data.csv')

In [7]:
learning_df

Unnamed: 0,kmeans_dist,kmeans_cluster,dbscan_cluster,pca_1,pca_2,anomaly_score,nearest_cosine_similarity,dengue
0,0.807903,1.00,0.000000,0.690117,0.530614,1.0,0.248749,1.0
1,0.872944,0.25,0.000378,0.548990,0.361146,1.0,0.372575,0.0
2,0.811521,0.00,0.000755,0.402503,0.733854,1.0,0.528842,0.0
3,0.809303,1.00,0.001133,0.472950,0.346633,0.0,0.441506,0.0
4,0.815355,0.00,0.001511,0.601353,0.324567,1.0,0.329382,1.0
...,...,...,...,...,...,...,...,...
408,0.179921,0.75,1.000000,0.766380,0.234948,1.0,0.410607,0.0
409,0.123779,0.75,0.656863,0.942497,0.030640,1.0,0.622803,0.0
410,0.216380,0.75,0.794118,0.757859,0.355427,1.0,0.202073,1.0
411,0.093152,0.75,0.196078,0.693195,0.077580,1.0,0.881248,0.0
