In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
def _fit_scale_all_columns(frame, scaler):
    """Fit `scaler` on every column of `frame` and return the scaled values as a DataFrame.

    The result keeps the original column names and gets a fresh 0..n-1 index.
    """
    column_names = frame.columns.tolist()
    transformer = ColumnTransformer(
        transformers=[('num', Pipeline(steps=[('scaler', scaler)]), column_names)]
    )
    return pd.DataFrame(transformer.fit_transform(frame), columns=column_names)


def feature_extract(df, label_col='dengue', n_clusters=10, n_pca_components=10,
                    n_tsne_components=3, tsne_perplexity=30, random_state=0):
    """Append unsupervised, per-row derived features to a labelled table.

    The label column is set aside, the remaining columns are min-max scaled, and the
    following features are computed from the scaled data:

    * ``kmeans_dist``    - distance to the closest k-means centroid
    * ``kmeans_cluster`` - index of the assigned k-means cluster
    * ``pca_1..pca_N``   - PCA projections
    * ``tsne_1..tsne_M`` - t-SNE embedding coordinates
    * ``nearest_cosine_similarity`` - despite its name, the MEAN cosine similarity of
      the row to every row of the table (itself included)

    The derived features are then standardized (zero mean, unit variance).

    Every model is fitted on ``df`` alone and no fitted object is returned, so features
    produced by separate calls are not expressed in a shared space.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``label_col``; every other column is treated as numeric.
    label_col : str, default 'dengue'
        Name of the label column.
    n_clusters : int, default 10
        Number of k-means clusters.
    n_pca_components : int, default 10
        Number of PCA components (PCA raises if the data cannot support this many).
    n_tsne_components : int, default 3
        Dimension of the t-SNE embedding.
    tsne_perplexity : float, default 30
        t-SNE perplexity.
    random_state : int, default 0
        Seed shared by k-means, PCA and t-SNE.

    Returns
    -------
    pandas.DataFrame
        Original (unscaled) feature columns, followed by the standardized derived
        features, followed by the label column, on a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    # Separate the label from the explanatory variables.
    label_series = df[label_col]
    df = df.drop(label_col, axis=1)

    # Min-max scale the explanatory variables; all derived features use this scaled copy.
    scaled_df = _fit_scale_all_columns(df, MinMaxScaler())

    derived = {}

    # Clustering: distance to the closest centroid and the assigned cluster id.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(scaled_df)
    derived['kmeans_dist'] = kmeans.transform(scaled_df).min(axis=1)
    derived['kmeans_cluster'] = kmeans.labels_

    # Dimensionality reduction: PCA projections.
    pca = PCA(n_components=n_pca_components, random_state=random_state)
    pca_features = pca.fit_transform(scaled_df)
    for component in range(pca_features.shape[1]):
        derived[f'pca_{component + 1}'] = pca_features[:, component]

    # Dimensionality reduction: t-SNE embedding.
    tsne = TSNE(n_components=n_tsne_components, random_state=random_state,
                perplexity=tsne_perplexity)
    tsne_features = tsne.fit_transform(scaled_df)
    for component in range(tsne_features.shape[1]):
        derived[f'tsne_{component + 1}'] = tsne_features[:, component]

    # Similarity: average of each row of the pairwise cosine-similarity matrix.
    # The column name is kept as-is because downstream data files rely on it.
    derived['nearest_cosine_similarity'] = cosine_similarity(scaled_df).mean(axis=1)

    # Standardize the derived features (dict insertion order fixes the column order).
    features_df = _fit_scale_all_columns(pd.DataFrame(derived), StandardScaler())

    # Assemble the training frame: raw features | derived features | label.
    learning_df = pd.concat(
        [df.reset_index(drop=True), features_df, label_series.reset_index(drop=True)],
        axis=1,
    )
    return learning_df
    
    

In [3]:
# Load the five preprocessed tables; the first CSV column is used as the row index.
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/preprocessed_data/df1.csv',
        '../data/preprocessed_data/df2.csv',
        '../data/preprocessed_data/df3.csv',
        '../data/preprocessed_data/df4.csv',
        '../data/preprocessed_data/df5.csv',
    )
)




後で Medical BERT に使用するための生データ(df1 と df2 を結合して保存)

In [4]:
# Raw (un-augmented) rows of df1 and df2, saved for later use with the medical BERT model.
# Only `row_data` is bound here: the former chained assignment also pointed `learning_df`
# at this raw data, which could leak into later cells if they were run out of order.
row_data = pd.concat([df1, df2], axis=0)
row_data.to_csv('../data/row_data.csv')


実際に学習で使用するデータ(特徴量抽出後の df1 と df2)

In [4]:
# Build the feature-augmented frame for each of the two source tables, in order.
learning_df1, learning_df2 = map(feature_extract, (df1, df2))


In [5]:
# Stack the two augmented frames row-wise (original row labels are kept) and save to CSV.
learning_df = pd.concat(objs=[learning_df1, learning_df2], axis='index')
learning_df.to_csv(path_or_buf='stacking_model/learning_data.csv')

In [6]:
# Show the combined training table as the cell output.
learning_df

Unnamed: 0,fever,headache,muscle pain,joint pain,rash,nausea,vomiting,eye pain,abdominal pain,lymphadenopathy,...,pca_6,pca_7,pca_8,pca_9,pca_10,tsne_1,tsne_2,tsne_3,nearest_cosine_similarity,dengue
0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.871928,-1.198259,-1.953684,0.465678,-1.001058,0.516306,0.552540,-0.512182,-0.005099,1.0
1,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,-0.369260,0.267164,-1.812076,2.485652,-0.287923,0.121641,-0.576519,0.603666,0.971852,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.145097,-0.061317,-1.322027,-0.042936,1.611824,-1.072705,-0.325399,-1.372784,0.204822,0.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.090913,-0.874147,1.323701,1.966422,0.561129,-0.245241,-0.874049,1.036044,0.568445,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.148020,-1.539669,1.700657,0.533482,0.013400,1.948557,-1.575037,0.650907,0.302657,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.399251,-0.036360,3.497924,-0.086333,0.078378,-0.759045,0.866572,-1.620569,0.414101,0.0
409,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,-0.784939,-0.293305,-0.316408,-0.029310,-0.213675,0.572830,-1.185400,-1.741749,0.315529,0.0
410,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.423950,-1.132150,1.758630,-0.050845,0.000730,-0.097876,0.102822,0.422938,0.142913,1.0
411,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,-0.402089,0.724196,-0.222296,0.051133,-0.218588,-0.622541,-1.603136,-0.055569,0.538169,0.0
