In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from scipy.stats import beta, bernoulli

In [13]:
# Load the preprocessed CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../../data/preprocessed_data.csv')
df.head(n=5)

Unnamed: 0,fever,headache,muscle pain,joint pain,rash,nausea,vomiting,eye pain,abdominal pain,lymphadenopathy,chills,diarrhea,fatigue,dengue
0,0,1,0,0,1,1,0,0,1,1,1,0,1,1
1,1,0,1,1,1,1,0,1,0,1,0,0,1,0
2,1,0,0,0,0,1,1,0,1,0,1,0,1,0
3,1,0,0,1,0,1,0,1,1,1,0,1,0,0
4,1,0,0,0,1,0,1,1,1,1,0,1,0,1


In [14]:
# Print, for every column of `df`, how many times each distinct value occurs.
for column_name in df.columns:
    counts = df[column_name].value_counts()
    print(counts)

fever
1    2817
0    1569
Name: count, dtype: int64
headache
0    2213
1    2173
Name: count, dtype: int64
muscle pain
0    2679
1    1707
Name: count, dtype: int64
joint pain
0    2457
1    1929
Name: count, dtype: int64
rash
0    2811
1    1575
Name: count, dtype: int64
nausea
0    2537
1    1849
Name: count, dtype: int64
vomiting
0    2417
1    1969
Name: count, dtype: int64
eye pain
0    2387
1    1999
Name: count, dtype: int64
abdominal pain
0    2741
1    1645
Name: count, dtype: int64
lymphadenopathy
0    2574
1    1812
Name: count, dtype: int64
chills
1    2473
0    1913
Name: count, dtype: int64
diarrhea
0    2871
1    1515
Name: count, dtype: int64
fatigue
0    2880
1    1506
Name: count, dtype: int64
dengue
0    2472
1    1914
Name: count, dtype: int64


In [15]:
# Generate synthetic binary columns with a Beta-Bernoulli scheme.
# For each column of `df`, the observed counts of 1s and 0s parameterise a Beta
# distribution over P(value == 1); one probability is drawn from it and then
# used to draw `n_samples` Bernoulli values for that column.
#
# NOTE(review): every column -- including `dengue`, presumably the label -- is
# sampled independently of all the others, so the generated rows carry no
# relationship between columns. Confirm this is acceptable for whatever
# consumes the generated data.
n_samples = 5000  # number of synthetic rows to generate
seed = 42  # fixed so that Restart & Run All regenerates exactly the same data
# A single generator is shared by all draws; re-seeding each call with the same
# integer would make the columns' random streams identical.
rng = np.random.default_rng(seed)
generated_data = []

for col in df.columns:
    # Count each value explicitly so missing entries are not silently treated as 0s.
    positive_count = int((df[col] == 1).sum())  # number of 1s
    negative_count = int((df[col] == 0).sum())  # number of 0s

    # Beta parameters with add-one (Laplace) smoothing.
    alpha = positive_count + 1
    beta_param = negative_count + 1

    # Draw one probability of observing a 1 from the Beta distribution.
    probability = beta.rvs(alpha, beta_param, random_state=rng)

    # Draw the new samples for this column from a Bernoulli with that probability.
    generated_feature = bernoulli.rvs(probability, size=n_samples, random_state=rng)
    generated_data.append(generated_feature)

# Each list entry is one column; transpose so rows are samples, then label the columns.
generated_df = pd.DataFrame(np.array(generated_data).T, columns=df.columns)

In [16]:
# Print, for every column of `generated_df`, how many times each distinct value occurs.
for column_name in generated_df.columns:
    counts = generated_df[column_name].value_counts()
    print(counts)

fever
1    3256
0    1744
Name: count, dtype: int64
headache
1    2532
0    2468
Name: count, dtype: int64
muscle pain
0    3144
1    1856
Name: count, dtype: int64
joint pain
0    2703
1    2297
Name: count, dtype: int64
rash
0    3192
1    1808
Name: count, dtype: int64
nausea
0    2948
1    2052
Name: count, dtype: int64
vomiting
0    2735
1    2265
Name: count, dtype: int64
eye pain
0    2632
1    2368
Name: count, dtype: int64
abdominal pain
0    3169
1    1831
Name: count, dtype: int64
lymphadenopathy
0    2907
1    2093
Name: count, dtype: int64
chills
1    2819
0    2181
Name: count, dtype: int64
diarrhea
0    3284
1    1716
Name: count, dtype: int64
fatigue
0    3188
1    1812
Name: count, dtype: int64
dengue
0    2782
1    2218
Name: count, dtype: int64


In [17]:
# Combined number of rows in the original frame and the generated frame.
data_count = sum(len(frame) for frame in (df, generated_df))
data_count

9386

In [18]:
# Stack the original rows on top of the generated rows and renumber the index from 0.
data = pd.concat(
    objs=[df, generated_df],
    axis=0,
    ignore_index=True,
)

In [19]:
# Write the combined frame to CSV without the row index column.
data.to_csv(path_or_buf='../../data/learning_data.csv', index=False)