In [None]:
from data import input_data
from data_preparation import SmartBrain
from vgan import GAN
from synthetic import synthetic_data
from statistical_tests import mmd_rbf, Wilcoxon, Student_t, Kolmogorov_Smirnov
from statistical_tests import plot_histograms

In [None]:
# Load the 'Breast_cancer' dataset through the project helper `input_data`.
# decimal=',' is forwarded as-is (presumably the decimal separator of the
# source file -- confirm against data.input_data).
df = input_data('Breast_cancer', decimal=',')
SB = SmartBrain()  # preparation helper imported from data_preparation
# Encode categorical variables and remove NaN values.
# NOTE: `df` is rebound to the prepared data here, so the raw data is no
# longer reachable under this name once the cell has run.
df = SB.nbPreparation(df)
# Last expression of the cell: shows df.dtypes as the cell output
# (the per-column dtypes, assuming df is a pandas DataFrame).
df.dtypes

In [None]:
"""
Generate a synthetic dataset from the prepared input data.

# Arguments
GAN     : Vanilla GAN (imported from vgan)
df      : input data
samples : number of rows to generate in the synthetic dataset

# Additional arguments (defaults as noted by the original author)
initial_lr = 0.0002 (default)
dropout    = 0.5 (default)
epochs     = 10 (default)

# Output
newdf = synthetic dataset with length = samples
"""
# NOTE(review): no random seed is set in the visible cells; if synthetic_data
# is stochastic, newdf will differ from run to run -- consider seeding.
samples= 2000
newdf = synthetic_data(
    GAN, 
    df, 
    samples = samples
    )


In [None]:
"""
  Maximum Mean Discrepancy (MMD) is a statistical test used to determine
  whether two samples come from different distributions. The test statistic
  measures the distance between the means of the two samples after mapping
  them into a reproducing kernel Hilbert space (RKHS).
  MMD has found numerous applications in machine learning and
  nonparametric testing.

  [1] Ilya Tolstikhin, Bharath K. Sriperumbudur, and Bernhard Schölkopf. 2016. 
  Minimax estimation of maximum mean discrepancy with radial kernels. 
  In Proceedings of the 30th International Conference on Neural 
  Information Processing Systems (NIPS'16). 
  Curran Associates Inc., Red Hook, NY, USA, 1938–1946.

  [2] A. Gretton, K. M. Borgwardt, M. Rasch, B. Schölkopf, and A. Smola. 
  A kernel method for the two sample problem. 
  In B. Schölkopf, J. Platt, and T. Hoffman, editors, Advances in Neural
  Information Processing Systems 19, pages 513–520, Cambridge, MA, 2007. MIT Press.

      Args:

         X: ndarray of shape (n_samples_X, n_features)
         Y: ndarray of shape (n_samples_Y, n_features)
         gamma: float (default: None)

      Returns:
          Maximum Mean Discrepancy (MMD) value: float
"""
# NOTE(review): the Args above describe ndarrays, while df / newdf look like
# pandas DataFrames elsewhere in this notebook -- confirm mmd_rbf accepts them.
mmd_rbf(df, newdf, gamma=None)

In [None]:
"""
  The Wilcoxon signed-rank test tests the null hypothesis
  that two related paired samples come from the same
  distribution. In particular, it tests whether the
  distribution of the differences x - y is symmetric
  about zero. The Wilcoxon signed-rank test is the
  nonparametric version of the paired Student's t-test.
  It has less statistical power than the paired t-test
  when the assumptions of the t-test hold, but more power
  when they are violated, e.g. when the observations are
  not independent, when x or y does not follow a normal
  distribution, or when x and y do not have the same
  variance.

  H0 (null hypothesis): the data vectors x1 and x2 are
  samples from the same distribution.
  If p < 0.05, H0 is rejected.

      Args:

          X: pd.DataFrame of shape (n_samples_X, n_features)
          Y: pd.DataFrame of shape (n_samples_Y, n_features)

      Returns:
          list
            list of features + Wilcoxon signed-rank
            test p values
"""
# NOTE(review): the signed-rank test assumes paired observations; confirm how
# Wilcoxon pairs the rows of df with the independently generated rows of newdf.
Wilcoxon(df, newdf)

In [None]:
"""Compares whether the means are equivalent
  using Student's t-test.

  H0 (null hypothesis): there is no effective
  difference between the observed sample mean and
  the hypothesized or stated population mean.
  If p < 0.05, H0 is rejected.

      Args:

          X: pd.DataFrame of shape (n_samples_X, n_features)
          Y: pd.DataFrame of shape (n_samples_Y, n_features)

      Returns:
          list
            list of features + Student's t-test p values
"""
# NOTE(review): the H0 wording above describes a one-sample test, but two
# DataFrames are passed here -- confirm which t-test variant Student_t runs.
Student_t(df, newdf)

In [None]:
"""Performs the two-sample Kolmogorov-Smirnov
  test for goodness of fit, which compares the
  underlying distributions of two samples.

  H0 (null hypothesis): the data vectors x1 and x2
  are from populations with the same distribution.
  If p < 0.05, H0 is rejected.

      Args:

          X: pd.DataFrame of shape (n_samples_X, n_features)
          Y: pd.DataFrame of shape (n_samples_Y, n_features)

      Returns:
          list
            list of features and Kolmogorov-Smirnov
            test p values
"""
Kolmogorov_Smirnov(df, newdf)

In [None]:
"""
    Plot histograms

    Visually compares the distribution of each feature
    in the two datasets and shows the Wilcoxon test
    values. The histograms use probability density as
    the normalization (histnorm).

      Args:

          X: pd.DataFrame of shape (n_samples_X, n_features)
          Y: pd.DataFrame of shape (n_samples_Y, n_features)

      Returns:
          Plotly figure
"""
plot_histograms(df, newdf)

In [None]:
# Persist the synthetic dataset as CSV; the relative path resolves against the
# current working directory.
# NOTE(review): if newdf is a pandas DataFrame, to_csv also writes the index as
# the first column by default -- pass index=False if that column is not wanted.
newdf.to_csv('Breast_cancer_synthetic2.csv')