In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata

# from compare import Compare


In [25]:
# Column names for the headerless wdbc.data file: two leading columns followed
# by the same ten feature names repeated for each of the suffixes 1, 2 and 3.
columns = ["ID", "Diagnosis"] + [
    f"{feature}{suffix}"
    for suffix in (1, 2, 3)
    for feature in (
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave_points",
        "symmetry",
        "fractal_dimension",
    )
]


32


In [5]:
def visualizeData(data,feature,plottype = "box"):
    """Plot a single feature in one of several ways.

    Parameters
    ----------
    data : sequence or 1-D array-like
        Values of the feature to plot.
    feature : str
        Name of the feature; only used in the figure title of the "all" layout.
    plottype : str, default "box"
        One of:
        "box"     - box plot,
        "hist"    - density-normalised histogram with 100 bins,
        "scatter" - values plotted against their positional index,
        "all"     - box plot, histogram and scatter plot stacked in one figure,
        "norm"    - probability plot against a normal distribution.

    Raises
    ------
    ValueError
        If `plottype` is not one of the supported names (previously such a
        call silently produced no plot at all).
    """
    supported = ("box", "hist", "scatter", "all", "norm")
    if plottype not in supported:
        raise ValueError(
            f"Unknown plottype {plottype!r}; expected one of {supported}"
        )

    if plottype == "box":
        plt.figure(figsize=(3,3))
        plt.boxplot(data)
        plt.show()
    elif plottype == "hist":
        plt.figure(figsize=(3,3))
        plt.hist(data,bins=100, density=True)
        plt.show()
    elif plottype == "scatter":
        # Positional index 0..n-1 serves as the x axis.
        index_array = np.arange(0,len(data),1)
        plt.figure(figsize=(3,3))
        plt.scatter(index_array, data)
        plt.show()
    elif plottype == "all":
        index_array = np.arange(0,len(data),1)
        fig, axis = plt.subplots(3,1, figsize=(10,5))

        fig.suptitle(f"plots for {feature}")
        axis[0].boxplot(data)
        axis[0].set_title("Boxplot")

        axis[1].hist(data,bins = 100)
        axis[1].set_title("Histogram (Distribution)")

        axis[2].scatter(index_array, data)
        axis[2].set_title("Scatterplot")

        # Every other branch shows its figure explicitly; do the same here so
        # the function behaves consistently outside an inline notebook backend.
        plt.show()
    else:  # plottype == "norm"
        stats.probplot(data,dist="norm",plot=plt)
        plt.show()

In [20]:
# wdbc.data has no header row. Without header=None pandas would consume the
# first record as column labels and silently drop it from the data set.
data = pd.read_csv(
    "breast+cancer+wisconsin+diagnostic/wdbc.data",
    header=None,
    names=columns,
)
# Optional: encode the target numerically.
# data["Diagnosis"] = data["Diagnosis"].map({"M":1, "B":0}) #Change values under col "Diagnosis" from (M,B) to (1,0).
# The ID column is not used as a feature; drop it with an assignment rather
# than inplace=True.
data = data.drop(columns=["ID"])


In [22]:
# Let SDV detect the table metadata from the loaded DataFrame.
meta = Metadata.detect_from_dataframe(data)
# NOTE(review): this call is not the last expression of the cell, so whatever
# it returns is not rendered automatically by the notebook - confirm intent.
meta.visualize()
# CTGAN synthesizer built from the detected metadata, configured with epochs=200.
GAN = CTGANSynthesizer(meta, epochs=200)
# Fit the synthesizer on the real data.
# NOTE(review): presumably the slow step of the notebook - confirm.
GAN.fit(data)



In [23]:
# Draw as many synthetic rows as there are real rows instead of hard-coding
# the count, so the sample size follows the loaded data set.
synthetic_data = GAN.sample(num_rows=len(data))

In [17]:
class Compare:
    """Compare a real DataFrame with a synthetic one, column by column.

    Offers side-by-side plots (`box`, `hist`) and per-column summary
    statistics (`calculate_*`). Every `calculate_*` method accepts:

    * ``None``            - use the default column list ``self.cols``;
    * a single ``str``    - one column name;
    * any other iterable  - several column names.
    """

    def __init__(self, colname = None, synt_data = None, real_data = None):
        """Store the two frames to compare.

        `colname` is accepted for backward compatibility but is not used.
        """
        self.synt_data = synt_data
        self.real_data = real_data
        # Default columns used when a calculate_* method receives colname=None.
        self.cols = [
            "Diagnosis","radius1","texture1","perimeter1","area1","smoothness1","compactness1",
            "concavity1","concave_points1","symmetry1","fractal_dimension1","radius2","texture2","perimeter2",
            "area2","smoothness2","compactness2","concavity2","concave_points2","symmetry2","fractal_dimension2",
            "radius3","texture3","perimeter3","area3","smoothness3","compactness3","concavity3","concave_points3",
            "symmetry3","fractal_dimension3"]

    def set_real(self, real_data):
        """Replace the real DataFrame."""
        self.real_data = real_data
        return None

    def set_synt(self, synt_data):
        """Replace the synthetic DataFrame."""
        self.synt_data = synt_data
        return None

    def _is_numeric(self, col):
        """Return True when `col` has a numeric dtype in both frames."""
        return bool(
            pd.api.types.is_numeric_dtype(self.real_data[col])
            and pd.api.types.is_numeric_dtype(self.synt_data[col])
        )

    def _resolve_columns(self, colname, numeric_only = False):
        """Turn the `colname` argument into a list of column names.

        A single string is wrapped in a list; calling ``list()`` on it would
        split it into characters. With ``colname=None`` the default list is
        used and, if `numeric_only` is set, columns that are not numeric in
        both frames are skipped, because mean, std and Wasserstein distance
        are undefined for them. Explicitly requested columns are never
        filtered.
        """
        if colname is None:
            if numeric_only:
                return [col for col in self.cols if self._is_numeric(col)]
            return list(self.cols)
        if isinstance(colname, str):
            return [colname]
        return list(colname)

    def _paired_stat(self, colname, stat_name, numeric_only):
        """Return ``[[real_stat, synthetic_stat], ...]`` for the Series method `stat_name`."""
        return [
            [getattr(self.real_data[col], stat_name)(), getattr(self.synt_data[col], stat_name)()]
            for col in self._resolve_columns(colname, numeric_only=numeric_only)
        ]

    def box(self,colname):
        """Draw box plots of `colname` for the real (top) and synthetic (bottom) data."""
        fig, axis = plt.subplots(2,1, figsize = (10,5))
        axis[0].boxplot(self.real_data[colname])
        axis[0].set_title("Real data")

        axis[1].boxplot(self.synt_data[colname])
        axis[1].set_title("synthetic data")


    def hist(self,colname):
        """Draw histograms of `colname`: real, synthetic, and both overlaid."""
        fig, axis = plt.subplots(3,1, figsize = (10,5), sharex=True, sharey=True)
        fig.suptitle(f"Histogram for {colname}")
        axis[0].hist(self.real_data[colname],bins=100)
        axis[0].legend(["Real data"])

        axis[1].hist(self.synt_data[colname],color="red", bins=100)
        axis[1].legend(["synthetic data"])

        # Legend entries follow the order in which the histograms are drawn.
        axis[2].hist(self.synt_data[colname],color="red",bins=100, alpha = 0.5)
        axis[2].hist(self.real_data[colname],color="blue",bins=100, alpha = 0.5)
        axis[2].legend(["synthetic Data","Real Data"])

    def calculate_all(self, colname = None):
        """Print Wasserstein distance, mean, std, min and max for each column.

        The Wasserstein distance is a single value per column (distance
        between the real and the synthetic distribution), so it is repeated
        on both the "Real" and the "synthetic" line.
        """
        # Resolve once so that all five result lists are aligned index-wise.
        cols = self._resolve_columns(colname, numeric_only=True)

        wasserstein_list = self.calculate_wasserstein(cols)
        mean_list = self.calculate_mean(cols)
        std_list = self.calculate_std(cols)
        min_list = self.calculate_min(cols)
        max_list = self.calculate_max(cols)

        print("wasserstein, mean, std, min, max")
        for index, col in enumerate(cols):
            print(f"----------------{col}----------------")
            print(f"Real: {round(wasserstein_list[index],4)}; {round(mean_list[index][0],4)}; {round(std_list[index][0],4)}; {round(min_list[index][0],4)}; {round(max_list[index][0],4)}")
            print(f"synthetic: {round(wasserstein_list[index],4)}; {round(mean_list[index][1],4)}; {round(std_list[index][1],4)}; {round(min_list[index][1],4)}; {round(max_list[index][1],4)}")


    def calculate_wasserstein(self, colname):
        """Return the Wasserstein distance between real and synthetic data, one value per column."""
        return [
            stats.wasserstein_distance(self.real_data[col], self.synt_data[col])
            for col in self._resolve_columns(colname, numeric_only=True)
        ]

    def calculate_mean(self,colname):
        """Return ``[[real_mean, synthetic_mean], ...]``, one pair per column."""
        return self._paired_stat(colname, "mean", numeric_only=True)

    def calculate_std(self,colname):
        """Return ``[[real_std, synthetic_std], ...]``, one pair per column."""
        return self._paired_stat(colname, "std", numeric_only=True)

    def calculate_min(self,colname):
        """Return ``[[real_min, synthetic_min], ...]``, one pair per column."""
        return self._paired_stat(colname, "min", numeric_only=False)

    def calculate_max(self,colname):
        """Return ``[[real_max, synthetic_max], ...]``, one pair per column."""
        return self._paired_stat(colname, "max", numeric_only=False)

    def calculate_correclations_f2f(self, colname = None):
        """
            Calculates the correlation between features to features. (Correlation within).

            TODO: not implemented yet; currently returns None.
        """
        return None

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    calculate_correlations_f2f = calculate_correclations_f2f

    def calculate_correlations_f2t(self, colname = None):
        """
            Calculates the correlation between features in relation to the target (Diagnosis)

            TODO: not implemented yet; currently returns None.
        """
        return None


In [24]:

# Real-vs-synthetic summary statistics (currently disabled):
# cp = Compare(real_data=data,synt_data=synthetic_data)
# cp.calculate_all()

# Preview the first rows of the generated synthetic sample.
synthetic_data.head()

Unnamed: 0,Diagnosis,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,B,21.01,20.38,64.88,1034.3,0.05567,0.0667,0.009483,0.031063,0.1551,...,17.885,22.34,117.09,795.5,0.08643,0.18206,0.712231,0.08037,0.2104,0.11246
1,M,18.492,23.65,43.79,1381.4,0.11602,0.0993,0.0,0.028273,0.1528,...,19.038,25.46,163.89,1856.7,0.11555,0.62648,0.92707,0.078699,0.2213,0.08246
2,B,15.004,26.22,97.36,1515.8,0.11435,0.05007,0.251244,0.045546,0.1579,...,13.03,27.28,90.06,528.1,0.08438,0.30265,0.429394,0.04341,0.3008,0.1072
3,M,16.249,15.19,97.85,408.5,0.08638,0.17335,0.10985,0.075634,0.1209,...,22.986,46.1,66.84,420.2,0.0718,0.27455,0.406785,0.22346,0.1565,0.07235
4,B,13.137,17.37,70.1,1000.7,0.08144,0.12748,0.0,0.016878,0.1178,...,16.551,17.09,102.86,1757.2,0.13961,0.29916,0.457247,0.0,0.2104,0.07168
