In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata

# from compare import Compare


In [74]:
columns = [
            "ID","Diagnosis","radius1","texture1","perimeter1","area1","smoothness1","compactness1",
            "concavity1","concave_points1","symmetry1","fractal_dimension1","radius2","texture2","perimeter2",
            "area2","smoothness2","compactness2","concavity2","concave_points2","symmetry2","fractal_dimension2",
            "radius3","texture3","perimeter3","area3","smoothness3","compactness3","concavity3","concave_points3",
            "symmetry3","fractal_dimension3"]

In [75]:
def visualizeData(data,feature,plottype = "box"):
    index_array = np.arange(0,len(data),1)
    if plottype == "box":
        plt.figure(figsize=(3,3))
        plt.boxplot(data)
        plt.show()
    elif plottype == "hist":
        plt.figure(figsize=(3,3))
        plt.hist(data,bins=100, density=True)
        plt.show()
    elif plottype == "scatter":
        plt.figure(figsize=(3,3))
        plt.scatter(index_array, data)
        plt.show()
    elif plottype == "all":
        fig, axis = plt.subplots(3,1, figsize=(10,5))

        fig.suptitle(f"plots for {feature}")
        axis[0].boxplot(data)
        axis[0].set_title("Boxplot")

        axis[1].hist(data,bins = 100)
        axis[1].set_title("Histogram (Distribution)")

        axis[2].scatter(index_array, data)
        axis[2].set_title("Scatterplot")
        
    elif plottype == "norm":
        stats.probplot(data,dist="norm",plot=plt)
        plt.show()

In [76]:
data = pd.read_csv("breast+cancer+wisconsin+diagnostic/wdbc.data")
data.columns = columns
data["Diagnosis"] = data["Diagnosis"].map({"M":1, "B":0}) #Change values under col "Diagnosis" from (M,B) to (1,0).
data.drop(["ID"], axis=1,inplace=True)


In [77]:
meta = Metadata.detect_from_dataframe(data)
GAN = CTGANSynthesizer(meta, epochs=5000)
GAN.fit(data)



In [78]:
synthetic_data = GAN.sample(num_rows=568) #568

In [93]:
class Compare:
    def __init__(self, colname = None, synt_data = None, real_data = None):
        self.synt_data = synt_data
        self.real_data = real_data
        self.cols = [
            "Diagnosis","radius1","texture1","perimeter1","area1","smoothness1","compactness1",
            "concavity1","concave_points1","symmetry1","fractal_dimension1","radius2","texture2","perimeter2",
            "area2","smoothness2","compactness2","concavity2","concave_points2","symmetry2","fractal_dimension2",
            "radius3","texture3","perimeter3","area3","smoothness3","compactness3","concavity3","concave_points3",
            "symmetry3","fractal_dimension3"]
        
    def set_real(self, real_data):
        self.real_data = real_data
        return None
    
    def set_synt(self, synt_data):
        self.synt_data = synt_data
        return None

    def box(self,colname):
        fig, axis = plt.subplots(2,1, figsize = (10,5))
        axis[0].boxplot(self.real_data[colname])
        axis[0].set_title("Real data")

        axis[1].boxplot(self.synt_data[colname])
        axis[1].set_title("synthetic data")


    def hist(self,colname):

        fig, axis = plt.subplots(3,1, figsize = (10,5), sharex=True, sharey=True)
        fig.suptitle(f"Historgram for {colname}")
        axis[0].hist(self.real_data[colname],bins=100)
        axis[0].legend(["Real data"])

        axis[1].hist(self.synt_data[colname],color="red", bins=100)
        axis[1].legend(["synthetic data"])

        axis[2].hist(self.synt_data[colname],color="red",bins=100, alpha = 0.5)
        axis[2].hist(self.real_data[colname],color="blue",bins=100, alpha = 0.5)
        axis[2].legend(["synthetic Data","Real Data"])

    def calculate_all(self, colname = None):

        wasserstein_list = self.calculate_wasserstein(colname)
        mean_list = self.calculate_mean(colname)
        std_list = self.calculate_std(colname)
        min_list = self.calculate_min(colname)
        max_list = self.calculate_max(colname)

        # self.calculate_correclations(colname)
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)

        print("wasserstein, mean, std, min, max")
        for index, col in enumerate(cols):
            print(f"Real: {wasserstein_list[index]}{mean_list[index][0]}{std_list[0]}{min_list[0]}{max_list[0]}") #fix this
            print(f"synthetic:{wasserstein_list[index]}{mean_list[index][1]}{std_list[1]}{min_list[1]}{max_list[1]}") #fix this
        

    def calculate_wasserstein(self, colname):
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)


        wasser_list = []
        for col in cols:
            wasser_list.append(stats.wasserstein_distance(self.real_data[col], self.synt_data[col]))

        return wasser_list

    

    def calculate_mean(self,colname):
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)
        mean_list = []
        for col in cols:
            mean_list.append([self.real_data[col].mean(), self.synt_data[col].mean()])
        
        return mean_list


    def calculate_std(self,colname):
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)
        std_list = []
        for col in cols:
            std_list.append([self.real_data[col].std(), self.synt_data[col].std()])
        return std_list

        
    def calculate_min(self,colname):
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)
        min_list = []
        for col in cols:
            min_list.append([self.real_data[col].min(), self.synt_data[col].min()])
        return min_list

    
    def calculate_max(self,colname):
        if colname == None:
            cols = self.cols
        else:
            cols = list(colname)
        max_list = []
        for col in cols:
            max_list.append([self.real_data[col].max(), self.synt_data[col].max()])
        return max_list


    def calculate_correclations(colname):
        pass


In [94]:

cp = Compare(real_data=data,synt_data=synthetic_data)
cp.calculate_all()

wasserstein, mean, std, min, max
Real: 0.11091549295774650.3714788732394366[0.48362599741223444, 0.5001303950993612][0, 0][1, 1]
synthetic:0.11091549295774650.4823943661971831[3.5234162139180856, 4.50774131449985][6.981, 8.591][28.11, 28.11]
Real: 1.118214788732394314.1204911971831[0.48362599741223444, 0.5001303950993612][0, 0][1, 1]
synthetic:1.118214788732394315.190427816901408[3.5234162139180856, 4.50774131449985][6.981, 8.591][28.11, 28.11]
Real: 3.16232394366197219.305334507042254[0.48362599741223444, 0.5001303950993612][0, 0][1, 1]
synthetic:3.16232394366197222.463045774647888[3.5234162139180856, 4.50774131449985][6.981, 8.591][28.11, 28.11]
Real: 6.106144366197181591.91475352112677[0.48362599741223444, 0.5001303950993612][0, 0][1, 1]
synthetic:6.106144366197181598.02089788732395[3.5234162139180856, 4.50774131449985][6.981, 8.591][28.11, 28.11]
Real: 61.7213028169014654.2797535211268[0.48362599741223444, 0.5001303950993612][0, 0][1, 1]
synthetic:61.7213028169014652.4901408450704[

508    0.05202
481    0.05216
32     0.05217
263    0.05228
82     0.05272
        ...   
152    0.09744
265    0.09744
17     0.09744
538    0.09744
199    0.09744
Name: fractal_dimension1, Length: 568, dtype: float64
