In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import scipy.stats as stats
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

# Checking the classification Power of the synth data created from data

Data Loading

In [2]:
df_train = pd.read_csv("../data/processed/sorted_train_data_mm.csv")

synthetic_data_paths = [
    # Test Data
    "../data/processed/sorted_test_data_mm.csv",
    # Base Model
    "../data/interim/generated/Base/id1_repeat_data.csv",
    "../data/interim/generated/Base/id1_repeat_sample.csv",
    # this is the experiment with the top values of the d-generation process
    "../data/interim/generated/D-Generation/3646_b_b0_1kl_0_0001_l_64_n512_data_no_batch.csv",
    "../data/interim/generated/D-Generation/467_a_b0_1kl_0_0001_l_4_n512_data_no_batch.csv",
    "../data/interim/generated/D-Generation/3646_b_b0_1kl_0_0001_l_64_n512_data_batch.csv",
    "../data/interim/generated/D-Generation/1393_b_b0_1kl_0_0001_l_32_n512_data_no_batch.csv",
    "../data/interim/generated/D-Generation/1393_b_b0_1kl_0_0001_l_32_n512_data_batch.csv",
    "../data/interim/generated/D-Generation/966_a_b0_1kl_0_0001_l_16_n256_data_no_batch.csv",
    "../data/interim/generated/D-Generation/2767_b_b0_1kl_0_0001_l_2_n256_128_data_no_batch.csv",
    # here are the values of the s-generation process only sampling
    "../data/interim/generated/S-Generation/1517_c_b1kl_1_l_2_n512_batch_sample_sample.csv",
    # back to cgan to compare
    "../data/interim/generated/CGANEPOCH2.csv",
    # Oversampling
    "../data/interim/generated/S-Generation/1517_c_b1kl_1_l_2_n512_oversample_02_sample.csv",
    "../data/interim/generated/S-Generation/1517_c_b1kl_1_l_2_n512_oversample_01_sample.csv",
    "../data/interim/generated/S-Generation/1517_c_b1kl_1_l_2_n512_oversample_005_sample.csv",
    "../data/interim/generated/S-Generation/1517_c_b1kl_1_l_2_n512_oversample_001_sample.csv",
]

file_names = [path.split("/")[-1].split(".")[0] for path in synthetic_data_paths]

In [3]:
synth_data_dict = {}

for path, file_name in zip(synthetic_data_paths, file_names):
    synth_data_dict[file_name] = {}
    synth_data = pd.read_csv(path)
    synth_data_dict[file_name]["X"] = synth_data.values[:, :-1]
    synth_data_dict[file_name]["y"] = synth_data.values[:, -1]

Train Random forest and Logistic Regression on original data

In [4]:
# # load train_data
# X = synth_data_dict["sorted_train_data_mm"]["X"]
# y = synth_data_dict["sorted_train_data_mm"]["y"]

# # Random Forest
# random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
# random_forest.fit(X, y)
# dump(random_forest, "../models/Classifier/random_forest_model.joblib")

# # Logistic Regression
# logistic_regression = LogisticRegression()
# logistic_regression.fit(X, y)
# dump(logistic_regression, "../models/Classifier/logistic_regression.joblib")

# load the models

In [5]:
random_forest = load("../models/Classifier/random_forest_model.joblib")
logistic_regression = load("../models/Classifier/logistic_regression.joblib")

In [6]:
def evaluate_model_performance_classification(model, X, y, name, *args):
    # Predictions on training and testing data
    y_pred = model.predict(X)
    # Accuracy scores
    accuracy = accuracy_score(y, y_pred)
    # Recall scores
    recall = recall_score(y, y_pred)
    # Precision
    precision = precision_score(y, y_pred)
    # F1 scores
    f1 = f1_score(y, y_pred)
    # Confusion Matrix
    conf_matrix = confusion_matrix(y, y_pred)

    mean_diff_list = []
    kl_div_list = []

    for column1, column2 in zip(
        np.transpose(synth_data_dict["sorted_test_data_mm"]["X"]),
        np.transpose(X)):

        mean_diff_list.append(np.mean(column1) - np.mean(column2))

        hist_test, _ = np.histogram(
            column1, bins=40
        )
        hist_prob_test = hist_test / len(column1)

        hist_synth, _ = np.histogram(
            column2, bins=40
        )
        hist_prob_synth = hist_synth / len(column2)

        kl_div = tf.keras.losses.kullback_leibler_divergence(hist_prob_test, hist_prob_synth)
        kl_div_list.append(kl_div.numpy())

    mean_class = sum(y)/len(y)  
    mean_diff = np.sum(mean_diff_list) / len(mean_diff_list)
    kl_div_value = np.sum(kl_div_list) / len(kl_div_list)

    # Create new row
    row =  [model, name, accuracy, recall, precision, f1, conf_matrix, mean_diff, mean_class, kl_div_value] #mean_f_value, p_value_low, p_value_up]

    return row

In [7]:
df_evaluation = pd.DataFrame(
    columns=[
        "Model",
        "Data Set",
        "Accuracy",
        "Recall",
        "Precision",
        "F1 Score",
        "Confusion Matrix",
        "mean_diff",
        "mean_class",
        "kl_div_value",
    ]
)

for key in synth_data_dict:
    X = synth_data_dict[key]["X"]
    y = synth_data_dict[key]["y"]
    evaluation_result = evaluate_model_performance_classification(random_forest, X, y, key)
    df_evaluation.loc[len(df_evaluation)] = evaluation_result

# for key in synth_data_dict:
#     X = synth_data_dict[key]["X"]
#     y = synth_data_dict[key]["y"]
#     evaluation_result = evaluate_model_performance_classification(
#         logistic_regression, X, y, key
#     )
#     df_evaluation.loc[len(df_evaluation)] = evaluation_result

In [8]:
df_evaluation.query(
    '`Data Set`.str.contains("sample")'
).sort_values("F1 Score", ascending=True).sort_values("F1 Score")

Unnamed: 0,Model,Data Set,Accuracy,Recall,Precision,F1 Score,Confusion Matrix,mean_diff,mean_class,kl_div_value
2,"(DecisionTreeClassifier(max_depth=10, max_feat...",id1_repeat_sample,0.999912,0.333333,0.666667,0.444444,"[[56955, 1], [4, 2]]",0.003517,0.000105,4.454869
10,"(DecisionTreeClassifier(max_depth=10, max_feat...",1517_c_b1kl_1_l_2_n512_batch_sample_sample,0.999702,0.564103,1.0,0.721311,"[[56923, 0], [17, 22]]",5e-05,0.000685,6.303054
13,"(DecisionTreeClassifier(max_depth=10, max_feat...",1517_c_b1kl_1_l_2_n512_oversample_01_sample,0.99526,0.811321,1.0,0.895833,"[[55531, 0], [270, 1161]]",-0.000104,0.025122,2.666527
15,"(DecisionTreeClassifier(max_depth=10, max_feat...",1517_c_b1kl_1_l_2_n512_oversample_001_sample,0.999368,1.0,0.829384,0.906736,"[[56751, 36], [0, 175]]",-0.000788,0.003072,4.228778
14,"(DecisionTreeClassifier(max_depth=10, max_feat...",1517_c_b1kl_1_l_2_n512_oversample_005_sample,0.998754,0.892922,0.97619,0.932701,"[[56399, 12], [59, 492]]",0.000691,0.009673,2.602489
12,"(DecisionTreeClassifier(max_depth=10, max_feat...",1517_c_b1kl_1_l_2_n512_oversample_02_sample,0.988712,0.930983,0.980502,0.955101,"[[49480, 136], [507, 6839]]",0.00442,0.128963,4.239298


In [9]:
df_evaluation.query("`Data Set`.str.contains('data')").sort_values("F1 Score", ascending=False)

Unnamed: 0,Model,Data Set,Accuracy,Recall,Precision,F1 Score,Confusion Matrix,mean_diff,mean_class,kl_div_value
9,"(DecisionTreeClassifier(max_depth=10, max_feat...",2767_b_b0_1kl_0_0001_l_2_n256_128_data_no_batch,0.999807,0.934783,0.945055,0.939891,"[[56865, 5], [6, 86]]",0.000459,0.001615,6.421584
6,"(DecisionTreeClassifier(max_depth=10, max_feat...",1393_b_b0_1kl_0_0001_l_32_n512_data_no_batch,0.999579,0.836957,0.895349,0.865169,"[[56861, 9], [15, 77]]",-8.4e-05,0.001615,2.346215
0,"(DecisionTreeClassifier(max_depth=10, max_feat...",sorted_test_data_mm,0.999544,0.771739,0.934211,0.845238,"[[56865, 5], [21, 71]]",0.0,0.001615,0.0
4,"(DecisionTreeClassifier(max_depth=10, max_feat...",467_a_b0_1kl_0_0001_l_4_n512_data_no_batch,0.999473,0.75,0.907895,0.821429,"[[56863, 7], [23, 69]]",0.001596,0.001615,4.857539
3,"(DecisionTreeClassifier(max_depth=10, max_feat...",3646_b_b0_1kl_0_0001_l_64_n512_data_no_batch,0.999438,0.73913,0.894737,0.809524,"[[56862, 8], [24, 68]]",-8.5e-05,0.001615,2.322
1,"(DecisionTreeClassifier(max_depth=10, max_feat...",id1_repeat_data,0.999456,0.642857,0.981818,0.776978,"[[56877, 1], [30, 54]]",0.003515,0.001475,2.926744
8,"(DecisionTreeClassifier(max_depth=10, max_feat...",966_a_b0_1kl_0_0001_l_16_n256_data_no_batch,0.999333,0.684783,0.875,0.768293,"[[56861, 9], [29, 63]]",7.6e-05,0.001615,2.427511
5,"(DecisionTreeClassifier(max_depth=10, max_feat...",3646_b_b0_1kl_0_0001_l_64_n512_data_batch,0.999298,0.663043,0.871429,0.753086,"[[56861, 9], [31, 61]]",0.000828,0.001615,2.322265
7,"(DecisionTreeClassifier(max_depth=10, max_feat...",1393_b_b0_1kl_0_0001_l_32_n512_data_batch,0.999228,0.619565,0.863636,0.721519,"[[56861, 9], [35, 57]]",0.00079,0.001615,2.087815


# Checking the regression and correlation Power of the synth data created from data

# Calculate correlations

## Creation of the datasets

In [10]:
# Test_data dictionary
test_data_dict = {}

for key in synth_data_dict.keys():
    if len(synth_data_dict[key]["X"]) == 56962:
        X = synth_data_dict[key]["X"]
        test_data_dict[key] = X
    else:
        pass

# Train Data Dictionary
train_data_dict = {}

for key in synth_data_dict:
    if len(synth_data_dict[key]["X"]) == 199364:
        train_data_dict[key] = synth_data_dict[key]["X"]

In [11]:
# Sorted test data dictionary, because the data had an original ordering

test_data_dict = {}
sorted_test_data_dict = {}

for key in synth_data_dict.keys():
    # leaving out data, that has still or already has an ordering
    if key == "synthdata_from_data_test" or key == "sorted_test_data_mm":
        sorted_test_data_dict[key] = synth_data_dict[key]["X"]
    elif (
        len(synth_data_dict[key]["X"]) == 56962
    ):
        X = synth_data_dict[key]["X"]
        sorted_indices = np.argsort(X[:, 0])
        sorted_data = X[sorted_indices]
        sorted_test_data_dict[key] = sorted_data
    else:
        pass

## Columnwise Correlations between orginal data and synth data

In [12]:
# Calcualte the correlation for the sorted test data

for key in list(sorted_test_data_dict.keys()):
    corr_list = []
    for i in range(0, 30):
        corr = np.corrcoef(
            sorted_test_data_dict["sorted_test_data_mm"][:, i],
            sorted_test_data_dict[key][:, i],
        )[0, 1]
        corr_list.append(corr)
    mean_corr = np.mean(corr_list)
    print(f"Mean correlation for key {key}: {mean_corr}")

Mean correlation for key sorted_test_data_mm: 1.0
Mean correlation for key id1_repeat_data: 0.03248867810548032
Mean correlation for key id1_repeat_sample: 0.03275890536821562
Mean correlation for key 3646_b_b0_1kl_0_0001_l_64_n512_data_no_batch: 0.08407186745142803
Mean correlation for key 467_a_b0_1kl_0_0001_l_4_n512_data_no_batch: 0.10929746398109595
Mean correlation for key 3646_b_b0_1kl_0_0001_l_64_n512_data_batch: 0.08544166940477699
Mean correlation for key 1393_b_b0_1kl_0_0001_l_32_n512_data_no_batch: 0.08230100374766043
Mean correlation for key 1393_b_b0_1kl_0_0001_l_32_n512_data_batch: 0.08482614286981514
Mean correlation for key 966_a_b0_1kl_0_0001_l_16_n256_data_no_batch: 0.08454044661050411
Mean correlation for key 2767_b_b0_1kl_0_0001_l_2_n256_128_data_no_batch: 0.1174815616324841
Mean correlation for key 1517_c_b1kl_1_l_2_n512_batch_sample_sample: 0.01782774399852233
Mean correlation for key CGANEPOCH2: 0.014172250112513425
Mean correlation for key 1517_c_b1kl_1_l_2_n512

# Regression

In [13]:
def create_regression(X, y):
    model = LinearRegression()
    # Predictions on training and testing data
    fitted_regresion = model.fit(X, y)
    y_pred = fitted_regresion.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)

    return mse, r2, fitted_regresion

In [14]:
def evaluate_model_performance_regression(X, y, fitted_regression):
    y_pred = fitted_regression.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mse, r2

# Create the Regression for the test data

In [15]:
regression_dict_test = {}

mse_list = []
r2_list = []

for i in range(30):
    X = np.delete(sorted_test_data_dict["sorted_test_data_mm"], i, axis=1)
    y = sorted_test_data_dict["sorted_test_data_mm"][:, i]
    mse, r2, fitted_regression = create_regression(X, y)

    regression_dict_test[i] = {}
    regression_dict_test[i]["mse"] = mse
    regression_dict_test[i]["r2"] = r2
    regression_dict_test[i]["model"] = fitted_regression

    mse_list.append(mse)
    r2_list.append(r2)

print(f"The mean mse of all columns is {sum(mse_list)/len(mse_list)}")
print(f" The mean r2 of all columns is {sum(r2_list)/len(r2_list)}")

The mean mse of all columns is 0.0028647283347766517
 The mean r2 of all columns is 0.2368110425314998


In [16]:
test_eval_dict = {}

for key in sorted_test_data_dict.keys():
    test_eval_dict[key] = {}
    mse_list = []
    r2_list = []

    for i in range(30):
        X = np.delete(sorted_test_data_dict[key], i, axis=1)
        y = sorted_test_data_dict[key][:, i]
        model = regression_dict_test[i]["model"]
        mse, r2 = evaluate_model_performance_regression(X, y, model)
        mse_list.append(mse)
        r2_list.append(r2)
    mse_mean = sum(mse_list) / len(mse_list)
    r2_mean = sum(r2_list) / len(r2_list)

    test_eval_dict[key]["mse"] = mse_mean
    test_eval_dict[key]["r2"] = r2_mean
    test_eval_dict[key]["mse_list"] = mse_list
    test_eval_dict[key]["r2_list"] = r2_list
    correlation_mse = np.corrcoef(
        test_eval_dict["sorted_test_data_mm"]["mse_list"], test_eval_dict[key]["mse_list"]
    )[0, 1]
    test_eval_dict[key]["correlation_mse"] = correlation_mse
    correlation_r2 = np.corrcoef(
        test_eval_dict["sorted_test_data_mm"]["r2_list"], test_eval_dict[key]["r2_list"]
    )[0, 1]
    test_eval_dict[key]["correlation_r2"] = correlation_r2

    print(f"The mean mse value for all columns of table {key} is {mse_mean}")
    print(f"The mean r2 value for all columns  of table {key} is {r2_mean} ")
    print(f"The correlation for mse of all columns of table {key} is {correlation_mse}")
    print(f"The correlation for r2 of all columns of table {key} is {correlation_r2}")

The mean mse value for all columns of table sorted_test_data_mm is 0.0028647283347766517
The mean r2 value for all columns  of table sorted_test_data_mm is 0.2368110425314998 
The correlation for mse of all columns of table sorted_test_data_mm is 1.0
The correlation for r2 of all columns of table sorted_test_data_mm is 1.0
The mean mse value for all columns of table id1_repeat_data is 0.0009142621626083672
The mean r2 value for all columns  of table id1_repeat_data is -4.4836909648911485 
The correlation for mse of all columns of table id1_repeat_data is 0.8575681002593363
The correlation for r2 of all columns of table id1_repeat_data is -0.006144775574664058
The mean mse value for all columns of table id1_repeat_sample is 0.0009052873958459876
The mean r2 value for all columns  of table id1_repeat_sample is -4.4866841712154075 
The correlation for mse of all columns of table id1_repeat_sample is 0.8635005192631333
The correlation for r2 of all columns of table id1_repeat_sample is -0.