#### Importações

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# DataFrame display settings. None removes the corresponding limit, so
# displaying a large frame directly will render all of it.
pd.set_option('display.max_columns', None)  # Show every column
pd.set_option('display.max_rows', None)     # Show every row
pd.set_option('display.max_colwidth', None) # Show the full content of each cell

#### Importando os dataframes que serão analisados

In [2]:
# Load the two datasets compared throughout this notebook. The paths are
# relative, so the kernel's working directory must be a sibling of "dados".
df_original = pd.read_csv("../dados/nova_plataforma.csv")  # reference dataset
df_gan = pd.read_csv("../dados/registros_gan.csv")  # records this notebook describes as GAN-generated

##### Removendo as colunas de df_original que não estão em df_gan e df_regressao

In [3]:
# Columns of df_original that are dropped so its schema matches the dataframes
# it is compared against.
colunas_para_remover = ['platform', 'recommended (1 partial; 2 complete)', 'impacts_of_option-partial']
# Rebind instead of dropping in place, and ignore labels that are already gone,
# so this cell can be re-run on a live kernel without raising KeyError.
df_original = df_original.drop(columns=colunas_para_remover, errors='ignore')

#### Analisando os registros gerados pela gan

##### Concatenando as estatísticas obtidas do df_original e df_gan

In [4]:
# Per-variable summary statistics for df_original and df_gan. The describe()
# output is transposed so each row is a variable; 'count' is dropped because it
# only reports the number of records, and a suffix marks the source of each
# statistic.
describe_original = (
    df_original.describe()
    .transpose()
    .drop(columns='count')
    .add_suffix('_original')
)
describe_gan = (
    df_gan.describe()
    .transpose()
    .drop(columns='count')
    .add_suffix('_gan')
)

# Column order that alternates each original statistic with its GAN counterpart
ordem_intercalada = [
    nome
    for par in zip(describe_original.columns, describe_gan.columns)
    for nome in par
]

# A left join on the variable names keeps the rows of describe_original; the
# columns are then rearranged into the alternating order.
original_x_gan = describe_original.join(describe_gan)[ordem_intercalada]


In [5]:
original_x_gan.head(25)

Unnamed: 0,mean_original,mean_gan,std_original,std_gan,min_original,min_gan,25%_original,25%_gan,50%_original,50%_gan,75%_original,75%_gan,max_original,max_gan
water_depth (m),124.428571,122.2517,22.307377,22.618711,99.0,99.0,107.5,100.0,116.0,113.0,142.5,148.0,156.0,155.0
weight (t),18815.714286,19162.6999,5472.264453,2994.90045,10054.0,10433.0,16530.5,17321.25,18584.0,19384.0,21182.0,21161.5,27647.0,27058.0
installation_date,38.857143,38.5444,5.639993,5.371156,31.0,31.0,34.5,33.0,41.0,39.0,42.5,44.0,46.0,45.0
type_of_production (1 oil and gas; 2 oil; 3 gas),1.571429,1.5978,0.786796,0.820062,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,3.0
number_of_legs,7.428571,6.6894,1.511858,0.82849,4.0,4.0,8.0,7.0,8.0,7.0,8.0,7.0,8.0,8.0
number_of_piles,17.142857,13.6073,10.106575,9.827499,5.0,5.0,8.5,5.0,20.0,9.0,23.0,23.0,32.0,31.0
height_of_jacket_or_sub-structure (m),137.5,127.770308,18.99342,16.045366,114.0,114.0,122.75,114.7928,136.0,119.7965,150.5,137.9888,166.0,165.945
distance_to_coast (km),218.571429,229.6228,66.271807,57.451207,120.0,120.0,180.0,178.0,240.0,260.0,264.0,280.0,282.0,281.0
risk_to_other_users-complete,0.428571,0.4621,0.534522,0.498586,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
risk_to_other_users-partial,7e-06,7e-06,1e-05,9e-06,0.0,0.0,0.0,2.537206e-09,7.8e-08,5.593135e-07,1.4e-05,1.810404e-05,2.3e-05,2.3e-05


#### Removendo registros que contêm células que ultrapassam o intervalo da coluna (min e max)

In [6]:
def ultrapassou_intervalo(registro, coluna, df_original):
    """Tell whether a value lies outside the range observed in a reference column.

    Args:
        registro: Value to check.
        coluna: Label of the column of ``df_original`` that supplies the bounds.
        df_original: DataFrame whose column minimum and maximum define the range.

    Returns:
        bool: True when ``registro`` is strictly greater than the column maximum
        or strictly less than the column minimum; False otherwise (a value equal
        to either bound is inside the range).
    """
    valores_referencia = df_original[coluna]
    acima_do_maximo = registro > valores_referencia.max()
    # The minimum is only consulted when the value did not exceed the maximum.
    return True if acima_do_maximo or registro < valores_referencia.min() else False

In [7]:
def remover_registros_fora_do_intervalo(df_gan, df_original):
    """Drop the rows of ``df_gan`` that have any value outside the range of ``df_original``.

    For every column of ``df_gan`` the allowed range is the [min, max] of the
    column with the same label in ``df_original``. A row is removed when at
    least one of its values is strictly below the minimum or strictly above the
    maximum; missing values never count as out of range.

    Args:
        df_gan: DataFrame to filter. It is not modified.
        df_original: Reference DataFrame; must contain every column of ``df_gan``.

    Returns:
        A new DataFrame holding the in-range rows of ``df_gan``, index preserved.

    Raises:
        KeyError: If a column of ``df_gan`` does not exist in ``df_original``.

    Side effects:
        Prints the number of removed rows.
    """
    # Bounds are computed once per column instead of once per cell.
    referencia = df_original[df_gan.columns]
    minimos = referencia.min()
    maximos = referencia.max()

    # Row-wise mask: True when at least one column falls outside its bounds.
    fora_do_intervalo = (
        df_gan.lt(minimos, axis=1) | df_gan.gt(maximos, axis=1)
    ).any(axis=1)

    # Selecting by mask (rather than dropping by label) removes exactly the
    # offending rows even if index labels repeat.
    df_gan_limpo = df_gan.loc[~fora_do_intervalo]
    print(int(fora_do_intervalo.sum()))
    return df_gan_limpo

In [8]:
df_gan_limpo = remover_registros_fora_do_intervalo(df_gan, df_original)

0


In [9]:
df_gan_limpo.shape

(10000, 25)

#### Removendo outliers com base na amplitude interquartil

In [10]:
def remover_outliers_iqr(df_gan, fator=1.5):
    """Drop the rows of ``df_gan`` that contain an outlier under the IQR rule.

    For each column, the limits are ``q1 - fator * iqr`` and ``q3 + fator * iqr``,
    where ``q1``/``q3`` are the 25th/75th percentiles of that column in
    ``df_gan`` and ``iqr = q3 - q1``. A row is removed when at least one of its
    values is strictly outside the limits of its column.

    Note that for a column whose ``q1`` equals ``q3`` both limits collapse to
    that single value, so every row with a different value in that column is
    removed.

    Args:
        df_gan: DataFrame to filter. It is not modified.
        fator: Multiplier applied to the interquartile range (default 1.5).

    Returns:
        A new DataFrame with the remaining rows of ``df_gan``, index preserved.

    Side effects:
        Prints the number of removed rows.
    """
    if df_gan.empty:
        # Nothing to evaluate; mirror the normal path's print and return type.
        print(0)
        return df_gan.copy()

    # Quantiles do not depend on the row being inspected: compute them once.
    q1 = df_gan.quantile(0.25)
    q3 = df_gan.quantile(0.75)
    iqr = q3 - q1  # Interquartile range, one value per column
    limite_inferior = q1 - fator * iqr
    limite_superior = q3 + fator * iqr

    # Row-wise mask: True when at least one column is outside its limits.
    eh_outlier = (
        df_gan.lt(limite_inferior, axis=1) | df_gan.gt(limite_superior, axis=1)
    ).any(axis=1)

    df_gan_limpo = df_gan.loc[~eh_outlier]
    print(int(eh_outlier.sum()))
    return df_gan_limpo

In [11]:
# Chain both cleaning steps: the IQR filter runs on the output of the range
# filter instead of on the raw df_gan, which would discard the range filtering.
# Starting from df_gan keeps the cell idempotent when re-run; it prints the
# count removed by each step.
df_gan_limpo = remover_outliers_iqr(
    remover_registros_fora_do_intervalo(df_gan, df_original)
)

5527


In [12]:
df_gan.shape, df_gan_limpo.shape

((10000, 25), (4473, 25))

In [13]:
df_gan_limpo.duplicated().sum()

0

#### Comparando novamente as bases de dados

In [14]:
# Summary statistics of df_original and of the cleaned GAN records, one row per
# variable. 'count' is removed because it only carries the number of records,
# and each statistic is suffixed with the dataset it came from.
describe_original = df_original.describe().T.drop(columns='count').add_suffix('_original')
describe_gan = df_gan_limpo.describe().T.drop(columns='count').add_suffix('_gan')

# Alternate the statistics: mean_original, mean_gan, std_original, std_gan, ...
colunas_intercaladas = []
for coluna_original, coluna_gan in zip(describe_original.columns, describe_gan.columns):
    colunas_intercaladas += [coluna_original, coluna_gan]

# Rows follow describe_original; the GAN statistics are aligned to them by label.
original_x_gan = pd.concat(
    [describe_original, describe_gan.reindex(describe_original.index)],
    axis=1,
)[colunas_intercaladas]


In [15]:
original_x_gan.head(26)

Unnamed: 0,mean_original,mean_gan,std_original,std_gan,min_original,min_gan,25%_original,25%_gan,50%_original,50%_gan,75%_original,75%_gan,max_original,max_gan
water_depth (m),124.428571,115.830762,22.307377,19.684184,99.0,99.0,107.5,99.0,116.0,106.0,142.5,132.0,156.0,155.0
weight (t),18815.714286,19248.246144,5472.264453,1969.571299,10054.0,13126.0,16530.5,17940.0,18584.0,19310.0,21182.0,20602.0,27647.0,25769.0
installation_date,38.857143,38.519785,5.639993,4.799385,31.0,31.0,34.5,34.0,41.0,39.0,42.5,43.0,46.0,45.0
type_of_production (1 oil and gas; 2 oil; 3 gas),1.571429,1.373798,0.786796,0.701552,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,3.0,3.0
number_of_legs,7.428571,7.0,1.511858,0.0,4.0,7.0,8.0,7.0,8.0,7.0,8.0,7.0,8.0,7.0
number_of_piles,17.142857,9.755421,10.106575,8.075427,5.0,5.0,8.5,5.0,20.0,5.0,23.0,11.0,32.0,31.0
height_of_jacket_or_sub-structure (m),137.5,120.780478,18.99342,12.060419,114.0,114.0,122.75,114.137,136.0,115.0,150.5,120.373,166.0,165.304
distance_to_coast (km),218.571429,233.438185,66.271807,54.338256,120.0,120.0,180.0,194.0,240.0,260.0,264.0,279.0,282.0,281.0
risk_to_other_users-complete,0.428571,0.700648,0.534522,0.458025,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
risk_to_other_users-partial,7e-06,4e-06,1e-05,8e-06,0.0,0.0,0.0,6.991625e-11,7.8e-08,1.450694e-08,1.4e-05,2.885956e-06,2.3e-05,2.29991e-05


In [16]:
# Difference between the summary statistics of the two datasets (original minus
# GAN), one row per variable, without the 'count' column.
# NOTE(review): this subtracts df_gan, not df_gan_limpo, although it sits in the
# post-cleaning comparison section — confirm which frame is intended.
describe_diferenca = (
    df_original.describe()
    .sub(df_gan.describe())
    .transpose()
    .drop(columns='count')
)
describe_diferenca

Unnamed: 0,mean,std,min,25%,50%,75%,max
water_depth (m),2.176871,-0.3113349,0.0,7.5,3.0,-5.5,1.0
weight (t),-346.9856,2477.364,-379.0,-790.75,-800.0,20.5,589.0
installation_date,0.3127429,0.2688375,0.0,1.5,2.0,-1.5,1.0
type_of_production (1 oil and gas; 2 oil; 3 gas),-0.02637143,-0.03326665,0.0,0.0,0.0,0.0,0.0
number_of_legs,0.7391714,0.6833675,0.0,1.0,1.0,1.0,0.0
number_of_piles,3.535557,0.2790755,0.0,3.5,11.0,0.0,1.0
height_of_jacket_or_sub-structure (m),9.729692,2.948054,0.0,7.95725,16.2035,12.51125,0.055
distance_to_coast (km),-11.05137,8.8206,0.0,2.0,-20.0,-16.0,1.0
risk_to_other_users-complete,-0.03352857,0.03593603,0.0,0.0,0.0,0.0,0.0
risk_to_other_users-partial,1.195818e-07,1.463294e-07,0.0,-2.537206e-09,-4.813135e-07,-4e-06,0.0


#### Analisando a semelhança dos dataframes com base no KSComplement e CorrelationSimilarity

In [17]:
from sdmetrics.reports.single_table import QualityReport
from sdv.metadata import SingleTableMetadata

In [18]:
df_original.head()

Unnamed: 0,water_depth (m),weight (t),installation_date,type_of_production (1 oil and gas; 2 oil; 3 gas),number_of_legs,number_of_piles,height_of_jacket_or_sub-structure (m),distance_to_coast (km),risk_to_other_users-complete,risk_to_other_users-partial,risk_to_personnel-complete,risk_to_personnel-partial,energy_consumption-complete (GJ),energy_consumption-partial (GJ),emissions-complete (t),emissions-partial (t),impacts_of_option-complete,technical_feasibility_or_challenge-complete,technical_feasibility_or_challenge-partial,commercial_impact_on_fisheries-complete,commercial_impact_on_fisheries-partial,wider_community_impact-complete,wider_community_impact-partial,total_removal_cost-complete,total_removal_cost-partial
0,99,22000,36,1,8,8,114.0,269,1,0.0,0.29,0.12,1110100,817000,87000,58500,0.0,0.0,1.0,0.79,0.78,0.5,0.5,1.0,0.56
1,112,20364,41,1,8,9,123.0,259,1,0.0,0.32,0.16,1180500,895500,92000,64000,0.0,0.0,1.0,0.79,0.78,0.5,0.5,1.0,0.56
2,141,15561,46,2,8,26,147.0,120,0,2.3e-05,0.025,0.01,297654,530148,24277,31064,0.66,0.25,1.0,1.0,0.94,1.0,1.0,0.53,1.0
3,156,27647,44,2,8,32,166.0,240,0,1.5e-05,0.04,0.02,487750,570818,40416,45266,1.0,0.5,1.0,1.0,0.66,1.0,1.0,1.0,0.57
4,103,18584,33,1,8,20,122.5,230,0,7.8e-08,0.09,0.06,733082,511765,59588,41170,0.53,0.39,0.15,0.0,0.0,0.0,0.0,1.0,0.73


In [19]:
df_gan_limpo.head()

Unnamed: 0,water_depth (m),weight (t),installation_date,type_of_production (1 oil and gas; 2 oil; 3 gas),number_of_legs,number_of_piles,height_of_jacket_or_sub-structure (m),distance_to_coast (km),risk_to_other_users-complete,risk_to_other_users-partial,risk_to_personnel-complete,risk_to_personnel-partial,energy_consumption-complete (GJ),energy_consumption-partial (GJ),emissions-complete (t),emissions-partial (t),impacts_of_option-complete,technical_feasibility_or_challenge-complete,technical_feasibility_or_challenge-partial,commercial_impact_on_fisheries-complete,commercial_impact_on_fisheries-partial,wider_community_impact-complete,wider_community_impact-partial,total_removal_cost-complete,total_removal_cost-partial
3,99,21864,33,1,7,5,116.924,280,1,4.168242e-09,0.196,0.106,1162515,712150,56720,51859,0.011,0.0,0.999,0.893,0.912,0.147,0.571,1.0,0.441
4,154,22160,45,3,7,29,153.872,205,0,2.288615e-05,0.025,0.014,326299,442949,24348,34025,0.943,0.499,0.999,0.98,0.797,0.983,0.764,0.983,0.71
5,112,20741,38,1,7,19,147.868,280,0,1.040589e-05,0.025,0.011,422675,433864,24346,31383,0.977,0.497,0.902,0.889,0.433,0.854,0.379,1.0,0.497
13,111,17979,39,1,7,5,115.238,194,1,1.852431e-07,0.279,0.152,1022266,848812,46171,50131,0.006,0.076,0.98,0.045,0.118,0.048,0.091,1.0,0.603
14,124,21217,37,1,7,5,119.267,252,1,3.888132e-07,0.159,0.104,991284,728409,41057,49594,0.042,0.005,0.999,0.903,0.844,0.515,0.473,0.999,0.522


In [20]:
# A is the real DataFrame and B the synthetic one handed to the report; both are
# copies of the source frames.
# NOTE(review): B is copied from df_gan, not df_gan_limpo, so the report scores
# the records before outlier removal — confirm this is intended.
A = df_original.copy()
B = df_gan.copy()

# Build the table metadata by detecting it from the real data, then export it
# as a dict, which is the form passed to the report below
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=A)
metadata_dict = metadata.to_dict()

# Generate the quality report comparing real and synthetic data
report = QualityReport()
report.generate(real_data=A, synthetic_data=B, metadata=metadata_dict)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 25/25 [00:00<00:00, 546.24it/s]|
Column Shapes Score: 70.23%

(2/2) Evaluating Column Pair Trends: |██████████| 300/300 [00:00<00:00, 364.18it/s]|
Column Pair Trends Score: 92.93%

Overall Score (Average): 81.58%



In [21]:
report.get_score()

0.8158169829059161

In [22]:
report.get_properties()

Unnamed: 0,Property,Score
0,Column Shapes,0.702295
1,Column Pair Trends,0.929339


In [23]:
report.get_details(property_name='Column Shapes')

Unnamed: 0,Column,Metric,Score
0,installation_date,KSComplement,0.857143
1,type_of_production (1 oil and gas; 2 oil; 3 gas),KSComplement,0.926857
2,number_of_legs,KSComplement,0.159957
3,number_of_piles,KSComplement,0.674257
4,height_of_jacket_or_sub-structure (m),KSComplement,0.586157
5,risk_to_other_users-complete,KSComplement,0.966471
6,risk_to_other_users-partial,KSComplement,0.614229
7,risk_to_personnel-complete,KSComplement,0.693957
8,risk_to_personnel-partial,KSComplement,0.750957
9,impacts_of_option-complete,KSComplement,0.617171


In [24]:
report.get_details(property_name='Column Pair Trends')

Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,installation_date,type_of_production (1 oil and gas; 2 oil; 3 gas),CorrelationSimilarity,0.997904,0.622398,0.626591
1,installation_date,number_of_legs,CorrelationSimilarity,0.872327,0.614305,0.358959
2,installation_date,number_of_piles,CorrelationSimilarity,0.976564,0.681692,0.728563
3,installation_date,height_of_jacket_or_sub-structure (m),CorrelationSimilarity,0.923974,0.599779,0.447727
4,installation_date,risk_to_other_users-complete,CorrelationSimilarity,0.991762,-0.473868,-0.457392
5,installation_date,risk_to_other_users-partial,CorrelationSimilarity,0.960311,0.828725,0.749348
6,installation_date,risk_to_personnel-complete,CorrelationSimilarity,0.957019,-0.277817,-0.191854
7,installation_date,risk_to_personnel-partial,CorrelationSimilarity,0.925998,-0.274356,-0.126353
8,installation_date,impacts_of_option-complete,CorrelationSimilarity,0.87205,0.481318,0.225418
9,installation_date,technical_feasibility_or_challenge-complete,CorrelationSimilarity,0.979833,0.397458,0.357124
