In [24]:
import os
import numpy as np
import pandas as pd
from scipy.stats import chi2

In [25]:
def load_data(path, compression=None):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the CSV; handed to ``pd.read_csv`` unchanged.
    compression
        Forwarded to the ``compression`` argument of ``pd.read_csv``.
        Defaults to ``None``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(path, compression=compression)
    return frame

In [26]:
def clean_data(dataframe: pd.DataFrame):
    """Return a copy of ``dataframe`` without duplicate or incomplete rows.

    Exact duplicate rows are removed first, then every row that still
    contains a missing value is dropped. The result carries a fresh
    0..n-1 index; the input frame is left untouched.
    """
    deduplicated = dataframe.drop_duplicates()
    complete_rows = deduplicated.dropna()
    # Neither filter depends on index labels, so one final reset yields the
    # same frame as resetting after each step.
    return complete_rows.reset_index(drop=True)


In [27]:
def compute_mahalanobis_distances(data, reference_data):
    """Compute the squared Mahalanobis distance of every row in ``data``.

    The centre (column means) and the covariance matrix are estimated from
    ``reference_data``; each row ``x`` of ``data`` is then scored with the
    quadratic form ``(x - mu)^T S^-1 (x - mu)``. No square root is taken,
    so the values are *squared* distances.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Rows to score. NumPy arrays and DataFrames are both accepted.
    reference_data : array-like, shape (n_reference, n_features)
        Rows used to estimate the centre and covariance.

    Returns
    -------
    numpy.ndarray, shape (n_samples,)
        One squared distance per row of ``data`` (empty if ``data`` is empty).

    Raises
    ------
    numpy.linalg.LinAlgError
        If the reference covariance matrix is singular.
    ValueError
        If ``data`` is non-empty and not two-dimensional.
    """
    reference = np.asarray(reference_data, dtype=float)
    points = np.asarray(data, dtype=float)

    # np.cov squeezes its result to a 0-d array when there is a single
    # feature; force a matrix so the inversion works in that case too.
    covariance = np.atleast_2d(np.cov(reference, rowvar=False))
    inverse_covariance = np.linalg.inv(covariance)
    center_point = reference.mean(axis=0)

    if points.size == 0:
        return np.empty(0)
    if points.ndim != 2:
        raise ValueError(
            "data must be two-dimensional (n_samples, n_features); "
            f"got an array with {points.ndim} dimension(s)"
        )

    deviations = points - center_point
    # Row-wise quadratic form d_i^T S^-1 d_i, evaluated for all rows at once
    # instead of looping over them in Python.
    return np.einsum("ij,jk,ik->i", deviations, inverse_covariance, deviations)

In [28]:
def remove_outliers_using_reference(data, reference_data, dataframe, quantile=0.80):
    """Drop rows of ``data`` that lie too far from ``reference_data``.

    Each row of ``data`` is scored with ``compute_mahalanobis_distances``
    against the reference rows. Rows whose score exceeds the ``quantile``
    point of a chi-square distribution with ``n_features`` degrees of
    freedom are removed.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Candidate rows to filter.
    reference_data : array-like, shape (n_reference, n_features)
        Rows defining the reference distribution.
    dataframe : pandas.DataFrame
        Only its ``columns`` are used, as the column labels of the result.
    quantile : float, default 0.80
        Chi-square quantile used as the cutoff; must lie strictly between
        0 and 1. Larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The retained rows, in their original order, with a fresh index.

    Raises
    ------
    ValueError
        If ``quantile`` is not strictly between 0 and 1.
    """
    if not 0.0 < quantile < 1.0:
        raise ValueError(f"quantile must be strictly between 0 and 1, got {quantile!r}")

    distances = compute_mahalanobis_distances(data, reference_data)
    cutoff = chi2.ppf(quantile, np.shape(reference_data)[1])

    # Negate the "is outlier" test rather than using `<=` so that rows with a
    # NaN distance are kept, exactly as the comparison `distances > cutoff`
    # would leave them.
    keep_mask = ~(distances > cutoff)
    data_without_outliers = np.asarray(data)[keep_mask]

    return pd.DataFrame(data_without_outliers, columns=dataframe.columns)

In [29]:
# Name of the synthetic-data run. It is used both as the sub-folder under
# ../data and as the stem of the .csv.gz file loaded further down.
folder = "gretel_77_s1"
# All paths are resolved against the kernel's working directory, so the
# notebook expects a ../data directory relative to where it is launched.
current_dir = os.getcwd()
data_path = os.path.join(current_dir, "../data", "data.csv")

In [30]:
# Load data.csv (no compression) and display it. This frame is later used as
# the reference when filtering the synthetic rows.
df = load_data(data_path)
df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0.00,5.00,0.00,0.0,0.00,0.0,95.00,0.00,25.000000,50.000000,24.0,31.930000,44.580000,72.24
106,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
107,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
108,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [31]:

# Drop duplicate rows and rows with missing values, then display the result.
# Note: `df` is rebound here, so the raw frame is no longer available
# afterwards without re-running the load cell.
df = clean_data(df)
df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0.00,5.00,0.00,0.0,0.00,0.0,95.00,0.00,25.000000,50.000000,24.0,31.930000,44.580000,72.24
103,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
104,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
105,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [32]:
# Path to the gzip-compressed synthetic CSV: ../data/<folder>/<folder>.csv.gz
synthetic_data_path = os.path.join(
    current_dir, f"../data/{folder}", f"{folder}.csv.gz")
synthetic_df = load_data(synthetic_data_path, compression="gzip")
# Show (rows, columns) of the synthetic data before cleaning.
synthetic_df.shape

(5000, 14)

In [33]:
# Apply the same duplicate / missing-value cleaning to the synthetic rows and
# show the shape afterwards. `synthetic_df` is rebound to the cleaned frame.
synthetic_df = clean_data(synthetic_df)
synthetic_df.shape

(2443, 14)

In [34]:
# Filter the synthetic rows by their Mahalanobis score against the cleaned
# reference frame `df`. Both frames are passed as NumPy arrays;
# `synthetic_df` is passed a second time only to supply the column labels
# of the returned DataFrame.
clean_synthetic_df = remove_outliers_using_reference(synthetic_df.values, df.to_numpy(), synthetic_df)
clean_synthetic_df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,1.00,0.00,0.00,0.0,0.00,1.0,97.00,0.50,21.00,50.00,24.00,44.40,40.00,30.20
1,1.00,0.00,0.00,0.0,0.00,0.0,98.00,0.50,21.00,50.00,24.00,36.30,37.82,42.80
2,1.00,0.00,0.20,0.0,0.00,3.0,95.30,0.50,21.00,57.00,48.00,19.30,46.83,36.70
3,1.00,0.00,0.20,0.0,0.00,2.0,96.30,0.50,21.00,57.00,24.00,72.20,44.56,24.50
4,1.00,0.00,0.00,0.0,0.00,1.0,97.00,1.00,25.00,50.00,48.00,61.82,37.82,4.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1966,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.00,50.00,48.00,57.40,94.70,35.10
1967,0.90,0.22,0.19,0.0,1.34,0.0,96.98,0.37,60.00,50.00,48.00,38.89,101.70,4.51
1968,0.90,0.22,0.54,0.0,0.45,0.0,97.63,0.37,60.00,50.00,48.00,16.71,65.80,100.20
1969,2.00,0.00,0.00,0.0,0.00,2.0,95.50,1.00,20.32,45.66,72.00,14.41,0.54,5.46


In [35]:
# Write the filtered synthetic data into the run's folder under ../data;
# index=False keeps the row index out of the CSV.
clean_synthetic_df.to_csv(os.path.join(current_dir, f"../data/{folder}", "synthetic_gretel.csv"), index=False)