In [62]:
import os
import numpy as np
import pandas as pd
from scipy.stats import chi2

In [63]:
def load_data(path, **kwargs):
    """Load data from a CSV file."""
    return pd.read_csv(path, **kwargs)


In [64]:
def clean_data(df: pd.DataFrame):
    """Clean data by dropping duplicates and NaN values."""
    return df.drop_duplicates().dropna()

In [65]:
def compute_mahalanobis_distances(data, covariance, center_point):
    """Compute the Mahalanobis distances for a dataset."""
    covariance_pm1 = np.linalg.inv(covariance)
    distances = [
        (val - center_point).T.dot(covariance_pm1).dot(val - center_point)
        for val in data
    ]
    return np.array(distances)

In [66]:
def remove_outliers(data, distances, columns):
    """Remove outliers from data based on Mahalanobis distances."""
    cutoff = chi2.ppf(0.90, data.shape[1])
    outliers_indexes = np.where(distances > cutoff)
    data_without_outliers = np.delete(data, outliers_indexes, axis=0)
    return pd.DataFrame(data_without_outliers, columns=columns)

In [67]:
def round_decimals(df, decimals=2):
    """Round all values in the DataFrame to a given number of decimals."""
    return df.round(decimals)

In [68]:
current_dir = os.getcwd()
data_path = os.path.join(current_dir, "../data", "data.csv")

In [69]:
df = load_data(data_path)
df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0.00,5.00,0.00,0.0,0.00,0.0,95.00,0.00,25.000000,50.000000,24.0,31.930000,44.580000,72.24
106,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
107,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
108,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [70]:
df = clean_data(df)
df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,0.00,5.00,0.00,0.0,0.00,0.0,95.00,0.00,25.000000,50.000000,24.0,31.930000,44.580000,72.24
106,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
107,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
108,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [71]:
covariance = np.cov(df.values, rowvar=False)
center_point = np.mean(df.values, axis=0)
distances = compute_mahalanobis_distances(df.values, covariance, center_point)
clean_df = remove_outliers(df.values, distances, df.columns)
clean_df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,2.50,2.50,3.00,0.0,0.00,0.0,92.00,0.00,25.000000,50.000000,24.0,11.220000,49.400000,84.17
82,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
83,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
84,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [72]:
covariance = np.cov(clean_df.values, rowvar=False)
center_point = np.mean(clean_df.values, axis=0)
distances = compute_mahalanobis_distances(clean_df.values, covariance, center_point)
clean_df = remove_outliers(clean_df.values, distances, df.columns)
clean_df

Unnamed: 0,%Chi,%Gel,%Gly,%Pec,%Sta,%Oil,%W,%AA,T(°C),%RH,t(h),TS,WVP,%E
0,0.90,0.00,0.38,0.0,0.45,0.0,97.89,0.38,60.000000,50.000000,48.0,49.400000,92.200000,100.40
1,0.90,0.45,0.38,0.0,0.45,0.0,97.45,0.37,60.000000,50.000000,48.0,13.630000,90.600000,70.60
2,0.90,0.00,0.38,0.0,1.34,0.0,97.01,0.37,60.000000,50.000000,48.0,33.360000,101.200000,50.30
3,0.89,0.45,0.37,0.0,1.34,0.0,96.58,0.37,60.000000,50.000000,48.0,40.410000,94.700000,35.10
4,0.90,0.00,0.19,0.0,0.90,0.0,97.63,0.38,60.000000,50.000000,48.0,39.230000,70.300000,8.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,2.00,0.00,0.00,0.0,0.00,2.0,95.00,1.00,19.893590,46.920180,72.0,19.360000,0.300000,3.76
70,2.00,0.00,0.00,0.0,0.00,0.0,97.00,1.00,30.861554,52.836699,10.0,99.333222,39.080969,2.70
71,2.00,0.00,0.24,0.0,0.00,0.0,96.76,1.00,34.880140,52.502307,10.0,69.774008,47.873295,39.50
72,2.00,0.00,0.44,0.0,0.00,0.0,96.56,1.00,35.217398,52.547206,10.0,65.868019,48.086374,44.20


In [73]:
clean_df = round_decimals(clean_df, 2)

In [74]:
clean_df.to_csv(os.path.join(current_dir, "../data", "gretel_data.csv"), index=False)