In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import phik
import seaborn as sns
import tensorflow as tf
import keras

from ctgan import CTGAN
from sklearn.linear_model import RANSACRegressor
from sklearn.ensemble import IsolationForest
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids

from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.models import load_model


In [43]:
df = pd.read_csv("../ex1/regression/cars_optimized.csv")
target = "price_levy_combined"
df.drop("Unnamed: 0",axis=1, inplace=True)

In [44]:
# save original mean, std and variance of target variable to compare in the future
target_description = {
    "mean": [df.price_levy_combined.mean()],
    "std": [df.price_levy_combined.std()],
    "var": [df.price_levy_combined.var()],
}
target_description = pd.DataFrame(target_description)
target_description

Unnamed: 0,mean,std,var
0,18249.399472,13357.233521,178415700.0


In [45]:
# Save original correlations and phik matrix of target variable to compare in the future
original_correlations = df.corr()["price_levy_combined"]
original_phik_matrix = df.phik_matrix()["price_levy_combined"]

interval columns not set, guessing: ['Prod. year', 'Leather interior', 'Mileage', 'Cylinders', 'Wheel', 'price_levy_combined', 'Airbags_bins', 'Engine volume int', 'Turbo', 'Coupe', 'Goods wagon', 'Hatchback', 'Jeep', 'Microbus', 'Minivan', 'Pickup', 'Sedan', 'Universal', 'Diesel', 'Hybrid', 'LPG', 'Petrol', 'Automatic', 'Manual', 'Variator', '4x4', 'Front', 'Germany', 'Japan', 'South Korea', 'USA']


In [46]:
original_distribution_fig = px.histogram(df, x="price_levy_combined", title="Original distribution")
original_distribution_fig.show()

My plan for fixing distribution of this dataset is : 

1. Generate more data with CTGAN - done

2. Use undersampling to reduce amount of cheap cars under 5k - done 

3. Try TomekLinks for overlap managment - too aggresive and took away more than half of the data (or the data is that bad?)

4. Run RANSAC for noise - done

5. Cutoff more outliers from right side of distribution - done



In [47]:
def comparison_plot(original_df, optimized_df, target="price_levy_combined"):
    """
    Plots a comparison of two datasets target variable distribution 
    """
    # Create histograms for both datasets
    histogram1 = go.Histogram(
        x=original_df[target],
        name='Before',
        bingroup=1
    )
    histogram2 = go.Histogram(
        x=optimized_df[target],
        name='After',
        # Opacity so we can see original dataset
        opacity=0.5,
        bingroup=1
    )

    # Create the layout to ensure both histograms appear on the same axis
    layout = go.Layout(
        title='Comparison of datasets before operation and after',
        barmode='overlay',
        xaxis=dict(title='Value'),
        yaxis=dict(title='Frequency'),
        hovermode="x unified"
    )

    # Combine the histograms and layout into a figure
    fig = go.Figure(data=[histogram1, histogram2], layout=layout)
    # Show the figure
    fig.show()

<h1>RANSAC noise managment</h1>

In [48]:
# X/y split for further algorithms
X = df.drop(target, axis=1)
y = df[target]

In [49]:
# initalize RANSAC regerssor and fit data
# RANSAC needs a threshold in order to process data
ransac = RANSACRegressor(residual_threshold=np.std(y) * 1.5)
ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
# noise
outliers = df[~inlier_mask]
# normal data
inliers = df[inlier_mask]

print(f"Inliers: {len(inliers)}")
print(f"Outliers: {len(outliers)}")

Inliers: 11442
Outliers: 1818


In [50]:
comparison_plot(df, inliers)

Seems like it has founded quite a lot of noise across whole dataset, but especially very cheap and very expensive cars took damage

In [51]:
df = inliers

<h1>Even more outliers managment using IsolationForest and IQR removal</h1>

In [52]:
# Initialize isolation forest
iso = IsolationForest(contamination=0.1) 

# fit isolation forest
y_pred = iso.fit_predict(df)

# filter outliers only
outliers = df[y_pred != 1]
df_inliners = df.drop(outliers.index)

In [53]:
# Calculate Q1 and Q3, adjust for a more subtle outlier detection
# Preserve most part of the data and focus on right side
Q1 = df_inliners[target].quantile(0.1)
Q3 = df_inliners[target].quantile(0.85)

# Calculate IQR
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Detect outliers
outliers = df_inliners[(df_inliners[target] < lower_bound) | (df_inliners[target] > upper_bound)]
df_inliners = df_inliners.drop(outliers.index)

In [54]:
comparison_plot(df, df_inliners)

<h1>ClusterCentroids Undersampling</h1>

In [55]:
# Cluster centroids originally is made for classification
# Create temporary binned target variablöe
df_inliners["target_binned"] = pd.cut(df_inliners[target], bins=15, labels=False)

fig = px.histogram(df_inliners, x="target_binned")
fig.show()

In [57]:
variables = df_inliners.drop('target_binned', axis=1).columns
X = df_inliners[variables]
y = df_inliners['target_binned']

# initialize the undersampling sampler
# with auto, we actually lost 95% of the data, so let's use a specified sampling strategy
strategy = {0: 200, 1: 400, 2: 800, 3: 1000, 4: 1587, 5: 900, 6: 590, 7: 300, 8: 199, 9: 200, 10: 150, 11: 100}
# sampler = RandomUnderSampler(sampling_strategy=strategy, random_state=321)
sampler = ClusterCentroids(sampling_strategy=strategy)

# perform undersampling
X_resampled, y_resampled = sampler.fit_resample(X, y)

# reconstruct the dataframe
df_resampled = pd.DataFrame(X_resampled, columns=variables)
df_resampled['target_binned'] = y_resampled

comparison_plot(df_inliners, df_resampled)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.


The ClusterCentroids or classes from which it inherits use `_get_tags` and `_more_tags`. Please define the `__sklearn_tags__` method, or inherit from `sklearn.base.BaseEstimator` and/or other appropriate mixins such as `sklearn.base.TransformerMixin`, `sklearn.base.ClassifierMixin`, `sklearn.base.RegressorMixin`, and `sklearn.base.OutlierMixin`. From scikit-learn 1.7, not defining `__sklearn_tags__` will raise an error.



<h1>Generate more data with CTGAN</h1>

In [58]:
# Initialize CTGAN
ctgan = CTGAN()

def synthesize_data(subset, size, epochs=100):
    """"
    Fits subset of DF to CTGAN, returns a synthesized dataframe of needed size
    """
    ctgan.fit(subset, epochs=epochs)
    synthetic_df = ctgan.sample(size)
    return synthetic_df

In [63]:
# Create subset
subset_df = df_resampled[(df_resampled[target] > 9000) & (df_resampled[target] < 23000)]
# Generate data
synthetic_df = synthesize_data(subset_df, 3000)
# Combine with original data
synthetic_df = pd.concat([df_resampled, synthetic_df])

# plot the difference
comparison_plot(df_resampled, synthetic_df)

<h1>Conclusion</h1>
Dataset have been heavily modified, let's see how badly correalation and phix matrices changed

In [64]:
target_description.loc[len(target_description)] = [synthetic_df.price_levy_combined.mean(), 
                                                   synthetic_df.price_levy_combined.std(), 
                                                   synthetic_df.price_levy_combined.var()]

In [65]:
target_description

Unnamed: 0,mean,std,var
0,18249.399472,13357.233521,178415700.0
1,18680.000104,9372.813134,87849630.0
2,19135.358589,9328.402446,87019090.0


In [66]:
# Load correlations and phik data of original dataset and synthesied verison into df
# and calculate the difference
correlations = pd.DataFrame(original_correlations)
correlations.rename({"price_levy_combined": "original correlations"}, axis=1, inplace=True)
correlations["synthetic correlations"] = synthetic_df.corr()["price_levy_combined"]
correlations['Correaltions Difference (%)'] = round(((correlations['synthetic correlations'] - correlations['original correlations']) / correlations['original correlations'].abs()) * 100, 2)

correlations["original phik"] = original_phik_matrix
correlations["synthetic phik"] = synthetic_df.phik_matrix()["price_levy_combined"]
correlations['Phik Difference (%)'] = round(((correlations['synthetic phik'] - correlations['original phik']) / correlations['original phik'].abs()) * 100, 2)

correlations


interval columns not set, guessing: ['Prod. year', 'Leather interior', 'Mileage', 'Cylinders', 'Wheel', 'price_levy_combined', 'Airbags_bins', 'Engine volume int', 'Turbo', 'Coupe', 'Goods wagon', 'Hatchback', 'Jeep', 'Microbus', 'Minivan', 'Pickup', 'Sedan', 'Universal', 'Diesel', 'Hybrid', 'LPG', 'Petrol', 'Automatic', 'Manual', 'Variator', '4x4', 'Front', 'Germany', 'Japan', 'South Korea', 'USA', 'target_binned']


Unnamed: 0,original correlations,synthetic correlations,Correaltions Difference (%),original phik,synthetic phik,Phik Difference (%)
Prod. year,0.408716,0.413791,1.24,0.591691,0.526332,-11.05
Leather interior,-0.192215,-0.117491,38.87,0.378959,0.240631,-36.5
Mileage,-0.244021,-0.214809,11.97,0.341389,0.258291,-24.34
Cylinders,-0.050366,-0.108647,-115.72,0.152367,0.280748,84.26
Wheel,-0.171246,-0.091554,46.54,0.318122,0.161711,-49.17
price_levy_combined,1.0,1.0,0.0,1.0,1.0,0.0
Airbags_bins,-0.052559,-0.012721,75.8,0.212926,0.210731,-1.03
Engine volume int,0.053003,0.037792,-28.7,0.260554,0.212123,-18.59
Turbo,0.082161,-0.021539,-126.22,0.202708,0.169308,-16.48
Coupe,0.000515,0.013591,2538.55,0.041832,0.063776,52.46


For me, seems like it is overoptimized. Correlations with target variable changed drastically in some rare cases (1000% or 3000%). The most usual and important (according to f-score) variables, such as mileage changed up to 12%. And in general, changes are around 30-70%.

Undersampling of cheap cars might be a mistake. Thinking afterwards, common sense tells me that by changing distribution in that way I might actually create unrealistic data. In real world, most of the cars are cheap, but trained on this data model would assume, that they are rare.

Also tried to test train it with exact same network as unoptimized data before, but I got r-squared 0.45 vs previous 0.68. However, the problem might be in network structure, because changes in data are quite heavy.