In [169]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from scipy.stats import chi2_contingency
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer


sns.set(
    { "figure.figsize": (17, 7) },
    style='ticks',
    # palette=sns.color_palette("Set2"),
    color_codes=True,
    font_scale=0.8
)
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')

# Load data

In [170]:
# Read the adverts table from the CSV file in the working directory
auto = pd.read_csv(filepath_or_buffer='adverts.csv')

In [171]:
# Work on a random subset to keep the imputation fast. The seed makes the
# subset (and every number derived from it) identical on each full re-run, and
# the size is capped so a file with fewer than 5000 rows does not raise.
auto = auto.sample(n=min(5000, len(auto)), random_state=42)

In [172]:
class ImportanceImputer():
    """Impute a categorical column from the rows that resemble each row most.

    Every other column becomes a categorical predictor: numeric columns with
    many distinct values are binned by a shallow decision tree trained to
    predict the target, and all remaining columns are compared as text (so a
    missing value is its own level). Predictors are ranked by Cramér's V
    against the target. A row's imputed value is the most frequent known
    target value among the rows that agree with it on all predictors; if no
    such row exists, the lowest-ranked predictor is dropped and the lookup is
    repeated.

    Parameters
    ----------
    categorical_ratio : float, default 0.3
        A column whose number of distinct values (missing counted as one) is
        below this fraction of the row count is treated as categorical even
        when its dtype is numeric.
    max_depth : int, default 2
        Depth of the binning tree; a depth-d tree gives at most 2**d bins.
    random_state : int, default 42
        Seed passed to the binning tree.
    verbose : bool, default True
        Print which columns were treated as numerical and as categorical.

    Attributes
    ----------
    feature_scores_ : dict
        Cramér's V per predictor, in ranking order; set by ``fit_transform``.
    """

    # Level assigned to numeric values that fall in no bin (i.e. are missing)
    _MISSING_BIN = "Bin_missing"

    def __init__(self, categorical_ratio=0.3, max_depth=2, random_state=42, verbose=True):
        self.categorical_ratio = categorical_ratio
        self.max_depth = max_depth
        self.random_state = random_state
        self.verbose = verbose

    def fit_transform(self, df, impute_feature):
        """Add ``<impute_feature>_imputed`` to ``df`` and return ``df``.

        All rows receive an imputed value, including rows whose target is
        already known. Such rows take part in their own lookup, so agreement
        between the original and the imputed column on them is an optimistic
        estimate of imputation quality.

        ``df`` is modified in place only by the added output column; the
        predictor columns keep their values and dtypes.

        Parameters
        ----------
        df : pandas.DataFrame
            Table holding the target column and the predictor columns.
        impute_feature : str
            Name of the column to impute.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object with the extra column. A row for which no
            match is found even on the top-ranked predictor alone gets ``None``.

        Raises
        ------
        ValueError
            If ``impute_feature`` has no known values to learn from.
        """
        output_col = impute_feature + "_imputed"
        target = df[impute_feature]
        target_values = target.to_numpy()
        known = target.notna().to_numpy()
        if not known.any():
            raise ValueError(f"'{impute_feature}' has no known values to impute from")

        # Keep the frame's own column order so ties in the ranking resolve the
        # same way on every run, and skip the output of an earlier call so that
        # re-running does not feed the previous imputation back in as a predictor.
        predictors = [col for col in df.columns if col not in (impute_feature, output_col)]

        # Encode every predictor as a categorical level per row, in a separate
        # frame so the caller's columns are left untouched.
        cardinality_limit = self.categorical_ratio * len(df)
        numerical_features, categorical_features, levels = [], [], {}
        for col in predictors:
            values = df[col]
            many_values = values.nunique(dropna=False) >= cardinality_limit
            if pd.api.types.is_numeric_dtype(values) and many_values:
                numerical_features.append(col)
                levels[col] = self._bin_numeric(values, target)
            else:
                categorical_features.append(col)
                levels[col] = values.astype(str).to_numpy()
        encoded = pd.DataFrame(levels, index=df.index)

        if self.verbose:
            print("numerical features", numerical_features)
            print("categorical features", categorical_features)

        # Rank predictors by strength of association with the target
        # (descending Cramér's V; the sort is stable, so ties keep column order).
        scores = {
            col: self._cramers_v(encoded[col].to_numpy()[known], target_values[known])
            for col in encoded.columns
        }
        ranked = sorted(scores, key=lambda col: -scores[col])
        self.feature_scores_ = {col: scores[col] for col in ranked}

        # Look up, for ever shorter prefixes of the ranking, the most frequent
        # known target value per combination of predictor levels.
        imputed = np.full(len(df), None, dtype=object)
        pending = np.ones(len(df), dtype=bool)
        if ranked:
            reference = encoded[known].copy()
            reference[impute_feature] = target_values[known]
        for width in range(len(ranked), 0, -1):
            if not pending.any():
                break
            prefix = ranked[:width]
            modes = (
                reference.groupby(prefix, sort=False)[impute_feature]
                .agg(lambda group: group.mode().iloc[0])  # ties: first of the sorted modes
                .reset_index()
            )
            # A left merge keeps the order of the pending rows
            found = encoded.loc[pending, prefix].merge(modes, on=prefix, how="left")[impute_feature]
            hit = found.notna().to_numpy()
            rows = np.flatnonzero(pending)[hit]
            imputed[rows] = found.to_numpy()[hit]
            pending[rows] = False

        df[output_col] = imputed
        return df

    def _bin_numeric(self, values, target):
        """Bin a numeric column at the split points of a tree predicting ``target``.

        Returns an object array with one label per row: ``Bin_<i>`` for the
        i-th interval (right-closed, as ``pd.cut`` does by default), or the
        missing-value label for values that fall in no interval.
        """
        usable = (values.notna() & target.notna()).to_numpy()
        thresholds = []
        if usable.any():
            # Fit only on rows where both the feature and the target are known
            tree = DecisionTreeClassifier(max_depth=self.max_depth, random_state=self.random_state)
            feature_column = values.to_numpy(dtype=float, na_value=np.nan)[usable].reshape(-1, 1)
            tree.fit(feature_column, target.to_numpy()[usable])
            # Leaves carry the feature marker -2; every other node is a split
            is_split = tree.tree_.feature != -2
            thresholds = sorted(set(tree.tree_.threshold[is_split].tolist()))

        # e.g. thresholds [5000, 15000] -> (-inf, 5000], (5000, 15000], (15000, inf)
        edges = [-np.inf] + thresholds + [np.inf]
        labels = [f'Bin_{i}' for i in range(len(edges) - 1)]
        binned = pd.cut(values, bins=edges, labels=labels).astype(object)
        return binned.where(binned.notna(), self._MISSING_BIN).to_numpy()

    @staticmethod
    def _cramers_v(feature, target):
        """Cramér's V between two aligned arrays of categorical values.

        Returns 0.0 when the statistic is undefined (fewer than two levels on
        either side) or not finite.
        """
        table = pd.crosstab(feature, target)
        n = table.to_numpy().sum()
        smaller_dim = min(table.shape) - 1
        if n == 0 or smaller_dim < 1:
            return 0.0
        chi2 = chi2_contingency(table)[0]
        cramer_v = float(np.sqrt((chi2 / n) / smaller_dim))
        return cramer_v if np.isfinite(cramer_v) else 0.0

# Build the imputer and fill `standard_colour` on the sampled adverts
imputer = ImportanceImputer()

result = imputer.fit_transform(df=auto, impute_feature="standard_colour")

# Bare last expression so the notebook renders the returned frame
result

numerical features Index(['public_reference', 'mileage', 'price'], dtype='object')
categorical features ['reg_code', 'year_of_registration', 'standard_model', 'vehicle_condition', 'body_type', 'standard_make', 'fuel_type', 'crossover_car_and_van']


Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type,standard_colour_imputed
378019,202010315658247,37000.0,63,Black,Mercedes-Benz,A Class,USED,2013.0,10995,Hatchback,False,Diesel,Black
193579,202010215268609,40000.0,64,Grey,Renault,Megane,USED,2014.0,5795,Coupe,False,Diesel,Grey
108637,202009234100264,40000.0,17,Grey,Volvo,XC60,USED,2017.0,17695,SUV,False,Diesel,Grey
282372,202010014432673,0.0,,Black,Abarth,595,NEW,,21590,Hatchback,False,Petrol,Black
127675,202009254212369,29082.0,17,Grey,Mercedes-Benz,C Class,USED,2017.0,23480,Convertible,False,Petrol,Grey
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47249,202010315631392,34000.0,55,Silver,Vauxhall,Agila,USED,2005.0,1195,Hatchback,False,Petrol,Silver
336269,202009023201608,13768.0,66,Silver,Peugeot,208,USED,,7300,Hatchback,False,Petrol,Silver
369748,202010305615431,8700.0,67,Black,Vauxhall,Mokka X,USED,2017.0,12990,SUV,False,Petrol,Black
110509,202002227597644,2304.0,69,Red,Vauxhall,Astra,USED,2020.0,14797,Hatchback,False,Petrol,Red


In [173]:
# Display the rows of `auto` that carry the same index labels as `result`
auto.loc[result.index, :]

Unnamed: 0,public_reference,mileage,reg_code,standard_colour,standard_make,standard_model,vehicle_condition,year_of_registration,price,body_type,crossover_car_and_van,fuel_type,standard_colour_imputed
378019,202010315658247,37000.0,63,Black,Mercedes-Benz,A Class,USED,2013.0,10995,Hatchback,False,Diesel,Black
193579,202010215268609,40000.0,64,Grey,Renault,Megane,USED,2014.0,5795,Coupe,False,Diesel,Grey
108637,202009234100264,40000.0,17,Grey,Volvo,XC60,USED,2017.0,17695,SUV,False,Diesel,Grey
282372,202010014432673,0.0,,Black,Abarth,595,NEW,,21590,Hatchback,False,Petrol,Black
127675,202009254212369,29082.0,17,Grey,Mercedes-Benz,C Class,USED,2017.0,23480,Convertible,False,Petrol,Grey
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47249,202010315631392,34000.0,55,Silver,Vauxhall,Agila,USED,2005.0,1195,Hatchback,False,Petrol,Silver
336269,202009023201608,13768.0,66,Silver,Peugeot,208,USED,,7300,Hatchback,False,Petrol,Silver
369748,202010305615431,8700.0,67,Black,Vauxhall,Mokka X,USED,2017.0,12990,SUV,False,Petrol,Black
110509,202002227597644,2304.0,69,Red,Vauxhall,Astra,USED,2020.0,14797,Hatchback,False,Petrol,Red


In [174]:
# Keep only the rows whose recorded standard_colour is known
comparison_df = auto.dropna(subset=['standard_colour'])

# Share of those rows where the imputed colour equals the recorded one
matches = comparison_df['standard_colour'].eq(comparison_df['standard_colour_imputed']).sum()
total = comparison_df.shape[0]
accuracy = matches / total

print(f"Accuracy: {accuracy:.2%} ({matches}/{total} matches)")

Accuracy: 87.53% (4311/4925 matches)


In [175]:
# # Map car colors to line colors
# color_mapping = {
#     "Red": "#F4A6A6",      # Pastel red
#     "Blue": "#A6C8F4",     # Pastel blue
#     "Green": "#A6F4A6",    # Pastel green
#     "White": "#F7F7F7",    # Light gray (near white)
#     "Black": "#000000",    # Pure black
#     "Silver": "#CACACA",   # Soft silver
#     "Grey": "#9D9DAD",     # Pastel gray
#     "Orange": "#F7B696",   # Pastel orange
#     'Beige': "#331100",
#     'Yellow': "#FFFF00", 
#     'Gold': "#FFAA00", 
#     'Purple': "#FF00FF", 
#     'Burgundy': "#AA0000", 
#     'Brown': "#441111", 
#     'Bronze': "#441111",
#     'Maroon': "#AA2277",
#     'Multicolour':"#778822",
#     'Pink': "#FFDDDD",
#     'Turquoise': "#BB99FF"
# }

# fig, ax = plt.subplots(figsize=(8,0.5))
# sns.stripplot(auto.loc[auto["price"] < 5e4].sample(500), x="price", jitter=0.3, alpha=0.5,legend=False, ax=ax, hue='standard_colour', dodge=False, palette=color_mapping);