In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin # For custom transformer
import joblib # For saving and loading the pipeline

In [4]:
try:
    import tensorflow as tf
    from tensorflow.keras.applications import ResNet50
    from tensorflow.keras.preprocessing import image
    from tensorflow.keras.models import Model
    from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess_input

    print("SUCCESS: TensorFlow and Keras imported for image processing. Proceeding with actual implementation.")
except ImportError:
    print("WARNING: TensorFlow/Keras not found. Image processing part of the pipeline will remain conceptual.")
    tf = None # Explicitly set tf to None if import fails


SUCCESS: TensorFlow and Keras imported for image processing. Proceeding with actual implementation.


In [5]:
df_hybrid = pd.read_csv('diamond_with_image_path.csv')

In [6]:
df_hybrid.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Messurements,Price,Data Url,id_str,image_path
0,1524552,CUSHION,0.52,VS1,S-T,EX,EX,VG,M,4.55-4.38×2.97,718.92,https://capitalwholesalediamonds.com/product/0...,1524552,Dataset/image\1524552.jpg
1,1651023,CUSHION,0.5,SI1,M,EX,EX,VG,F,4.86-4.23×2.87,725.32,https://capitalwholesalediamonds.com/product/0...,1651023,Dataset/image\1651023.jpg
2,1632749,CUSHION,0.5,VS2,M,VG,EX,GD,N,4.51-4.47×2.98,771.13,https://capitalwholesalediamonds.com/product/0...,1632749,Dataset/image\1632749.jpg
3,1660174,CUSHION,0.5,VVS2,M,EX,EX,VG,F,4.94-4.27×2.88,772.88,https://capitalwholesalediamonds.com/product/0...,1660174,Dataset/image\1660174.jpg
4,1602072,CUSHION,0.59,VS2,U-V,VG,EX,GD,N,4.64-4.57×3.19,807.91,https://capitalwholesalediamonds.com/product/0...,1602072,Dataset/image\1602072.jpg


In [7]:
cols_to_drop = ['Id', 'Data Url', 'id_str']
existing_cols = [col for col in cols_to_drop if col in df_hybrid.columns]

df_hybrid_cleaned = df_hybrid.copy()

if existing_cols:
    df_hybrid_cleaned.drop(columns=existing_cols, inplace=True)
    print(f"Dropped columns: {existing_cols}")
else:
    print("No columns to drop.")


Dropped columns: ['Id', 'Data Url', 'id_str']


In [8]:
df_hybrid_cleaned.head()

Unnamed: 0,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Messurements,Price,image_path
0,CUSHION,0.52,VS1,S-T,EX,EX,VG,M,4.55-4.38×2.97,718.92,Dataset/image\1524552.jpg
1,CUSHION,0.5,SI1,M,EX,EX,VG,F,4.86-4.23×2.87,725.32,Dataset/image\1651023.jpg
2,CUSHION,0.5,VS2,M,VG,EX,GD,N,4.51-4.47×2.98,771.13,Dataset/image\1632749.jpg
3,CUSHION,0.5,VVS2,M,EX,EX,VG,F,4.94-4.27×2.88,772.88,Dataset/image\1660174.jpg
4,CUSHION,0.59,VS2,U-V,VG,EX,GD,N,4.64-4.57×3.19,807.91,Dataset/image\1602072.jpg


In [9]:
import re

print("--- Parsing 'Messurements' Column ---")

def parse_measurements(measurement_str):
    """
    Parses a string like 'x.xx x y.yy x z.zz' into a tuple of floats (x, y, z).
    Returns (np.nan, np.nan, np.nan) if parsing fails.
    """
    if pd.isna(measurement_str) or not isinstance(measurement_str, str):
        return np.nan, np.nan, np.nan
    try:
        # Use regex to find numbers that look like floats, separated by ' x '
        # This regex looks for a sequence of digits, optionally followed by a dot and more digits
        parts = re.findall(r'(\d+\.?\d*)', measurement_str)
        if len(parts) == 3:
            return float(parts[0]), float(parts[1]), float(parts[2])
        elif len(parts) > 0 and len(parts) < 3: # Handle cases with fewer than 3 dimensions if they exist
            # For simplicity, if only one or two dimensions, fill with NaN for missing ones
            return tuple([float(p) for p in parts] + [np.nan] * (3 - len(parts)))
        else:
            return np.nan, np.nan, np.nan # No numbers found or unexpected format
    except (ValueError, TypeError):
        return np.nan, np.nan, np.nan # Catch errors during conversion

# Apply the parsing function
# df_tabular_only_strategy[['x', 'y', 'z']] = df_tabular_only_strategy['Messurements'].apply(
#     lambda s: pd.Series(parse_measurements(s))
# )
# Correction based on previous memory: All columns start with a capital letter.
# So, we should create 'X', 'Y', 'Z'
df_hybrid_cleaned[['X', 'Y', 'Z']] = df_hybrid_cleaned['Messurements'].apply(
    lambda s: pd.Series(parse_measurements(s))
)
df_hybrid_cleaned.drop(columns=['Messurements'], inplace=True)



--- Parsing 'Messurements' Column ---


In [10]:
#df_hybrid_cleaned.head()

df_hybrid_parsed = df_hybrid_cleaned.copy()

df_hybrid_parsed.head()

Unnamed: 0,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Price,image_path,X,Y,Z
0,CUSHION,0.52,VS1,S-T,EX,EX,VG,M,718.92,Dataset/image\1524552.jpg,4.55,4.38,2.97
1,CUSHION,0.5,SI1,M,EX,EX,VG,F,725.32,Dataset/image\1651023.jpg,4.86,4.23,2.87
2,CUSHION,0.5,VS2,M,VG,EX,GD,N,771.13,Dataset/image\1632749.jpg,4.51,4.47,2.98
3,CUSHION,0.5,VVS2,M,EX,EX,VG,F,772.88,Dataset/image\1660174.jpg,4.94,4.27,2.88
4,CUSHION,0.59,VS2,U-V,VG,EX,GD,N,807.91,Dataset/image\1602072.jpg,4.64,4.57,3.19


In [11]:
df_hybrid.head()

Unnamed: 0,Id,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Messurements,Price,Data Url,id_str,image_path
0,1524552,CUSHION,0.52,VS1,S-T,EX,EX,VG,M,4.55-4.38×2.97,718.92,https://capitalwholesalediamonds.com/product/0...,1524552,Dataset/image\1524552.jpg
1,1651023,CUSHION,0.5,SI1,M,EX,EX,VG,F,4.86-4.23×2.87,725.32,https://capitalwholesalediamonds.com/product/0...,1651023,Dataset/image\1651023.jpg
2,1632749,CUSHION,0.5,VS2,M,VG,EX,GD,N,4.51-4.47×2.98,771.13,https://capitalwholesalediamonds.com/product/0...,1632749,Dataset/image\1632749.jpg
3,1660174,CUSHION,0.5,VVS2,M,EX,EX,VG,F,4.94-4.27×2.88,772.88,https://capitalwholesalediamonds.com/product/0...,1660174,Dataset/image\1660174.jpg
4,1602072,CUSHION,0.59,VS2,U-V,VG,EX,GD,N,4.64-4.57×3.19,807.91,https://capitalwholesalediamonds.com/product/0...,1602072,Dataset/image\1602072.jpg


In [12]:
# Handle Colour_IsFancy and pre-clean Colour
fancy_colors = ['FANCY', 'FBG'] # Assume these exist in your data for this logic
df_hybrid_parsed['Colour_IsFancy'] = df_hybrid_parsed['Colour'].isin(fancy_colors).astype(int)
df_hybrid_parsed['Colour'] = df_hybrid_parsed['Colour'].replace('Q', 'Q-R')
df_hybrid_parsed['Colour'] = df_hybrid_parsed['Colour'].replace('O', 'O-P')

non_fancy_colours_series = df_hybrid_parsed[~df_hybrid_parsed['Colour'].isin(fancy_colors)]['Colour']
if not non_fancy_colours_series.empty:
    mode_non_fancy = non_fancy_colours_series.mode()[0]
    df_hybrid_parsed['Colour'] = df_hybrid_parsed['Colour'].replace(fancy_colors, mode_non_fancy)


In [13]:
df_hybrid_parsed['Colour'].value_counts()

Colour
F      727
E      637
D      588
I      465
G      462
J      402
H      367
K      333
L      184
M       84
N       57
U-V     15
O-P     11
Q-R     10
W-X      8
S-T      7
Y-Z      2
Name: count, dtype: int64

In [14]:
df_hybrid = df_hybrid_parsed.copy()

In [15]:
df_hybrid.head()

Unnamed: 0,Shape,Weight,Clarity,Colour,Cut,Polish,Symmetry,Fluorescence,Price,image_path,X,Y,Z,Colour_IsFancy
0,CUSHION,0.52,VS1,S-T,EX,EX,VG,M,718.92,Dataset/image\1524552.jpg,4.55,4.38,2.97,0
1,CUSHION,0.5,SI1,M,EX,EX,VG,F,725.32,Dataset/image\1651023.jpg,4.86,4.23,2.87,0
2,CUSHION,0.5,VS2,M,VG,EX,GD,N,771.13,Dataset/image\1632749.jpg,4.51,4.47,2.98,0
3,CUSHION,0.5,VVS2,M,EX,EX,VG,F,772.88,Dataset/image\1660174.jpg,4.94,4.27,2.88,0
4,CUSHION,0.59,VS2,U-V,VG,EX,GD,N,807.91,Dataset/image\1602072.jpg,4.64,4.57,3.19,0


In [16]:
print("\n--- Separating Target Variable 'Price' ---")
y = df_hybrid['Price']
X = df_hybrid.drop(columns=['Price'])

print("Shape of X (features after initial preprocessing):", X.shape)
print("Shape of y (target):", y.shape)


--- Separating Target Variable 'Price' ---
Shape of X (features after initial preprocessing): (4359, 13)
Shape of y (target): (4359,)


In [17]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Shape           4359 non-null   object 
 1   Weight          4359 non-null   float64
 2   Clarity         4359 non-null   object 
 3   Colour          4359 non-null   object 
 4   Cut             4359 non-null   object 
 5   Polish          4359 non-null   object 
 6   Symmetry        4359 non-null   object 
 7   Fluorescence    4359 non-null   object 
 8   image_path      4359 non-null   object 
 9   X               4359 non-null   float64
 10  Y               4359 non-null   float64
 11  Z               4359 non-null   float64
 12  Colour_IsFancy  4359 non-null   int64  
dtypes: float64(4), int64(1), object(8)
memory usage: 442.8+ KB


In [18]:
# --- Define Feature Lists ---
numerical_features = ['Weight', 'X', 'Y', 'Z']
ordinal_features = ['Cut', 'Polish', 'Symmetry', 'Clarity', 'Colour']
nominal_features = ['Fluorescence', 'Shape', 'Colour_IsFancy'] # Colour_IsFancy is already binary, will be handled by OneHotEncoder as 0/1

# New: The column containing image paths
image_path_feature = ['image_path']

In [24]:
# --- Custom Image Feature Extractor Transformer (Functional Version) ---
class ImageFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, target_size=(224, 224), model_name='ResNet50'):
        self.target_size = target_size
        self.model_name = model_name
        self.model = None
        self.preprocess_input_fn = None

        # --- MODIFIED PART: Check if tf is not None instead of _tf_available ---
        if tf is not None:
            print(f"ImageFeatureExtractor: Initializing with {model_name}.")
            if model_name == 'ResNet50':
                base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
                self.model = Model(inputs=base_model.input, outputs=base_model.output)
                self.preprocess_input_fn = resnet_preprocess_input
                print(f"ImageFeatureExtractor: Loaded {model_name} for feature extraction.")
            else:
                raise ValueError(f"Model '{model_name}' not supported or not implemented yet.")
        else:
            print("ImageFeatureExtractor: TensorFlow not available. This extractor will return dummy features.")

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        image_paths = X.iloc[:, 0].tolist()

        features = []
        # --- MODIFIED PART: Check self.model and self.preprocess_input_fn directly ---
        if self.model and self.preprocess_input_fn: # Only run if TensorFlow model was loaded successfully in __init__
            for path in image_paths:
                try:
                    # --- YOU NEED TO ENSURE THESE LINES ARE CORRECT FOR YOUR IMAGE PATHS ---
                    img = image.load_img(path, target_size=self.target_size)
                    img_array = image.img_to_array(img)
                    img_array = np.expand_dims(img_array, axis=0)
                    img_array = self.preprocess_input_fn(img_array) # Use the stored preprocessing function

                    cnn_features = self.model.predict(img_array, verbose=0)[0]
                    features.append(cnn_features)

                except Exception as e:
                    print(f"Error processing image {path}: {e}. Returning zeros for this image.")
                    features.append(np.zeros(2048))
        else:
            print("ImageFeatureExtractor: Returning dummy features (model not loaded).")
            for _ in image_paths:
                features.append(np.zeros(2048))

        return np.array(features)

    def get_feature_names_out(self, input_features=None):
        output_feature_dim = 2048
        if self.model and hasattr(self.model, 'output_shape') and len(self.model.output_shape) > 1:
             output_feature_dim = self.model.output_shape[-1]
        return [f"img_feature_{i}" for i in range(output_feature_dim)]
# --- Define Preprocessing Transformers ---

# 3.1 Numerical Transformer: Impute with median, then Scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# 3.2 Ordinal Categorical Transformer: Impute with most frequent, then Ordinal Encode
# These orders must match the EXACT string categories in your DataFrame
quality_order_common = ['F', 'GD', 'VG', 'EX']
clarity_order = ['I2', 'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
colour_order = [
    'Y-Z', 'W-X', 'U-V', 'S-T', 'Q-R', 'O-P',
    'N', 'M', 'L', 'K', 'J', 'I', 'H', 'G', 'F', 'E', 'D'
]

ordinal_encoder_categories = [
    quality_order_common, # For 'Cut'
    quality_order_common, # For 'Polish'
    quality_order_common, # For 'Symmetry'
    clarity_order,        # For 'Clarity'
    colour_order          # For 'Colour'
]

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_encoder_categories,
                                       handle_unknown='use_encoded_value',
                                       unknown_value=-1))
])

# 3.3 Nominal Categorical Transformer: Impute with most frequent, then One-Hot Encode
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 3.4 Image Transformer
image_transformer = Pipeline(steps=[
    # Note: ImageFeatureExtractor doesn't need an imputer because it handles missing/errors internally
    ('image_extractor', ImageFeatureExtractor())
])

ImageFeatureExtractor: Initializing with ResNet50.
ImageFeatureExtractor: Loaded ResNet50 for feature extraction.


In [25]:
# --- Create the Multi-Modal ColumnTransformer ---
print("\n--- Creating Multi-Modal ColumnTransformer ---")
multi_modal_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features),
        ('img', image_transformer, image_path_feature) # New: Process the image path feature
    ],
    remainder='drop' # Explicitly drop any columns not specified
)


--- Creating Multi-Modal ColumnTransformer ---


In [26]:
# --- Create the Full Multi-Modal Preprocessing Pipeline ---
full_multi_modal_pipeline = Pipeline(steps=[('preprocessor', multi_modal_preprocessor)])

print("\n--- Full Multi-Modal Preprocessing Pipeline Created ---")
print(full_multi_modal_pipeline)



--- Full Multi-Modal Preprocessing Pipeline Created ---
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Weight', 'X', 'Y', 'Z']),
                                                 ('ord',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinal_encoder',
                                                                   OrdinalEncoder(categories=[['F',


In [27]:
# --- Fit and Transform Data (Conceptual for Image part) ---
print("\n--- Fitting and Transforming Data with Multi-Modal Pipeline ---")

# Fit the pipeline on your features (X) and then transform them
df_transformed_array = full_multi_modal_pipeline.fit_transform(X)


# --- Get Transformed Column Names and Create Final DataFrame ---
print("\n--- Generating Transformed Multi-Modal DataFrame ---")

transformed_column_names = []

# Numerical features (scaled)
transformed_column_names.extend(numerical_features)

# Ordinal features (encoded)
transformed_column_names.extend(ordinal_features)

# Nominal features (one-hot encoded)
onehot_feature_names = full_multi_modal_pipeline.named_steps['preprocessor'].named_transformers_['nom']['onehot_encoder'].get_feature_names_out(nominal_features)
transformed_column_names.extend(onehot_feature_names)

# Image features
image_feature_names = full_multi_modal_pipeline.named_steps['preprocessor'].named_transformers_['img']['image_extractor'].get_feature_names_out(image_path_feature)
transformed_column_names.extend(image_feature_names)


--- Fitting and Transforming Data with Multi-Modal Pipeline ---
ImageFeatureExtractor: Initializing with ResNet50.
ImageFeatureExtractor: Loaded ResNet50 for feature extraction.

--- Generating Transformed Multi-Modal DataFrame ---


In [28]:
# Create a new DataFrame from the transformed array
df_preprocessed_X_multi_modal = pd.DataFrame(df_transformed_array, columns=transformed_column_names, index=X.index)

print("Shape of original features (X):", X.shape)
print("Shape of transformed multi-modal features (df_preprocessed_X_multi_modal):", df_preprocessed_X_multi_modal.shape)
print("\nSample of transformed multi-modal features DataFrame:")
print(df_preprocessed_X_multi_modal.head())


Shape of original features (X): (4359, 13)
Shape of transformed multi-modal features (df_preprocessed_X_multi_modal): (4359, 2074)

Sample of transformed multi-modal features DataFrame:
     Weight         X         Y         Z  Cut  Polish  Symmetry  Clarity  \
0  0.080126 -0.653726 -0.274237  0.277462  3.0     3.0       2.0      5.0   
1  0.003313 -0.413845 -0.479759  0.051294  3.0     3.0       2.0      3.0   
2  0.003313 -0.684679 -0.150924  0.300079  2.0     3.0       1.0      4.0   
3  0.003313 -0.351940 -0.424953  0.073911  3.0     3.0       2.0      6.0   
4  0.348972 -0.584083 -0.013909  0.775033  2.0     3.0       1.0      4.0   

   Colour  Fluorescence_F  ...  img_feature_2038  img_feature_2039  \
0     3.0             0.0  ...          0.326659          1.504248   
1     7.0             1.0  ...          0.034146          0.140811   
2     7.0             0.0  ...          0.640757          0.074334   
3     7.0             1.0  ...          0.014959          0.215275   
4

In [30]:
# --- Save the Pipeline and Processed Data ---
print("\n--- Saving Multi-Modal Pipeline and Processed Data ---")

pipeline_filename_multi_modal = 'full_multi_modal_preprocessing_pipeline.joblib'
joblib.dump(full_multi_modal_pipeline, pipeline_filename_multi_modal)
print(f"Multi-modal preprocessing pipeline saved to: {pipeline_filename_multi_modal}")

processed_X_filename_multi_modal = 'processed_diamond_features_X_multi_modal.csv'
df_preprocessed_X_multi_modal.to_csv(processed_X_filename_multi_modal, index=False)
print(f"Processed multi-modal features (X) saved to: {processed_X_filename_multi_modal}")

# Target variable (y) was already saved in previous steps, no need to resave unless changed.
y_filename = 'diamond_target_y_multi_model.csv'
y.to_csv(y_filename, index=False)
print(f"Target variable (y) saved to: {y_filename}")

print("\nFull multi-modal preprocessing pipeline setup complete (conceptual image part).")
print("REMINDER: The 'ImageFeatureExtractor' requires a proper local Python environment with TensorFlow/Keras and Pillow to load and process actual images.")


--- Saving Multi-Modal Pipeline and Processed Data ---
Multi-modal preprocessing pipeline saved to: full_multi_modal_preprocessing_pipeline.joblib
Processed multi-modal features (X) saved to: processed_diamond_features_X_multi_modal.csv
Target variable (y) saved to: diamond_target_y_multi_model.csv

Full multi-modal preprocessing pipeline setup complete (conceptual image part).
REMINDER: The 'ImageFeatureExtractor' requires a proper local Python environment with TensorFlow/Keras and Pillow to load and process actual images.


In [1]:
import sklearn
print("Scikit-learn version:", sklearn.__version__)

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import numpy
print("NumPy version:", numpy.__version__)

import pandas
print("Pandas version:", pandas.__version__)

import xgboost
print("XGBoost version:", xgboost.__version__)

Scikit-learn version: 1.6.1




TensorFlow version: 2.19.0
NumPy version: 2.2.5
Pandas version: 2.2.3
XGBoost version: 3.0.2
