
# Diamond price prediction

This notebook contains the solution to the Kaggle competition, made by:
Novosad Ivan 232,
Suvorova Aleksandra 232,
Rodioniva Anna 232


In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one; several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"

import os
import sys
import math
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

from typing import List, Tuple
from dataclasses import dataclass

from sklearn.base            import BaseEstimator, TransformerMixin, clone
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import OneHotEncoder, StandardScaler
from sklearn.linear_model    import Ridge, SGDRegressor, RidgeCV
from sklearn.metrics         import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, GridSearchCV

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings("ignore")
# Single seed reused by the RNG calls below and by KFold / Ridge later on.
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Display options for pandas / numpy output in this notebook.
# NOTE(review): 'display.max_colwidth' is set to 1 — confirm this is intended.
pd.set_option('display.max_rows', 15, 'display.max_columns', 500, 'display.max_colwidth', 1, 'display.precision', 2)
np.set_printoptions(linewidth=10000, precision=4, edgeitems=20, suppress=True)
# Default figure size as [width, height].
plt.rcParams['figure.figsize'] = [16, 6]


In [2]:
# Colab-only: mount Google Drive; the next cell copies the Kaggle API token
# from /content/drive/MyDrive/kaggle/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Copy the Kaggle API token from the mounted Drive into ~/.kaggle/.
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/

# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
# Make this competition the CLI default, so the commands below need no -c flag.
!kaggle config set -n competition -v your-first-kaggle-competition-diamonds

# Download the competition files; stdout is appended to ./log.
!kaggle competitions download >> log

# Extract every archive in the working directory, overwriting existing files
# (-o); stdout is appended to ./log.
!unzip -o *.zip >> log

# Print the current leaderboard.
!kaggle competitions leaderboard --show

- competition is now set to: your-first-kaggle-competition-diamonds
Using competition: your-first-kaggle-competition-diamonds
  teamId  teamName                submissionDate              score       
--------  ----------------------  --------------------------  ----------  
14353838  Humarin                 2025-09-11 00:02:35.840000  663.49600   
14354286  backend team            2025-09-10 19:21:20.500000  664.20039   
14353833  Lapochki(2+Sasha)       2025-09-12 18:50:59.986000  668.71053   
14354273  velilyna                2025-09-10 20:15:10.080000  729.44949   
14353443  polinalesovina          2025-09-11 08:04:44.483000  761.94252   
14353734  t.kvlnko                2025-09-10 19:18:47.233000  982.84376   
14302985  Baseline.csv            2025-08-26 12:49:30.486000  1070.14434  
14353583  Saraa ali               2025-09-08 13:06:58.200000  1070.14434  
14353837  Александр Полищук       2025-09-10 18:34:28.070000  1070.14434  
14353506  Majid Sohrabi          

In [5]:
# Load the single competition file. Labelled and unlabelled rows share it:
# 'price' is missing for the rows that have to be predicted.
df = pd.read_csv("XY_diamonds.csv")
df.info()
print()  # blank line between the displayed outputs
df.head()
print()  # blank line between the displayed outputs
# Distinct levels of the three categorical columns.
df['cut'].unique()
df['color'].unique()
df['clarity'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   carat    200000 non-null  float64
 1   depth    200000 non-null  float64
 2   table    200000 non-null  float64
 3   x        200000 non-null  float64
 4   y        200000 non-null  float64
 5   z        200000 non-null  float64
 6   cut      200000 non-null  object 
 7   color    200000 non-null  object 
 8   clarity  200000 non-null  object 
 9   price    160000 non-null  float64
dtypes: float64(7), object(3)
memory usage: 15.3+ MB



Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity,price
0,0.35,67.2,57.1,4.64,4.69,2.87,I,G,VS1,
1,1.64,67.3,60.7,7.84,7.82,4.94,V,E,SI1,
2,0.25,67.3,49.5,4.0,4.04,2.49,I,F,VVS1,
3,0.28,67.9,60.3,4.3,4.26,2.66,P,E,VS2,
4,0.34,67.5,69.4,5.44,5.5,3.32,P,E,VS2,





array(['I', 'V', 'P', 'F', 'G'], dtype=object)

array(['G', 'E', 'F', 'H', 'I', 'J', 'D'], dtype=object)

array(['VS1', 'SI1', 'VVS1', 'VS2', 'SI2', 'VVS2', 'IF', 'I1'], dtype=object)

In [6]:
# Rows with a known price are the training set; the remaining rows are the
# ones to predict.
has_price = df["price"].notna()
print("Labeled rows:", has_price.sum(), "/", len(df))

# Independent copies, so later column edits never write back into `df`.
train = df[has_price].copy()
test = df[~has_price].copy()

Labeled rows: 160000 / 200000


## Feature engineering

In [7]:
# Ordinal encodings for the categorical columns. Each list is written from
# the lowest score to the highest, and a level's score is its 1-based
# position in the list.
# NOTE(review): presumably ordered worst-to-best grade — confirm against the
# competition's data description.
cut_map = {level: rank for rank, level in enumerate(
    ['F', 'G', 'V', 'P', 'I'], start=1)}
color_map = {level: rank for rank, level in enumerate(
    ['J', 'I', 'H', 'G', 'F', 'E', 'D'], start=1)}
clarity_map = {level: rank for rank, level in enumerate(
    ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'], start=1)}

def safe_div(a, b):
    """Element-wise ``a / b`` that yields NaN where ``b`` is zero or NaN.

    Both inputs are converted to float64 arrays and broadcast against each
    other. The division is evaluated only at positions with a usable
    denominator, so no division by zero is ever performed (and no
    divide-by-zero warning or floating-point error is triggered).

    Parameters
    ----------
    a, b : array-like or scalar
        Numerator and denominator.

    Returns
    -------
    numpy.ndarray
        float64 array of the broadcast shape; NaN wherever ``b`` is 0 or NaN.
    """
    a_arr = np.asarray(a, dtype='float64')
    b_arr = np.asarray(b, dtype='float64')
    cond = (b_arr != 0) & (~np.isnan(b_arr))
    # Start from all-NaN and fill in only the positions that are safe to divide.
    out = np.full(np.broadcast(a_arr, b_arr).shape, np.nan, dtype='float64')
    np.divide(a_arr, b_arr, out=out, where=cond)
    return out

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered feature columns added.

    The input frame is not modified. ``df`` must contain ``carat``, ``depth``,
    ``table``, ``x``, ``y`` and ``z``; ``cut``, ``color`` and ``clarity`` are
    optional and only produce their score / interaction columns when present.

    Steps, in order:

    * coerce the six raw numeric columns to float (unparsable values -> NaN);
    * flag zero dimensions in ``<dim>_is_zero`` and replace the zero by NaN;
    * geometric features (volume, area, sums, ratios);
    * deviations from the constants 61.5 (depth %) and 57.0 (table);
    * polynomial / log terms of carat and squared deviations;
    * ordinal scores of the categoricals and their product with carat;
    * combined categorical keys ``cut_color`` and ``cut_clarity``;
    * ``log1p_*`` copies of the main size features.

    ``volume`` and ``size_sum`` are NaN whenever any of the three dimensions
    is missing: a product or total over only the available dimensions is not
    comparable with the complete rows (and an all-missing product would be
    the constant 1.0).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw diamonds data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional columns.
    """
    X = df.copy()
    dims = ['x', 'y', 'z']

    for c in ['carat', 'depth', 'table', 'x', 'y', 'z']:
        if c in X.columns:
            # Cast to float so NaN can be written into the column below even
            # when the raw values happened to parse as integers.
            X[c] = pd.to_numeric(X[c], errors='coerce').astype('float64')

    for c in dims:
        if c in X.columns:
            zero_mask = X[c] == 0
            X[f'{c}_is_zero'] = zero_mask.astype(int)
            # A zero dimension is treated as a missing measurement.
            X.loc[zero_mask, c] = np.nan

    # min_count: require all three dimensions, otherwise the result is NaN.
    X['volume'] = X[dims].prod(axis=1, skipna=True, min_count=len(dims))
    X['area_xy'] = X['x'] * X['y']
    X['size_sum'] = X[dims].sum(axis=1, skipna=True, min_count=len(dims))
    # The mean deliberately stays an average over the available dimensions.
    X['size_mean'] = X[dims].mean(axis=1, skipna=True)

    X['depth_pct'] = 100.0 * safe_div(X['z'], (X['x'] + X['y']) / 2.0)
    X['xy_ratio'] = safe_div(X['x'], X['y'])
    X['z_to_xy'] = safe_div(X['z'], np.sqrt(X['x'] * X['y']))
    X['carat_per_vol'] = safe_div(X['carat'], X['volume'])

    X['deviation_depth_pct_from_ideal'] = (X['depth_pct'] - 61.5).abs()
    X['deviation_table_from_ideal'] = (X['table'] - 57.0).abs()

    X['carat_sq'] = X['carat'] ** 2
    X['log_carat'] = np.log1p(X['carat'])
    X['depth_dev_sq'] = (X['depth_pct'] - 61.5) ** 2
    X['table_dev_sq'] = (X['table'] - 57.0) ** 2

    # Levels that are absent from a map become NaN scores.
    if 'cut' in X.columns:
        X['cut_score'] = X['cut'].map(cut_map).astype(float)
        X['carat_x_cut'] = X['carat'] * X['cut_score']
    if 'color' in X.columns:
        X['color_score'] = X['color'].map(color_map).astype(float)
        X['carat_x_color'] = X['carat'] * X['color_score']
    if 'clarity' in X.columns:
        X['clarity_score'] = X['clarity'].map(clarity_map).astype(float)
        X['carat_x_clarity'] = X['carat'] * X['clarity_score']

    if all(c in X.columns for c in ['cut', 'color']):
        X['cut_color'] = X['cut'].astype(str) + "_" + X['color'].astype(str)
    if all(c in X.columns for c in ['cut', 'clarity']):
        X['cut_clarity'] = X['cut'].astype(str) + "_" + X['clarity'].astype(str)

    # NOTE(review): 'log1p_carat' duplicates 'log_carat' above; kept so the
    # set of output columns is unchanged.
    for c in ['carat', 'x', 'y', 'z', 'volume', 'area_xy', 'size_sum', 'size_mean',
              'deviation_depth_pct_from_ideal', 'deviation_table_from_ideal']:
        X[f'log1p_{c}'] = np.log1p(X[c])

    return X

In [8]:
def compute_clip_bounds(df: pd.DataFrame, numeric_cols, low=0.005, high=0.995):
    """Compute per-column (lower, upper) clipping limits from quantiles.

    Columns listed in ``numeric_cols`` but absent from ``df`` are skipped.
    When a quantile is not finite, the column minimum (for the lower limit)
    or maximum (for the upper limit) is used instead.

    Returns a dict mapping column name -> ``(lower, upper)``.
    """
    limits = {}
    for name in numeric_cols:
        if name not in df.columns:
            continue
        series = df[name]
        lower = series.quantile(low)
        upper = series.quantile(high)
        lower = lower if np.isfinite(lower) else series.min()
        upper = upper if np.isfinite(upper) else series.max()
        limits[name] = (lower, upper)
    return limits

def apply_clip_bounds(df: pd.DataFrame, bounds: dict):
    """Return a copy of ``df`` with each bounded column clipped to its limits.

    ``bounds`` maps column name -> ``(lower, upper)``. Entries for columns
    that ``df`` does not have are ignored; the input frame is left untouched.
    """
    clipped = df.copy()
    for name, (lower, upper) in bounds.items():
        if name not in clipped.columns:
            continue
        clipped[name] = clipped[name].clip(lower, upper)
    return clipped

In [9]:
# Build the engineered feature frames for the labelled and unlabelled rows.
train_fe = feature_engineering(train)
test_fe  = feature_engineering(test)

# Categorical columns are one-hot encoded; every other column except the
# target is treated as numeric.
cat_cols = [c for c in ['cut','color','clarity','cut_color','cut_clarity'] if c in train_fe.columns]
num_cols = [c for c in train_fe.columns if c not in cat_cols + ['price']]

# Winsorise the continuous features only. The binary '*_is_zero' flags are
# left out: when fewer than 0.5% of rows are flagged, both quantiles are 0
# and clipping would overwrite every flag with 0, erasing the feature.
clip_cols = [c for c in num_cols if not c.endswith('_is_zero')]

# Limits come from the training rows and are applied unchanged to the test rows.
bounds = compute_clip_bounds(train_fe, numeric_cols=clip_cols, low=0.005, high=0.995)
train_fe[clip_cols] = apply_clip_bounds(train_fe[clip_cols], bounds)
test_fe[clip_cols]  = apply_clip_bounds(test_fe[clip_cols], bounds)

# Numeric: median imputation + standardisation.
# Categorical: most-frequent imputation + one-hot encoding; levels unseen
# during fit are encoded as all zeros (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder(handle_unknown='ignore'))
        ]), cat_cols),
    ]
)

In [10]:
def mae(y_true, y_pred):
    """Mean absolute error between ``y_true`` and ``y_pred``."""
    score = mean_absolute_error(y_true, y_pred)
    return score

def get_model(alpha=0.001):
    """Create the unfitted Ridge regressor used throughout the notebook.

    Parameters
    ----------
    alpha : float, default 0.001
        Ridge regularisation strength. The default reproduces the model
        used for the submitted solution.

    Returns
    -------
    sklearn.linear_model.Ridge
        Estimator seeded with ``RANDOM_STATE``.
    """
    return Ridge(alpha=alpha, random_state=RANDOM_STATE)

def cross_validated_oof_predictions(X, y, preprocessor, model, n_splits=5):
    """Run shuffled K-fold CV and collect out-of-fold predictions.

    For every fold a fresh pipeline (preprocessor + model) is fitted on
    ``log1p`` of the training target; validation predictions are mapped back
    with ``expm1`` and scored with MAE on the original scale.

    Both ``preprocessor`` and ``model`` are cloned per fold. Pipeline keeps
    the step objects it is given, so without cloning every fold pipeline
    would share one preprocessor that is refitted by each later ``fit`` call,
    leaving the returned fold models with the wrong fitted state.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Target on the original scale, aligned with ``X``.
    preprocessor : estimator
        Unfitted transformer used as the first pipeline step.
    model : estimator
        Unfitted regressor used as the second pipeline step.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold predictions on the original scale, one per row of ``y``.
    models : list
        The fitted pipeline of each fold, in fold order.
    mae_scores : list of float
        Validation MAE of each fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros(len(y), dtype=float)
    models = []
    mae_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
        X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

        # Independent copies so each fold keeps its own fitted state.
        pipe = Pipeline(steps=[
            ('prep', clone(preprocessor)),
            ('model', clone(model))
        ])

        # Train in log space; predictions are inverted before scoring.
        y_tr_log = np.log1p(y_tr.values)
        pipe.fit(X_tr, y_tr_log)

        pred_log = pipe.predict(X_va)
        pred = np.expm1(pred_log)
        oof[val_idx] = pred

        score = mae(y_va.values, pred)
        mae_scores.append(score)
        models.append(pipe)
        print(f"Fold {fold}: MAE={score:.4f}")

    print(f"CV MAE mean={np.mean(mae_scores):.4f}  std={np.std(mae_scores):.4f}")
    return oof, models, mae_scores

In [11]:
# Target on the original price scale, and the feature frame without it.
y = train_fe['price']
X = train_fe.drop(columns=['price'])

model = get_model()

oof, models, mae_scores = cross_validated_oof_predictions(X, y, preprocessor, model, n_splits=5)

# Mean and standard deviation of the per-fold MAE values.
print("\nOOF MAE:")
print(f"  Ridge: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")

# MAE over all out-of-fold predictions pooled together.
oof_mae = mae(y.values, oof)
print(f"\nFinal OOF MAE (Ridge, alpha=0.001): {oof_mae:.4f}")

Fold 1: MAE=668.6544
Fold 2: MAE=669.0861
Fold 3: MAE=666.7714
Fold 4: MAE=666.2127
Fold 5: MAE=674.6066
CV MAE mean=669.0662  std=2.9757

OOF MAE:
  Ridge: 669.0662 ± 2.9757

Final OOF MAE (Ridge, alpha=0.001): 669.0662


In [12]:
# Final pipeline, refitted on all labelled rows. The preprocessor is cloned
# so that fitting here does not refit an object that other pipelines (for
# example the stored fold models) may also hold.
final_model = Pipeline(steps=[
    ('prep', clone(preprocessor)),
    ('model', clone(model))
])

# Same target transform as in cross-validation: train on log1p(price).
y_log = np.log1p(y.values)
final_model.fit(X, y_log)

print("Final Ridge model (alpha=0.001) refit on full training data.")

Final Ridge model (alpha=0.001) refit on full training data.


In [13]:
# The model predicts log1p(price): invert with expm1 and floor the result
# at zero.
pred_log = final_model.predict(test_fe)
test_pred = np.maximum(np.expm1(pred_log), 0.0)

print("Test predictions complete. Shape:", test_pred.shape)

Test predictions complete. Shape: (40000,)


In [14]:
# Use the data's own 'id' column when there is one; otherwise number the
# test rows 1..N in their current order.
# NOTE(review): the fallback assumes the competition expects sequential
# 1-based ids in this row order — confirm against the sample submission.
id_col = 'id' if 'id' in test.columns else None
if id_col:
    ids = test[id_col].astype(int).values
else:
    ids = np.arange(1, len(test) + 1, dtype=int)

submission = pd.DataFrame({
    'id': ids,
    'price': test_pred
})

# Floor predicted prices at 10 and round them to 2 decimals.
submission['price'] = submission['price'].clip(lower=10).round(2)

submission_file = "submission.csv"
submission.to_csv(submission_file, index=False)

# NOTE(review): this uploads a submission every time the cell runs,
# including on "Run All" — confirm that is intended.
!kaggle competitions submit \
    -c your-first-kaggle-competition-diamonds \
    -f {submission_file} \
    -m "Submission"

display(submission.head())

100% 527k/527k [00:00<00:00, 1.50MB/s]
Successfully submitted to Your first Kaggle Competition - 💎Diamonds

Unnamed: 0,id,price
0,1,796.73
1,2,15032.73
2,3,875.02
3,4,598.47
4,5,890.78
