In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df=pd.read_csv('diamonds.csv')


In [3]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [5]:
df['depth']=df['depth'].clip(50,70)
df['table']=df['table'].clip(50,70)

In [6]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [7]:
df.shape

(53940, 10)

In [8]:
df.duplicated().sum()

np.int64(146)

In [9]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

In [10]:
df['price'].skew()

np.float64(1.618239898265432)

In [11]:
import numpy as np
df['price_log']=np.log1p(df['price'])

Outliers in target variable → transform, not remove
Log transform is mainly for strong right skew
df[col].skew()>1

“I detected that the price variable was right-skewed with extreme high values.
Since those values are valid business cases, I didn’t remove them.
Instead, I applied a log transformation to stabilize variance and improve model performance

x=y=z=0 is an imposiible case so we remove the rows where x=y=z=0

In [12]:
df=df[(df['x']>0) & (df['x']>0) & (df['x']>0)]

In [13]:
df.shape# the reduced from 53940 to 53787

(53787, 11)

In [14]:
y=df['price_log']
x=df.drop(columns=['price','price_log'])

In [15]:
numeric=x.select_dtypes(include='number').columns.tolist()
categoric=x.select_dtypes(include='object').columns.tolist()

In [16]:
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
preprocessor=ColumnTransformer(transformers=[("cat",OrdinalEncoder(),categoric),("num",StandardScaler(),numeric)])

In [17]:
#pipeline
model_pipeline=Pipeline(steps=[('preprocessing',preprocessor),('model',KNeighborsRegressor())])

In [18]:
model_pipeline

In [19]:
# Train-test split
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42)

In [20]:
model_pipeline.fit(x_train,y_train)

In [21]:
y_pred=model_pipeline.predict(x_test)
print(y_test.shape)
print(y_pred.shape)

(13447,)
(13447,)


In [22]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred)
print(score)

0.9798862583857769


In [23]:
# save model
joblib.dump(model_pipeline,"model.pkl")

['model.pkl']

In [24]:
import sklearn
sklearn.__version__

'1.6.1'