In [89]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import joblib, warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so this also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")


In [90]:
# Column roles used by the preprocessing and modelling cells.
target_col, id_col = "price", "id"
numeric_cols = ["carat", "depth", "table", "x", "y", "z"]
categorical_cols = ["cut", "color", "clarity"]

# Read the training and test CSVs from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/linear-regression-apu/diamonds_train.csv")
df_test = pd.read_csv("/kaggle/input/linear-regression-apu/diamonds_test.csv")

# Preview the first rows of each frame (train first, then test).
for frame in (df, df_test):
    print(frame.head())



   carat        cut color clarity  depth  table  price     x     y     z
0   0.51       Good     D     SI2   63.9   55.0   1180  5.04  5.10  3.24
1   0.72      Ideal     E     VS2   60.8   57.0   3091  5.79  5.82  3.53
2   0.70  Very Good     D    VVS2   62.8   60.0   4022  5.65  5.69  3.56
3   0.36      Ideal     D     SI1   61.2   57.0    663  4.59  4.63  2.82
4   0.54  Very Good     D     SI1   60.0   59.8   1593  5.30  5.34  3.18
   id  carat        cut color clarity  depth  table     x     y     z
0   0   1.02       Good     F     SI2   59.2   58.0  6.51  6.56  3.87
1   1   0.70  Very Good     I    VVS1   59.5   58.0  5.78  5.81  3.45
2   2   0.32  Very Good     H    VVS2   63.4   56.0  4.37  4.34  2.76
3   3   0.42      Ideal     F    VVS2   62.2   56.0  4.79  4.82  2.99
4   4   0.40      Ideal     F     VS2   62.3   54.0  4.74  4.77  2.96


In [91]:
# One LabelEncoder per categorical column, kept in a dict so the same
# category -> integer mapping can be reused (and saved) later.
label_encoders = {}
for feature in categorical_cols:
    encoded_name = feature + "_encoded"
    # Learn the mapping from the training column only ...
    encoder = LabelEncoder().fit(df[feature])
    # ... then apply that identical mapping to both frames.
    df[encoded_name] = encoder.transform(df[feature])
    df_test[encoded_name] = encoder.transform(df_test[feature])
    label_encoders[feature] = encoder


In [92]:


# Quadratic carat term so the linear model can capture curvature in carat.
df["carat_sq"] = df["carat"] ** 2
df_test["carat_sq"] = df_test["carat"] ** 2

# Values <= 0 in x/y/z are treated as invalid and replaced by a median.
# The median is taken from the *valid training* values only and reused for the
# test set, so (a) the invalid entries do not influence the fill value and
# (b) test rows are imputed with training statistics, matching how the
# encoders and scalers in this notebook are fitted on train and applied to test.
for col in ["x", "y", "z"]:
    # Compute the fill value before any row is overwritten.
    fill_value = df.loc[df[col] > 0, col].median()
    df.loc[df[col] <= 0, col] = fill_value
    df_test.loc[df_test[col] <= 0, col] = fill_value

# Product of the three (cleaned) dimensions.
df["volume_xyz"] = df["x"] * df["y"] * df["z"]
df_test["volume_xyz"] = df_test["x"] * df_test["y"] * df_test["z"]

# Final feature list: raw numerics, encoded categoricals, engineered columns.
feature_columns = numeric_cols + [c + "_encoded" for c in categorical_cols] + ["carat_sq", "volume_xyz"]

# Copies, so later steps cannot modify df / df_test through these frames.
X = df[feature_columns].copy()
y = df[target_col].astype(float).copy()
X_test = df_test[feature_columns].copy()



In [93]:
# Hold out part of the training data for validation. The grid search only
# sees X_train / y_train; X_val / y_val are used for the final check below.
# NOTE(review): the split behind the recorded outputs is not present in this
# notebook; test_size=0.2 is an assumption - confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build pipeline: scaling + Ridge regression.
# Because the scalers are pipeline steps, GridSearchCV refits them on the
# training part of every fold.
pipe = Pipeline([
    ("minmax", MinMaxScaler()),
    ("standard", StandardScaler()),
    ("ridge", Ridge(random_state=42))
])

# Small grid of alpha values to try
param_grid = {"ridge__alpha": [5.0, 7.0, 10.0, 12.0, 15.0, 20.0]}

# 5-fold cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",  # RMSE (negative because higher = better)
    cv=cv,
    n_jobs=-1,
    refit=True  # refit the best pipeline on all of X_train so gcv.predict works
)

# Train on training set
gcv.fit(X_train, y_train)

print("Best alpha:", gcv.best_params_["ridge__alpha"])
print("CV RMSE:", round(-gcv.best_score_, 2))

# Evaluate on validation set
val_pred = gcv.predict(X_val)
print("Validation RMSE:", round(np.sqrt(mean_squared_error(y_val, val_pred)), 2))
print("Validation R²:", round(r2_score(y_val, val_pred), 4))



Best alpha: 20.0
CV RMSE: 1366.31
Validation RMSE: 1362.57
Validation R²: 0.8851


In [94]:

# Refit the preprocessing on the full training set. The scalers are created
# here so the cell does not depend on objects left over from an earlier
# session (they are not defined anywhere else in the notebook).
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()
X_norm_all   = minmax_scaler.fit_transform(X)
X_scaled_all = standard_scaler.fit_transform(X_norm_all)

# NOTE(review): alpha is fixed at 10.0, while the recorded grid-search output
# in this notebook reports a best alpha of 20.0 - confirm which is intended.
final_model = Ridge(alpha=10.0, random_state=42)
final_model.fit(X_scaled_all, y)

# Save the model with every fitted preprocessing object needed to reproduce
# its inputs (encoders -> min-max -> standard scaling).
joblib.dump(final_model, "final_model.pkl")
joblib.dump(minmax_scaler, "minmax_scaler.pkl")
joblib.dump(standard_scaler, "standard_scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

# Apply the already-fitted scalers to the test features (transform only, no refit).
X_test_norm   = minmax_scaler.transform(X_test)
X_test_scaled = standard_scaler.transform(X_test_norm)

test_pred = final_model.predict(X_test_scaled)

# Submission file: one predicted price per test id.
submission = pd.DataFrame({
    "id": df_test[id_col],
    "price": test_pred
})
submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,id,price
0,0,5751.932664
1,1,2909.7552
2,2,781.178161
3,3,1812.322785
4,4,1264.041809
