In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn .compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [3]:
train = pd.read_csv('diamonds_train.csv')
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
test = pd.read_csv('diamonds_test.csv')
test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.73,Ideal,G,VVS2,61.9,55.0,5.83,5.77,3.59
1,0.61,Premium,F,VVS2,59.7,58.0,5.56,5.53,3.31
2,1.55,Premium,I,VS1,58.2,60.0,7.69,7.59,4.45
3,0.46,Good,F,IF,56.2,61.0,5.16,5.24,2.92
4,1.1,Very Good,F,VS2,60.6,58.0,6.67,6.77,4.07


In [5]:
x_train = train.drop(['price'], axis=1)
y_train = train['price']

x_test = test

In [6]:
categorical_cols = ['cut', 'color', 'clarity']
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [8]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
x_train_cat = encoder.fit_transform(x_train[categorical_cols])
x_test_cat = encoder.transform(x_test[categorical_cols])

In [9]:
scaler = StandardScaler()
x_train_num = scaler.fit_transform(x_train[numerical_cols])
x_test_num = scaler.transform(x_test[numerical_cols])

In [10]:
x_train_final = np.hstack((x_train_num, x_train_cat))
x_test_final = np.hstack((x_test_num, x_test_cat))

In [11]:
model = LinearRegression()
model.fit(x_train_final, y_train)

In [12]:
y_pred = model.predict(x_test_final)
y_pred = np.maximum(y_pred, 0)
r2 = r2_score(y_test, y_pred)
print(f"r2_score: {r2:.4f}")

r2_score: 0.9300


In [13]:
submission = pd.DataFrame({'PredictedPrice': y_pred})
submission.to_csv('submission.csv', index=False)
print(submission.head())

   PredictedPrice
0        0.000000
1     5107.907481
2     2915.357466
3     6122.633842
4        0.000000
