In [22]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from math import sqrt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge



# Data Loading and Initial Inspection.

In [23]:
# Load the data 
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")


In [24]:

# displaying the number of rows and number of columns 
print(train_data.shape)
print(test_data.shape)


(43152, 11)
(10788, 10)


In [25]:
# displaying statistical information 
train_data.describe()


Unnamed: 0,Id,carat,depth,table,price,x,y,z
count,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0,43152.0
mean,21576.5,0.797855,61.747177,57.458347,3929.491912,5.731568,5.735018,3.538568
std,12457.053745,0.473594,1.435454,2.233904,3985.527795,1.121279,1.148809,0.708238
min,1.0,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,10788.75,0.4,61.0,56.0,947.75,4.71,4.72,2.91
50%,21576.5,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,32364.25,1.04,62.5,59.0,5312.0,6.54,6.54,4.04
max,43152.0,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [26]:
#checking for null values
train_data.isnull().sum()

Id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [None]:
# checking for duplicated rows
train_data.duplicated().sum()

In [27]:
# Displaying the data types of each column  
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43152 entries, 0 to 43151
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id       43152 non-null  int64  
 1   carat    43152 non-null  float64
 2   cut      43152 non-null  object 
 3   color    43152 non-null  object 
 4   clarity  43152 non-null  object 
 5   depth    43152 non-null  float64
 6   table    43152 non-null  float64
 7   price    43152 non-null  int64  
 8   x        43152 non-null  float64
 9   y        43152 non-null  float64
 10  z        43152 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 3.6+ MB


## Data Preprocessing

In [28]:
# x , y , z are the length , width , height for the diamonds ,  it doesn't make any sense , so probably it's a misinput Data
print(train_data[train_data["x"]==0])
print(train_data[train_data["y"]==0])
print(train_data[train_data["z"]==0])

# so i will drop them 
train_data = train_data.drop(train_data[train_data["x"]==0].index)
train_data = train_data.drop(train_data[train_data["y"]==0].index)
train_data = train_data.drop(train_data[train_data["z"]==0].index)


          Id  carat        cut color clarity  depth  table  price    x     y  \
1221    1222   1.14       Fair     G     VS1   57.5   67.0   6381  0.0  0.00   
6765    6766   0.71       Good     F     SI2   64.1   60.0   2130  0.0  0.00   
22171  22172   1.00  Very Good     H     VS2   63.3   53.0   5139  0.0  0.00   
24901  24902   2.25    Premium     H     SI2   62.8   59.0  18034  0.0  0.00   
39201  39202   1.07      Ideal     F     SI2   61.6   56.0   4954  0.0  6.62   

         z  
1221   0.0  
6765   0.0  
22171  0.0  
24901  0.0  
39201  0.0  
          Id  carat        cut color clarity  depth  table  price    x    y  \
1221    1222   1.14       Fair     G     VS1   57.5   67.0   6381  0.0  0.0   
6765    6766   0.71       Good     F     SI2   64.1   60.0   2130  0.0  0.0   
22171  22172   1.00  Very Good     H     VS2   63.3   53.0   5139  0.0  0.0   
24901  24902   2.25    Premium     H     SI2   62.8   59.0  18034  0.0  0.0   

         z  
1221   0.0  
6765   0.0  
22171 

In [29]:
train_data.shape

(43135, 11)

In [30]:
# Calculate Z-Score for each numerical column , (observation - mean)/ Standard deviation 
z_scores = (train_data.select_dtypes(include=np.number) - train_data.select_dtypes(include=np.number).mean()) / train_data.select_dtypes(include=np.number).std()

# Identify outliers (threshold of 3 for demonstration)
outliers = (z_scores > 3) | (z_scores < -3)

# Display outliers
print("Outliers:")
print(train_data[outliers.any(axis=1)])

Outliers:
          Id  carat        cut color clarity  depth  table  price     x     y  \
23        24   1.70      Ideal     G    VVS1   61.0   56.0  18279  7.62  7.67   
62        63   2.04    Premium     I     SI1   62.2   57.0  16942  8.14  8.07   
120      121   2.10       Fair     F     SI2   59.5   69.0  16506  8.35  8.25   
128      129   2.03      Ideal     F     SI2   61.4   58.0  18535  8.12  8.16   
156      157   0.70       Good     H     VS2   56.7   63.0   2187  5.87  5.92   
...      ...    ...        ...   ...     ...    ...    ...    ...   ...   ...   
43075  43076   2.09    Premium     F     SI2   61.7   59.0  17840  8.21  8.23   
43093  43094   1.09       Good     G     SI2   57.4   61.0   3424  6.82  6.75   
43099  43100   2.22  Very Good     I     SI2   63.2   57.0  16547  8.28  8.23   
43111  43112   1.75    Premium     F     VS2   61.4   58.0  17017  7.72  7.76   
43138  43139   2.26    Premium     I     SI2   62.0   58.0  16241  8.40  8.37   

          z  
23 

In [31]:
# Create a new DataFrame with no outliers
data_no_outliers = train_data[~outliers.any(axis=1)]


In [32]:
# drop the id column 
train_data.drop(columns=["Id"], inplace=True)


In [33]:
# rechecking the info
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43135 entries, 0 to 43151
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    43135 non-null  float64
 1   cut      43135 non-null  object 
 2   color    43135 non-null  object 
 3   clarity  43135 non-null  object 
 4   depth    43135 non-null  float64
 5   table    43135 non-null  float64
 6   price    43135 non-null  int64  
 7   x        43135 non-null  float64
 8   y        43135 non-null  float64
 9   z        43135 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.6+ MB


In [34]:
test_data

Unnamed: 0,Id,carat,cut,color,clarity,depth,table,x,y,z
0,1,0.34,Ideal,G,VVS2,61.1,57.0,4.52,4.48,2.75
1,2,0.71,Premium,E,VS2,62.7,58.0,5.74,5.68,3.58
2,3,0.44,Very Good,I,VS1,62.8,56.0,4.83,4.88,3.05
3,4,0.81,Premium,E,SI2,60.1,59.0,6.09,6.03,3.65
4,5,0.40,Ideal,G,VVS1,61.2,56.0,4.74,4.80,2.92
...,...,...,...,...,...,...,...,...,...,...
10783,10784,0.57,Ideal,H,VS1,60.9,56.0,5.34,5.36,3.26
10784,10785,1.05,Ideal,G,VS2,60.8,57.0,6.65,6.58,4.02
10785,10786,0.71,Ideal,E,VVS1,62.3,55.0,5.68,5.72,3.55
10786,10787,1.11,Premium,E,SI2,61.0,60.0,6.68,6.66,4.07


In [35]:

# Apply label encoder to each column with categorical data
lb = LabelEncoder()

# Define order for encoding , adding weights 
color_weights = {'D':7,'E':6,'F':5,'G':4, 'H':3, 'I':2,'J':1}
cut_weights = {'Ideal':5, 'Premium':4,'Very Good':3, 'Good':2, 'Fair':1}
clarity_weights = {'IF': 8,'VVS1':7,'VVS2':6, 'VS1':5, 'VS2':4 , 'SI1':3,'SI2':2,'I1':1}



# Label Encoding
label_encoder = LabelEncoder()

# Apply label encoding with weights
data_no_outliers['cut'] = data_no_outliers['cut'].map(cut_weights)
data_no_outliers['color'] = data_no_outliers['color'].map(color_weights)
data_no_outliers['clarity'] = data_no_outliers['clarity'].map(clarity_weights)

test_data['cut'] = test_data['cut'].map(cut_weights)
test_data['color'] = test_data['color'].map(color_weights)
test_data['clarity'] = test_data['clarity'].map(clarity_weights)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_no_outliers['cut'] = data_no_outliers['cut'].map(cut_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_no_outliers['color'] = data_no_outliers['color'].map(color_weights)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_no_outliers['clarity'] = data_no_outliers['clarity'].map(cla

In [36]:
print(test_data)

          Id  carat  cut  color  clarity  depth  table     x     y     z
0          1   0.34    5      4        6   61.1   57.0  4.52  4.48  2.75
1          2   0.71    4      6        4   62.7   58.0  5.74  5.68  3.58
2          3   0.44    3      2        5   62.8   56.0  4.83  4.88  3.05
3          4   0.81    4      6        2   60.1   59.0  6.09  6.03  3.65
4          5   0.40    5      4        7   61.2   56.0  4.74  4.80  2.92
...      ...    ...  ...    ...      ...    ...    ...   ...   ...   ...
10783  10784   0.57    5      3        5   60.9   56.0  5.34  5.36  3.26
10784  10785   1.05    5      4        4   60.8   57.0  6.65  6.58  4.02
10785  10786   0.71    5      6        7   62.3   55.0  5.68  5.72  3.55
10786  10787   1.11    4      6        2   61.0   60.0  6.68  6.66  4.07
10787  10788   2.00    4      5        2   61.5   59.0  8.03  7.95  4.92

[10788 rows x 10 columns]


In [None]:
print(train_data)

In [37]:
data_no_outliers
data_no_outliers["cut"].unique()

array([5, 4, 3, 1, 2])

## MODELS

In [38]:
# specifying X and y 
X= data_no_outliers.drop(columns=["price"])
y= data_no_outliers["price"]


In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [40]:
# Train the model
model = LinearRegression()
model.fit(X_train,y_train)

In [41]:
# predict 
y_pred = model.predict(X_test)


In [43]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Root Mean Squared Error :",np.sqrt(mse) )
print("R^2 Score:", r2)


Root Mean Squared Error : 991.8669887244645
R^2 Score: 0.9173816131014771


In [46]:
#since the linear regression didnt give me good performance , i will try other models

# Initialize models
models = {
    "RandomForestRegressor": RandomForestRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "SVR": SVR(),
    "Lasso": Lasso(),
    "Ridge": Ridge()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R^2 Score: {r2}")
    print()

    # Check if RMSE is less than 500
    """if rmse < 500:
        print(f"{name} meets the criteria of RMSE < 500.")
        break"""


Model: RandomForestRegressor
Root Mean Squared Error: 452.00020148697587
R^2 Score: 0.9828427730742741

Model: GradientBoostingRegressor
Root Mean Squared Error: 496.07930809068307
R^2 Score: 0.9793332558138895

Model: SVR
Root Mean Squared Error: 3681.984083210836
R^2 Score: -0.13850161788924886

Model: Lasso
Root Mean Squared Error: 993.5696868931828
R^2 Score: 0.9170977143075354

Model: Ridge
Root Mean Squared Error: 991.8533879368714
R^2 Score: 0.9173838788638026



  model = cd_fast.enet_coordinate_descent(


In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Define parameter grids for Random Forest and Gradient Boosting
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Initialize models
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# Grid search for Random Forest
rf_grid_search = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# Grid search for Gradient Boosting
gb_grid_search = GridSearchCV(gb_model, param_grid_gb, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
gb_grid_search.fit(X_train, y_train)
best_gb_model = gb_grid_search.best_estimator_

# Evaluate best models
rf_rmse = sqrt(mean_squared_error(y_test, best_rf_model.predict(X_test)))
gb_rmse = sqrt(mean_squared_error(y_test, best_gb_model.predict(X_test)))

print("Random Forest RMSE:", rf_rmse)
print("Gradient Boosting RMSE:", gb_rmse)




Random Forest RMSE: 446.32820769309836
Gradient Boosting RMSE: 440.0446948879311
