In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gemstone-price-prediction/cubic_zirconia.csv
/kaggle/input/gemstone-price-prediction/Data Dictionary.xlsx


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Location of the dataset inside the Kaggle input mount
file_path = r'/kaggle/input/gemstone-price-prediction/cubic_zirconia.csv'

try:
    # Only the read itself is guarded, so unrelated errors are not misreported
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file was not found at {file_path}. Please check the path and try again.")
    print("Make sure you have added the 'cubic_zirconia.csv' dataset to your notebook.")
    # Re-raise so the notebook stops here: the following cells use `df`, and
    # continuing without it would only surface later as a confusing NameError.
    raise
else:
    print("Dataset loaded successfully.")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
    print("\n")

Dataset loaded successfully.

First 5 rows of the dataset:
   Unnamed: 0  carat        cut color clarity  depth  table     x     y     z  \
0           1   0.30      Ideal     E     SI1   62.1   58.0  4.27  4.29  2.66   
1           2   0.33    Premium     G      IF   60.8   58.0  4.42  4.46  2.70   
2           3   0.90  Very Good     E    VVS2   62.2   60.0  6.04  6.12  3.78   
3           4   0.42      Ideal     F     VS1   61.6   56.0  4.82  4.80  2.96   
4           5   0.31      Ideal     F    VVS1   60.4   59.0  4.35  4.43  2.65   

   price  
0    499  
1    984  
2   6289  
3   1082  
4    779  




In [3]:
# 2. Handle Missing Values and Categorical Features

# Remove any rows that contain missing values
df_clean = df.dropna()

# The CSV carries a leftover row-number column ('Unnamed: 0'). Drop it by name
# rather than by position, so a real feature is never removed by accident if
# the column is absent or the column order changes; errors='ignore' makes the
# drop a no-op when the column is not there.
# Then one-hot encode the categorical columns; drop_first=True omits one
# indicator column per category.
df_encoded = pd.get_dummies(
    df_clean.drop(columns=['Unnamed: 0'], errors='ignore'),
    columns=['cut', 'color', 'clarity'],
    drop_first=True,
)

print("Columns after cleaning and One-Hot Encoding:")
print(df_encoded.columns)
print("\n")

Columns after cleaning and One-Hot Encoding:
Index(['carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'cut_Good',
       'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_E', 'color_F',
       'color_G', 'color_H', 'color_I', 'color_J', 'clarity_IF', 'clarity_SI1',
       'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1',
       'clarity_VVS2'],
      dtype='object')




In [4]:
# 3. Separate the predictors (X) from the value being predicted (y)
# Everything except 'price' is a feature; 'price' is the regression target.
X = df_encoded.drop(columns='price')
y = df_encoded['price']

print("Shape of Features (X):", X.shape)
print("Shape of Target (y):", y.shape)
print("\n")

Shape of Features (X): (26270, 23)
Shape of Target (y): (26270,)




In [5]:
# 4. Hold out 20% of the rows for testing; the fixed random_state keeps the
#    split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [6]:
# 5. Build the Random Forest Regressor and fit it on the training split
# n_estimators: number of decision trees in the forest
# random_state: fixed seed; n_jobs=-1: use all available cores
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself

print("Random Forest Regressor model has been trained.")
print("\n")

Random Forest Regressor model has been trained.




In [7]:
# 6. Predict prices for the held-out test rows
predictions = model.predict(X_test)

# 7. Compare predictions with the true test prices
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print("Random Forest Regressor Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

Random Forest Regressor Model Evaluation:
Mean Absolute Error (MAE): 320.34
Mean Squared Error (MSE): 457662.00
R-squared (R²): 0.97


In [9]:
import joblib

# Save the trained model together with the feature column order it was
# fitted on, so both can be reloaded later.
feature_names = list(X.columns)
joblib.dump(model, 'random_forest_model.joblib')
joblib.dump(feature_names, 'feature_names.joblib')

['feature_names.joblib']