In [1]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns
#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#Pre_processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost
from xgboost import XGBRegressor
#Errors
from sklearn.metrics import mean_absolute_error , mean_squared_error , mean_absolute_percentage_error , r2_score
# Silence warnings for the rest of the session: with no category or message
# argument, the 'ignore' action applies to every warning that is raised.
# NOTE(review): this also hides deprecation notices from the libraries imported
# above; consider narrowing the filter to a specific category if that matters.
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Fetch the diamonds table through seaborn's dataset loader
df = sns.load_dataset(name='diamonds')
# Preview the first two records as the cell output
df.head(n=2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [3]:
# Target is the price column; the features are every remaining column
y = df['price']
X = df.drop(columns=['price'])

In [4]:
# Column groups: each list is routed to its own transformer below
Numeric_features = ['carat','depth','table','x','y','z']
categorical_features = ['cut','color','clarity']


# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones.
# handle_unknown='ignore' makes a category level that was absent when the
# encoder was fitted encode as an all-zero row instead of raising an error
# at transform/predict time.
pre_processing = ColumnTransformer([
    ('num' , StandardScaler() , Numeric_features),
    ('cat' , OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [None]:
# One seed shared by the split and the model, so the source of randomness is
# configured in a single place
SEED = 42

# Hold out 20% of the rows for evaluation
X_train , X_test , y_train , y_test = train_test_split(X , y , train_size=0.8 , random_state=SEED)


# Build the pipeline: preprocessing first, then the regressor
pipeline = Pipeline([
    ('processing' , pre_processing),
    ('model' , XGBRegressor(random_state=SEED))
])

# Fit on the training rows only; the preprocessing step is fitted as part of
# this call, so the held-out rows do not influence it
pipeline.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = pipeline.predict(X_test)

# Evaluation: the MSE is computed once and reused for the RMSE
mse = mean_squared_error(y_test , y_pred)
print('MAE:', mean_absolute_error(y_test , y_pred))
print('MSE:', mse)
print('RMSE', np.sqrt(mse))
print('MAPE', mean_absolute_percentage_error(y_test , y_pred))
print('r2_score', r2_score(y_test, y_pred))

MAE: 285.6134524615906
MSE: 318286.3109612889
RMSE 564.168690163934
MAPE 0.07398491535044392
r2_score 0.9799779904468846
