In [None]:
# Clear the interactive namespace before anything else runs
# (-f: do not ask for confirmation, -s: soft reset).
%reset -fs

# Import all libraries

In [None]:
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
diamond_df = pd.read_csv('diamonds.csv', index_col = 0)

In [None]:
diamond_df.head(20)

In [None]:
diamond_df.shape

In [None]:
diamond_df.isna().sum()

In [None]:
diamond_df.dtypes

In [None]:
diamond_df.cut.value_counts()

In [None]:
# Ordinal encodings: each label is replaced by an ascending integer code.
# `cut` is inspected above but was never encoded, which left a string column
# in the frame that later reaches `model.corr()` and the XGBoost fit.
# NOTE(review): the `cut` labels below assume the standard diamonds dataset
# grades -- confirm against the value_counts() output above. An unexpected
# label raises instead of silently becoming NaN.
cut_mapping = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}
color_mapping = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6}

for column, mapping in (('cut', cut_mapping), ('color', color_mapping)):
    # Skip columns that are already numeric so that re-running this cell does
    # not map the integer codes to NaN.
    if pd.api.types.is_numeric_dtype(diamond_df[column]):
        continue
    encoded = diamond_df[column].map(mapping)
    # Values that were present before mapping but are NaN afterwards were not
    # covered by the mapping.
    unmapped = encoded.isna() & diamond_df[column].notna()
    if unmapped.any():
        raise ValueError(
            f"Unmapped labels in '{column}': "
            f"{sorted(diamond_df.loc[unmapped, column].unique())}"
        )
    diamond_df[column] = encoded

In [None]:
diamond_df.clarity.value_counts()

In [None]:
clarity_mapping = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}
diamond_df.clarity = diamond_df.clarity.map(clarity_mapping)

In [None]:
diamond_df.describe()

In [None]:
diamond_df = diamond_df.drop(diamond_df[diamond_df["x"]==0].index)
diamond_df = diamond_df.drop(diamond_df[diamond_df["y"]==0].index)
diamond_df = diamond_df.drop(diamond_df[diamond_df["z"]==0].index)

In [None]:
diamond_df.shape

In [None]:
# Trim the upper tail of each listed column: keep only rows strictly below
# that column's 99th percentile. The filters run one after another, so each
# percentile is computed on the rows that survived the previous filters.
for trimmed_column in ['depth', 'table', 'x', 'y', 'z']:
    upper_bound = diamond_df[trimmed_column].quantile(0.99)
    diamond_df = diamond_df[diamond_df[trimmed_column] < upper_bound]

In [None]:
model = diamond_df.copy()

In [None]:
model

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# numeric_only=True is passed explicitly: pandas >= 2.0 raises on non-numeric
# columns instead of silently skipping them.
# The axes created here are handed to seaborn so the figure size applies to
# the plot that is actually drawn.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(model.corr(numeric_only=True), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation matrix');

In [None]:
X = model.drop(['price'], axis=1)
y = model['price']

In [None]:
X

In [None]:
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
X_train

In [None]:
X_test

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=0)

# XGBoost regressor: hyperparameter search, training and evaluation

In [None]:
# Hyperparameter grid for the search. Five parameters have three candidate
# values each (3**5 = 243 combinations); `objective` and `n_estimators` are
# fixed to a single value.
parameters = dict(
    objective=['reg:squarederror'],
    learning_rate=[0.0001, 0.001, 0.01],
    max_depth=[3, 5, 7],
    min_child_weight=[3, 5, 7],
    subsample=[0.1, 0.5, 1.0],
    colsample_bytree=[0.1, 0.5, 1.0],
    n_estimators=[500],
)

# Base estimator with library defaults; the grid search supplies the values
# listed in `parameters`.
xgb1 = XGBRegressor()

In [None]:
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 3,
                        n_jobs = -1,
                        verbose=0)

In [None]:
xgb_grid.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the parameter combination that produced it.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

# Estimator selected by the grid search.
xgb_cv = xgb_grid.best_estimator_

# Data monitored during the final fit: training set first, validation set
# second.
eval_set = [
    (X_train, y_train),
    (X_val, y_val),
]



In [None]:
# `eval_metric` and `early_stopping_rounds` are estimator parameters: passing
# them to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where the
# original call raises a TypeError.
xgb_cv.set_params(eval_metric='mae', early_stopping_rounds=50)

# Refit the selected estimator while tracking MAE on both evaluation sets.
# Early stopping uses the last entry of `eval_set` (the validation data) and
# halts after 50 rounds without improvement.
fit_model = xgb_cv.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    verbose=False,
)

In [None]:
print("MAE:", mean_absolute_error(y_val, fit_model.predict(X_val)))
print("MSE:", mean_squared_error(y_val, fit_model.predict(X_val)))
print("R2:", r2_score(y_val, fit_model.predict(X_val)))



In [None]:
print("MAE:", mean_absolute_error(y_test, fit_model.predict(X_test)))
print("MSE:", mean_squared_error(y_test, fit_model.predict(X_test)))
print("R2:", r2_score(y_test, fit_model.predict(X_test)))

In [None]:

# Persist the fitted model to 'xgb_model.json' (path relative to the
# current working directory).
fit_model.save_model('xgb_model.json')