# **Import Modules**

In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting
%matplotlib inline
import seaborn as sns #sstatistical  data visualisation 

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost
import lightgbm
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn import metrics
from sklearn.metrics import mean_squared_error

# **Import Data**

In [2]:
# Read the competition's training and test CSVs (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    ('/kaggle/input/shai-club/train.csv', '/kaggle/input/shai-club/test.csv'),
)

# **Data Wrangling**

In [3]:
# Peek at the first rows of the raw training frame.
train_df.head()

In [4]:
# Work on a copy of the training frame without the 'Id' column;
# train_df itself is left untouched.
df = train_df.drop(columns=['Id'])
df.head()

In [5]:
# Summary statistics of the training frame (Id already dropped).
df.describe()

In [6]:
# Column dtypes and non-null counts.
df.info()

We can see there are:
- **2 Integer** type features.
- **6 Float type** features.
- **3 Object type** features.

So, we have to encode those categorical features as we can feed only numerical features into the machine learning model.

# **Data Cleaning**

Start cleaning the data: first, find out whether there are any NULL values.

In [7]:
# Per column: True if at least one value is missing.
df.isna().any()

Then, check for duplicated rows.

In [8]:
# Boolean mask with one entry per row, flagging rows that repeat an earlier row.
df.duplicated()

In [9]:
# Number of rows flagged as duplicates.
df.duplicated().sum()

In [10]:
# Remove repeated rows in place (the first occurrence of each is kept, which is
# the pandas default), then re-count duplicates to confirm none remain.
df.drop_duplicates(inplace=True)
df.duplicated().sum()

In [11]:
# Shape after removing duplicate rows.
df.shape

# **Data Preprocessing**

* **Encoding Categorical Variables:** convert categorical variables to numerical variables where the measurement or number has a numerical meaning.

In [12]:
# #one hot encoding method
# df = pd.get_dummies(df)
# df.head()

Using Ordinal Encoding:

In [13]:
# Collect the names of the object-dtype (categorical) columns.
# `s` is a boolean Series indexed by column name.
s = (df.dtypes == "object")
object_cols = [column for column, is_object in s.items() if is_object]
print("Categorical variables:", object_cols)

In [14]:
# Ordinal-encode every categorical column so each category string becomes a
# numeric code. The fitted encoder stays available in `encoder`.
encoder = OrdinalEncoder().fit(df[object_cols])
df[object_cols] = encoder.transform(df[object_cols])
df.head()

In [15]:
# Summary statistics again, now that the categorical columns hold numeric codes.
df.describe()

* **Handling outliers:** find out whether there are outliers in the data that will affect the model.

In [16]:
# Histogram of every column; the trailing semicolon suppresses the text repr.
df.hist(figsize = (10,10));

In [17]:
# Scatter each numeric feature against price (no diagonal plots).
sns.pairplot(
    data=df,
    y_vars="price",
    x_vars=["depth", "table", "carat", 'x', 'y', 'z'],
    diag_kind=None,
)

The plots show some zero values that we did not detect before. **Let's make sure.**

In [18]:
# Count zero entries per column by mapping 0 to NaN and counting the NaNs.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
temp = df[['x','y','z','depth','table','carat','price']].replace(0, np.nan)
temp.isnull().sum()

In [19]:
# Dropping dimensionless diamonds: remove, one dimension at a time, every row
# whose x, y or z is exactly 0.
for dimension in ('x', 'y', 'z'):
    df.drop(df[df[dimension] == 0].index, inplace=True)
df.shape

After deleting the **dimensionless data** [this is missing information that, if included, would affect the model's results], let's take a closer look at the **outliers**.

In [20]:
# One box plot per numeric feature, each in its own subplot, to expose outliers.
data = df[['depth', 'table', 'carat', 'x', 'y', 'z']]
data.plot.box(subplots=True, figsize=(20, 10))
plt.show()

In [21]:
# Dropping the outliers: keep only rows that satisfy every bound below.
inlier_mask = (
    df["y"].lt(20)
    & df["z"].lt(10) & df["z"].gt(2)
    & df["carat"].lt(4)
    & df["table"].lt(80) & df["table"].gt(45)
    & df["depth"].lt(80) & df["depth"].gt(45)
)
df = df[inlier_mask]
df.shape

In [22]:
# y against price, with the fitted regression line drawn in red.
sns.lmplot(data=df, x="price", y="y", line_kws={"color": 'red'})

In [23]:
# z against price, with the fitted regression line drawn in red.
sns.lmplot(data=df, x="price", y="z", line_kws={"color": 'red'})

In [24]:
# depth against price, with the fitted regression line drawn in red.
sns.lmplot(data=df, x="price", y="depth", line_kws={"color": 'red'})

In [25]:
# table against price, with the fitted regression line drawn in red.
sns.lmplot(data=df, x="price", y="table", line_kws={"color": 'red'})

In [26]:
# carat against price, with the fitted regression line drawn in red.
sns.lmplot(data=df, x="price", y="carat", line_kws={"color": 'red'})

In [27]:
# def iqr_outliers(column):
#     Q1,Q3 = np.percentile(column , [25,75])
#     iqr = Q3 - Q1
#     lower_range = Q1 - (1.5 * iqr)
#     upper_range = Q3 + (1.5 * iqr)
#     return lower_range,upper_range  

In [28]:
# caratlower,caratupper = iqr_outliers(df.carat)
# df.drop(df[ (df.carat > caratupper) | (df.carat < caratlower) ].index , inplace=True)

# **Feature Engineering**

In [29]:
# df['volume'] = df['x']*df['y']*df['z']

Eliminate some features that lead to the same result; as we can see in our data, **depth = z / mean(x, y)**.

In [30]:
# Remove the three dimension columns from the training frame in place.
df.drop(columns=['x', 'y', 'z'], inplace=True)
df.shape

In [31]:
# Annotated heatmap of the pairwise correlations between the remaining columns.
plt.figure(figsize=(10, 10))
heatmap = sns.heatmap(data=df.corr(), annot=True)
heatmap.set_title(label='Correlation Heatmap', fontdict={'fontsize': 12}, pad=12);

# **Model Building**

In [32]:
# The target (y) is the price column; the features (X) are every other column.
y = df["price"]
X = df.drop(columns=["price"])
# Hold out 10% of the rows for evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

In [33]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [34]:
# For the sake of automation, let's create a function to train the model and generate the R2 score

def R2_function(regressor,X_train,y_train,X_test,y_test):
    """Fit `regressor` on the training split and return its R2 on the test split.

    Parameters:
        regressor: estimator exposing `fit(X, y)` and `predict(X)`; it is
            fitted in place.
        X_train, y_train: features and target used for fitting.
        X_test, y_test: features and target used for scoring.

    Returns:
        The coefficient of determination (R2) of the predictions on `X_test`.
    """
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    # r2_score rather than explained_variance_score: explained variance ignores
    # a systematic offset in the predictions, so it can overstate what is
    # reported as "R2" by the caller.
    return metrics.r2_score(y_test, predictions)

In [35]:
# Instantiating all the models that we are going to apply.
# Each one gets the same fixed seed as the train/test split so that the scores
# are reproducible from run to run.

rfr = RandomForestRegressor(random_state=42)
xgb = xgboost.XGBRegressor(random_state=42)
tree = DecisionTreeRegressor(random_state=42)
cat = CatBoostRegressor(random_state=42)
lgb = lightgbm.LGBMRegressor(random_state=42)

In [36]:
# Fit and score each candidate in turn, printing one line per model.
models_list = [rfr, xgb, tree, cat, lgb]

for model in models_list:
    # Capture the model's text form before fitting, then score it.
    model_label = format(model)
    model_score = R2_function(model, X_train, y_train, X_test, y_test)
    print(f'{model_label} R2 score is: {model_score} \n')

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
# #params_dict = {'objective':["reg:squarederror"], 'n_estimators':[40,60,80,100]}
# #GridSearch = GridSearchCV(estimator=xgb, param_grid=params_dict,scoring='r2')
# parameters = {'objective':["RMSE"], 'iterations':[1000,1500,2000,2500]}
# GridSearch = GridSearchCV(estimator=cat, param_grid=parameters, scoring='r2')
# GridSearch.fit(X,y)
# print("Done")

In [39]:
# GridSearch.best_params_

The grid search found `iterations = 2000` to be the best parameter value.

In [40]:
# Final estimator configured with the parameters chosen for it.
GridSearch_BestParam = CatBoostRegressor(objective='RMSE', iterations=2000)
#GridSearch_BestParam = xgboost.XGBRegressor(objective="reg:squarederror", n_estimators=62)
#GridSearch_BestParam = RandomForestRegressor(n_estimators=80)

# Evaluate on the held-out split first.
GridSearch_BestParam.fit(X_train,y_train)
predictions = GridSearch_BestParam.predict(X_test)
# r2_score is used so the number matches its "R2 score" label;
# explained_variance_score is a different metric that ignores systematic bias.
print(f"R2 score: {metrics.r2_score(y_test,predictions)}")
print(f"Mean absolute error: {metrics.mean_absolute_error(y_test,predictions)}")
print(f"Mean squared error: {metrics.mean_squared_error(y_test,predictions)}")
print(f"Root Mean squared error: {np.sqrt(metrics.mean_squared_error(y_test,predictions))}")
# Refit on all labelled rows for the final predictions. X is the unscaled
# feature frame, so anything passed to `model.predict` must be unscaled too.
model = GridSearch_BestParam.fit(X, y)

# **Prepare Testing**

In [41]:
# Peek at the first rows of the raw test frame.
test_df.head()

Apply the same preprocessing we used on the training data so that the test data matches it.

In [42]:
# test_df['volume'] = test_df['x']*test_df['y']*test_df['z']

In [43]:
# Remove the three dimension columns from the test frame in place.
test_df.drop(columns=['x', 'y', 'z'], inplace=True)
test_df.shape

In [44]:
# #one hot encoding method for test
# test_df = pd.get_dummies(test_df)
# test_df.head()

In [45]:
# Collect the names of the object-dtype (categorical) columns of the test frame.
# `s2` is a boolean Series indexed by column name.
s2 = (test_df.dtypes == "object")
object_cols_test = [column for column, is_object in s2.items() if is_object]
print("Categorical variables:", object_cols_test)

In [46]:
# Encode the test categoricals with the encoder that was already fitted on the
# training data, so every category gets the same numeric code the model was
# trained on. Re-fitting a new encoder on the test set could assign different
# codes if the test set has a different set of categories; with the default
# encoder settings, `transform` instead raises on a category it has not seen.
test_df[object_cols] = encoder.transform(test_df[object_cols])
test_df.head()

In [47]:
# Feature frame for prediction: the test data without 'Id'.
# test_df keeps the column because the submission needs it.
test = test_df.drop(columns=['Id'])

In [48]:
# Predict a price for every test row and pair it with that row's Id.
predictions = pd.Series(data=model.predict(test))
pred = pd.DataFrame(data={'Id': test_df['Id'], 'price': predictions})

# **File Output**

In [49]:
# Write the submission file without the index column, then preview it.
pred.to_csv(path_or_buf='submission.csv', index=False)
pred.head()

In [50]:
# Shape of the submission frame.
pred.shape