# Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np


import sys
import re
import statistics

import plotly.express as px
import matplotlib.pyplot as plt
import cufflinks as cf
cf.go_offline()
import seaborn as sns

import math

# Read diamonds.db

In [2]:
# Notebook-wide pandas settings: show every column when a frame is displayed,
# and turn off the chained-assignment (SettingWithCopy) check.
pd.options.display.max_columns = None
pd.options.mode.chained_assignment = None

In [3]:
# Load every table of the diamonds_train SQLite database into its own DataFrame.
conn = sqlite3.connect("../data/diamonds_train.db")
try:
    diamonds_city = pd.read_sql_query("SELECT * FROM diamonds_city", conn)
    diamonds_clarity = pd.read_sql_query("SELECT * FROM diamonds_clarity", conn)
    diamonds_color = pd.read_sql_query("SELECT * FROM diamonds_color", conn)
    diamonds_cut = pd.read_sql_query("SELECT * FROM diamonds_cut", conn)
    diamonds_dimensions = pd.read_sql_query("SELECT * FROM diamonds_dimensions", conn)
    diamonds_properties = pd.read_sql_query("SELECT * FROM diamonds_properties", conn)
    diamonds_transactional = pd.read_sql_query("SELECT * FROM diamonds_transactional", conn)
finally:
    # All tables are now in memory; close the connection even if a query
    # failed so the database file is not left open for the kernel's lifetime.
    conn.close()

# Merge diamonds.db

In [4]:
# Join the dimensions, properties and transactional tables on 'index_id' ...
df1 = (
    diamonds_dimensions
    .merge(diamonds_properties, on='index_id')
    .merge(diamonds_transactional, on='index_id')
)
# ... then join the cut, color, clarity and city tables on their id columns.
df2 = df1.merge(diamonds_cut, on='cut_id')
df3 = df2.merge(diamonds_color, on='color_id')
df4 = df3.merge(diamonds_clarity, on='clarity_id')
df5 = df4.merge(diamonds_city, on='city_id')
#df5

In [5]:
# Keep only the columns needed downstream. df6 still carries 'city';
# diamonds_train drops it and puts the target ('price') first.
df6 = df5.loc[:, ["depth", "table", "x", "y", "z", "price", "carat", "cut", "color", "clarity", 'city']]
diamonds_train = df6.loc[:, ["price", "carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]]
# Order the rows by ascending price and renumber them from 0.
diamonds_train_sorted = (
    diamonds_train
    .sort_values(by='price')
    .reset_index(drop=True)
)
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
1,326,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
2,327,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,335,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [38]:
"""
Explanation of the diamonds Features:
price: price in USD
carat: weight of the diamond
cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
color: diamond colour, from J (worst) to D (best)
clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
x: length in mm
y: width in mm
z: depth in mm
depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
table: width of top of diamond relative to widest point (43--95)
city: city where the diamonds is reported to be sold.
"""

'\nExplanation of the diamonds Features:\nprice: price in USD\ncarat: weight of the diamond\ncut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)\ncolor: diamond colour, from J (worst) to D (best)\nclarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))\nx: length in mm\ny: width in mm\nz: depth in mm\ndepth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)\ntable: width of top of diamond relative to widest point (43--95)\ncity: city where the diamonds is reported to be sold.\n'

# Label encoding diamonds.db

In [6]:
# Label encoding of cut, color and clarity: each category is replaced by an
# integer, ordered according to which grade is more valuable than the other.
cut_encoding = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}
def cut_label_encoding(x):
    """Return the integer code of cut grade ``x``.

    Parameters
    ----------
    x : hashable
        A cut label, expected to be one of the keys of ``cut_encoding``.

    Returns
    -------
    int or None
        The code stored in ``cut_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return cut_encoding.get(x)

In [7]:
# Swap every cut label for its integer code, then display the frame.
diamonds_train_sorted['cut'] = diamonds_train_sorted['cut'].map(cut_label_encoding)
diamonds_train_sorted

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,3,E,SI1,59.8,61.0,3.89,3.84,2.31
1,326,0.23,4,E,SI2,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,E,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,3,I,VS2,62.4,58.0,4.20,4.23,2.63
4,335,0.31,1,J,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
40450,18795,2.04,3,H,SI1,58.1,60.0,8.37,8.28,4.84
40451,18797,2.29,3,I,SI1,61.8,59.0,8.52,8.45,5.24
40452,18806,1.51,4,G,IF,61.7,55.0,7.37,7.41,4.56
40453,18818,2.00,2,G,SI1,63.5,56.0,7.90,7.97,5.04


In [8]:
# Integer code per color grade, from 'J' (0) up to 'D' (6).
color_encoding = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6}
def color_label_encoding(x):
    """Return the integer code of color grade ``x``.

    Parameters
    ----------
    x : hashable
        A color label, expected to be one of the keys of ``color_encoding``.

    Returns
    -------
    int or None
        The code stored in ``color_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return color_encoding.get(x)

In [9]:
# Swap every color label for its integer code, then display the frame.
diamonds_train_sorted['color'] = diamonds_train_sorted['color'].map(color_label_encoding)
diamonds_train_sorted

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,3,5,SI1,59.8,61.0,3.89,3.84,2.31
1,326,0.23,4,5,SI2,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,5,VS1,56.9,65.0,4.05,4.07,2.31
3,334,0.29,3,1,VS2,62.4,58.0,4.20,4.23,2.63
4,335,0.31,1,0,SI2,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
40450,18795,2.04,3,2,SI1,58.1,60.0,8.37,8.28,4.84
40451,18797,2.29,3,1,SI1,61.8,59.0,8.52,8.45,5.24
40452,18806,1.51,4,3,IF,61.7,55.0,7.37,7.41,4.56
40453,18818,2.00,2,3,SI1,63.5,56.0,7.90,7.97,5.04


In [10]:
# Integer code per clarity grade, from 'I1' (0) up to 'IF' (7).
clarity_encoding = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}
def clarity_label_encoding(x):
    """Return the integer code of clarity grade ``x``.

    Parameters
    ----------
    x : hashable
        A clarity label, expected to be one of the keys of ``clarity_encoding``.

    Returns
    -------
    int or None
        The code stored in ``clarity_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return clarity_encoding.get(x)

In [11]:
# Swap every clarity label for its integer code, then display the frame.
diamonds_train_sorted['clarity'] = diamonds_train_sorted['clarity'].map(clarity_label_encoding)
diamonds_train_sorted

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,3,5,2,59.8,61.0,3.89,3.84,2.31
1,326,0.23,4,5,1,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,5,4,56.9,65.0,4.05,4.07,2.31
3,334,0.29,3,1,3,62.4,58.0,4.20,4.23,2.63
4,335,0.31,1,0,1,63.3,58.0,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
40450,18795,2.04,3,2,2,58.1,60.0,8.37,8.28,4.84
40451,18797,2.29,3,1,2,61.8,59.0,8.52,8.45,5.24
40452,18806,1.51,4,3,7,61.7,55.0,7.37,7.41,4.56
40453,18818,2.00,2,3,2,63.5,56.0,7.90,7.97,5.04


# Feature engineering

In [38]:
#diamonds_train_sorted['table_xy'] = (diamonds_train_sorted['table'].mean()*(diamonds_train_sorted['x']*diamonds_train_sorted['y']).mean()-diamonds_train_sorted['table']*(diamonds_train_sorted['x']*diamonds_train_sorted['y']))
#diamonds_train_sorted['depth_z'] = (diamonds_train_sorted['depth'].mean()*diamonds_train_sorted['z'].mean()-diamonds_train_sorted['depth']*diamonds_train_sorted['z'])
#diamonds_train_sorted['volume'] = diamonds_train_sorted['x']*diamonds_train_sorted['y']*diamonds_train_sorted['z']
#diamonds_train_sorted['x_y_z'] = (2*diamonds_train_sorted['z'])/(diamonds_train_sorted['x']+diamonds_train_sorted['y'])
#diamonds_train_sorted['dtc'] = diamonds_train_sorted['depth'] / diamonds_train_sorted['table'] * diamonds_train_sorted['carat']

In [39]:
# Length-to-width ratio (x divided by y). Author's rationale: the ratio is
# tied to the diamond's shape, and the shape strongly influences the price.
diamonds_train_sorted['ratio_length_width'] = diamonds_train_sorted['x'].div(diamonds_train_sorted['y'])
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,326,0.21,3,5,2,59.8,61.0,3.89,3.84,2.31,1.013021,-1.560648,3
1,326,0.23,4,5,1,61.5,55.0,3.95,3.98,2.43,0.992462,-1.469676,7
2,327,0.23,1,5,4,56.9,65.0,4.05,4.07,2.31,0.995086,-1.469676,2
3,334,0.29,3,1,3,62.4,58.0,4.2,4.23,2.63,0.992908,-1.237874,6
4,335,0.31,1,0,1,63.3,58.0,4.34,4.35,2.75,0.997701,-1.171183,6


In [13]:
# Natural logarithm of carat. Author's rationale: carat relates to price
# exponentially rather than linearly, so the log is added as a feature.
# Computed in one vectorised call instead of a Python loop over math.log.
# NOTE: a non-positive carat now yields -inf/NaN (with a NumPy warning)
# where math.log would have raised ValueError.
carat_log = np.log(diamonds_train_sorted['carat'].to_numpy())
diamonds_train_sorted['carat_log'] = carat_log
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log
0,326,0.21,3,5,2,59.8,61.0,3.89,3.84,2.31,1.013021,-1.560648
1,326,0.23,4,5,1,61.5,55.0,3.95,3.98,2.43,0.992462,-1.469676
2,327,0.23,1,5,4,56.9,65.0,4.05,4.07,2.31,0.995086,-1.469676
3,334,0.29,3,1,3,62.4,58.0,4.2,4.23,2.63,0.992908,-1.237874
4,335,0.31,1,0,1,63.3,58.0,4.34,4.35,2.75,0.997701,-1.171183


# Create diamond shape

In [14]:
# Derive a "shape" label for every diamond from its table and depth
# percentages (author's rationale: the shape influences the price, and these
# percentage ranges were taken from external information about each shape).
#
# The rules are evaluated in the order listed and the first matching rule
# wins -- the same precedence as an if/elif chain -- with 'others' as the
# fallback. np.select evaluates them for the whole column at once instead of
# indexing the frame row by row.
#
# NOTE(review): 'Emerald' and 'Heart' can never be assigned. The Emerald rule
# is a subset of the Cushion rule and the Heart rule is a subset of the Oval
# rule, and both supersets are tested first. The thresholds are kept exactly
# as they were so the feature values do not change; confirm the intended
# ranges before relying on these two labels.
table_pct = diamonds_train_sorted['table'].to_numpy()
depth_pct = diamonds_train_sorted['depth'].to_numpy()
shape_rules = [
    ('Round',    (54 < table_pct) & (table_pct < 57) & (59 < depth_pct) & (depth_pct < 62.6)),
    ('Cushion',  (table_pct < 68) & (61 < depth_pct) & (depth_pct < 68)),
    ('Princess', (69 < table_pct) & (table_pct < 75) & (68 < depth_pct) & (depth_pct < 74)),
    ('Emerald',  (60 < table_pct) & (table_pct < 68) & (61 < depth_pct) & (depth_pct < 68)),
    ('Oval',     (53 < table_pct) & (table_pct < 65) & (depth_pct < 68)),
    ('Radiant',  (61 < table_pct) & (table_pct < 69) & (depth_pct < 67)),
    ('Heart',    (53 < table_pct) & (table_pct < 63) & (56 < depth_pct) & (depth_pct < 62)),
]
# .tolist() keeps `shape` a plain list of Python strings, one per row.
shape = np.select(
    [condition for _, condition in shape_rules],
    [label for label, _ in shape_rules],
    default='others',
).tolist()

In [15]:
# Store the per-row labels held in `shape` as a new 'shape' column and
# preview the first rows.
diamonds_train_sorted['shape'] = shape
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,326,0.21,3,5,2,59.8,61.0,3.89,3.84,2.31,1.013021,-1.560648,Oval
1,326,0.23,4,5,1,61.5,55.0,3.95,3.98,2.43,0.992462,-1.469676,Round
2,327,0.23,1,5,4,56.9,65.0,4.05,4.07,2.31,0.995086,-1.469676,Radiant
3,334,0.29,3,1,3,62.4,58.0,4.2,4.23,2.63,0.992908,-1.237874,Cushion
4,335,0.31,1,0,1,63.3,58.0,4.34,4.35,2.75,0.997701,-1.171183,Cushion


# Shape label encoding

In [16]:
# Replace each shape label with an integer code; the author chose the codes
# according to how each shape relates to price.
shape_codes = {
    'Round': 7,
    'Cushion': 6,
    'Princess': 5,
    'Emerald': 4,
    'Oval': 3,
    'Radiant': 2,
    'Heart': 1,
    'others': 0,
}
diamonds_train_sorted['shape'] = diamonds_train_sorted['shape'].map(shape_codes)
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,326,0.21,3,5,2,59.8,61.0,3.89,3.84,2.31,1.013021,-1.560648,3
1,326,0.23,4,5,1,61.5,55.0,3.95,3.98,2.43,0.992462,-1.469676,7
2,327,0.23,1,5,4,56.9,65.0,4.05,4.07,2.31,0.995086,-1.469676,2
3,334,0.29,3,1,3,62.4,58.0,4.2,4.23,2.63,0.992908,-1.237874,6
4,335,0.31,1,0,1,63.3,58.0,4.34,4.35,2.75,0.997701,-1.171183,6


In [45]:
"""
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Amsterdam','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Antwerp','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('London','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Luxembourg','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Madrid','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Paris','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Zurich','Europe')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Dubai','Asia')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Surat','Asia')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Tel Aviv','Asia')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Kimberly','Africa')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Las Vegas','North America')
diamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('New York City','North America')
diamonds_train_sorted = pd.get_dummies(diamonds_train_sorted, columns = ['city'])
diamonds_train_sorted
"""

"\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Amsterdam','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Antwerp','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('London','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Luxembourg','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Madrid','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Paris','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Zurich','Europe')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Dubai','Asia')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Surat','Asia')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['city'].str.replace('Tel Aviv','Asia')\ndiamonds_train_sorted['city'] = diamonds_train_sorted['c

In [46]:
"""
diamonds_train_sorted['city_Africa'] = diamonds_train_sorted['city_Africa'].astype(int)
diamonds_train_sorted['city_Asia'] = diamonds_train_sorted['city_Asia'].astype(int)
diamonds_train_sorted['city_Europe'] = diamonds_train_sorted['city_Europe'].astype(int)
diamonds_train_sorted['city_North America'] = diamonds_train_sorted['city_North America'].astype(int)
diamonds_train_sorted
"""

"\ndiamonds_train_sorted['city_Africa'] = diamonds_train_sorted['city_Africa'].astype(int)\ndiamonds_train_sorted['city_Asia'] = diamonds_train_sorted['city_Asia'].astype(int)\ndiamonds_train_sorted['city_Europe'] = diamonds_train_sorted['city_Europe'].astype(int)\ndiamonds_train_sorted['city_North America'] = diamonds_train_sorted['city_North America'].astype(int)\ndiamonds_train_sorted\n"

# Dropping dimensionless diamonds

In [17]:
# Some diamonds have a value of 0 in x, y or z; keep only the rows where all
# three dimensions are different from zero.
has_all_dimensions = (diamonds_train_sorted[["x", "y", "z"]] != 0).all(axis=1)
diamonds_train_sorted = diamonds_train_sorted[has_all_dimensions]


# Dropping the outliers

In [18]:
# Drop outliers: keep depth strictly inside (45, 75), table strictly inside
# (40, 80), x and y below 30, and z strictly inside (2, 30).
within_limits = (
    diamonds_train_sorted["depth"].gt(45) & diamonds_train_sorted["depth"].lt(75)
    & diamonds_train_sorted["table"].gt(40) & diamonds_train_sorted["table"].lt(80)
    & diamonds_train_sorted["x"].lt(30)
    & diamonds_train_sorted["y"].lt(30)
    & diamonds_train_sorted["z"].gt(2) & diamonds_train_sorted["z"].lt(30)
)
diamonds_train_sorted = diamonds_train_sorted[within_limits]


# Generate csv with the cleaned data

In [19]:
# Write the cleaned training frame to CSV; index=False leaves the row index out.
diamonds_train_sorted.to_csv('../data/diamonds_train_sorted.csv', index=False)

# Read diamonds_test

In [20]:
# Load the test set from CSV and preview its first rows.
diamonds_test = pd.read_csv('../data/diamonds_test.csv')
diamonds_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


# Label encoding diamonds_test

In [21]:
# Label encoding for the test set. This repeats the cut encoder defined for
# the training data earlier in the notebook, with the same codes.
cut_encoding = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}
def cut_label_encoding(x):
    """Return the integer code of cut grade ``x``.

    Parameters
    ----------
    x : hashable
        A cut label, expected to be one of the keys of ``cut_encoding``.

    Returns
    -------
    int or None
        The code stored in ``cut_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return cut_encoding.get(x)

In [22]:
# Swap every cut label of the test set for its integer code.
diamonds_test['cut'] = diamonds_test['cut'].map(cut_label_encoding)

In [23]:
# Integer code per color grade, from 'J' (0) up to 'D' (6). This repeats the
# color encoder defined for the training data earlier in the notebook.
color_encoding = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6}
def color_label_encoding(x):
    """Return the integer code of color grade ``x``.

    Parameters
    ----------
    x : hashable
        A color label, expected to be one of the keys of ``color_encoding``.

    Returns
    -------
    int or None
        The code stored in ``color_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return color_encoding.get(x)

In [24]:
# Swap every color label of the test set for its integer code.
diamonds_test['color'] = diamonds_test['color'].map(color_label_encoding)

In [25]:
# Integer code per clarity grade, from 'I1' (0) up to 'IF' (7). This repeats
# the clarity encoder defined for the training data earlier in the notebook.
clarity_encoding = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}
def clarity_label_encoding(x):
    """Return the integer code of clarity grade ``x``.

    Parameters
    ----------
    x : hashable
        A clarity label, expected to be one of the keys of ``clarity_encoding``.

    Returns
    -------
    int or None
        The code stored in ``clarity_encoding``; None when ``x`` is not a key.
    """
    # A direct dictionary lookup replaces the former scan over every key.
    return clarity_encoding.get(x)

In [26]:
# Swap every clarity label of the test set for its integer code.
diamonds_test['clarity'] = diamonds_test['clarity'].map(clarity_label_encoding)

In [27]:
diamonds_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,Amsterdam


In [28]:
# Keep only the feature columns ('id' and 'city' are left out). The explicit
# .copy() makes this an independent frame: later cells add columns to it and
# overwrite values in it, which must not be done on a selection of
# diamonds_test.
diamonds_test_processed = diamonds_test[["carat", "cut", "color", "clarity", "depth", "table", "x", "y", "z"]].copy()
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19


In [59]:
"""
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Amsterdam','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Antwerp','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('London','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Luxembourg','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Madrid','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Paris','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Zurich','Europe')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Dubai','Asia')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Surat','Asia')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Tel Aviv','Asia')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Kimberly','Africa')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Las Vegas','North America')
diamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('New York City','North America')
diamonds_test_processed = pd.get_dummies(diamonds_test_processed, columns = ['city'])
diamonds_test_processed
"""

"\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Amsterdam','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Antwerp','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('London','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Luxembourg','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Madrid','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Paris','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Zurich','Europe')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Dubai','Asia')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Surat','Asia')\ndiamonds_test_processed['city'] = diamonds_test_processed['city'].str.replace('Tel Aviv','Asia')\ndiamonds_test_pr

In [60]:
"""
diamonds_test_processed['city_Africa'] = diamonds_test_processed['city_Africa'].astype(int)
diamonds_test_processed['city_Asia'] = diamonds_test_processed['city_Asia'].astype(int)
diamonds_test_processed['city_Europe'] = diamonds_test_processed['city_Europe'].astype(int)
diamonds_test_processed['city_North America'] = diamonds_test_processed['city_North America'].astype(int)
diamonds_test_processed
"""

"\ndiamonds_test_processed['city_Africa'] = diamonds_test_processed['city_Africa'].astype(int)\ndiamonds_test_processed['city_Asia'] = diamonds_test_processed['city_Asia'].astype(int)\ndiamonds_test_processed['city_Europe'] = diamonds_test_processed['city_Europe'].astype(int)\ndiamonds_test_processed['city_North America'] = diamonds_test_processed['city_North America'].astype(int)\ndiamonds_test_processed\n"

# Feature engineering test dataset

In [30]:
# Length-to-width ratio (x divided by y) for the test set.
# The zero values of x, y and z are only replaced by the median further down
# in the notebook, so dividing the raw columns here would produce inf (y == 0)
# or NaN (x == 0 and y == 0) that nothing corrects afterwards. For the ratio,
# a zero is therefore substituted by the median of the column's non-zero
# values; the x and y columns themselves are not modified by this cell.
ratio_dims = {}
for dim in ('x', 'y'):
    values = diamonds_test_processed[dim]
    ratio_dims[dim] = values.mask(values == 0, values[values != 0].median())
diamonds_test_processed['ratio_length_width'] = ratio_dims['x'] / ratio_dims['y']
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,0.993475
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141


In [31]:
# Natural logarithm of carat for the test set, computed in one vectorised
# call instead of a Python loop over math.log.
# NOTE: a non-positive carat now yields -inf/NaN (with a NumPy warning)
# where math.log would have raised ValueError.
carat_log = np.log(diamonds_test_processed['carat'].to_numpy())
diamonds_test_processed['carat_log'] = carat_log
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115,-0.235722
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389,0.182322
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197,0.451076
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,0.993475,-0.105361
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141,-0.693147


# Create diamond shape test dataset

In [32]:
# Derive a "shape" label for every test diamond from its table and depth
# percentages.
#
# The rules are evaluated in the order listed and the first matching rule
# wins -- the same precedence as an if/elif chain -- with 'others' as the
# fallback. np.select evaluates them for the whole column at once instead of
# indexing the frame row by row.
#
# NOTE(review): 'Emerald' and 'Heart' can never be assigned. The Emerald rule
# is a subset of the Cushion rule and the Heart rule is a subset of the Oval
# rule, and both supersets are tested first. The thresholds are kept exactly
# as they were so the feature values do not change; confirm the intended
# ranges before relying on these two labels.
table_pct = diamonds_test_processed['table'].to_numpy()
depth_pct = diamonds_test_processed['depth'].to_numpy()
shape_rules = [
    ('Round',    (54 < table_pct) & (table_pct < 57) & (59 < depth_pct) & (depth_pct < 62.6)),
    ('Cushion',  (table_pct < 68) & (61 < depth_pct) & (depth_pct < 68)),
    ('Princess', (69 < table_pct) & (table_pct < 75) & (68 < depth_pct) & (depth_pct < 74)),
    ('Emerald',  (60 < table_pct) & (table_pct < 68) & (61 < depth_pct) & (depth_pct < 68)),
    ('Oval',     (53 < table_pct) & (table_pct < 65) & (depth_pct < 68)),
    ('Radiant',  (61 < table_pct) & (table_pct < 69) & (depth_pct < 67)),
    ('Heart',    (53 < table_pct) & (table_pct < 63) & (56 < depth_pct) & (depth_pct < 62)),
]
# .tolist() keeps `shape` a plain list of Python strings, one per row.
shape = np.select(
    [condition for _, condition in shape_rules],
    [label for label, _ in shape_rules],
    default='others',
).tolist()

In [34]:
# Store the per-row labels held in `shape` as a new 'shape' column and
# preview the first rows.
diamonds_test_processed['shape'] = shape
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115,-0.235722,Cushion
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389,0.182322,Oval
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197,0.451076,Cushion
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,0.993475,-0.105361,Cushion
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141,-0.693147,Cushion


# Shape label encoding test dataset

In [35]:
# Replace each shape label of the test set with its integer code.
shape_codes = {
    'Round': 7,
    'Cushion': 6,
    'Princess': 5,
    'Emerald': 4,
    'Oval': 3,
    'Radiant': 2,
    'Heart': 1,
    'others': 0,
}
diamonds_test_processed['shape'] = diamonds_test_processed['shape'].map(shape_codes)
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115,-0.235722,6
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389,0.182322,3
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197,0.451076,6
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,0.993475,-0.105361,6
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141,-0.693147,6


# Replace the zero values with the median

In [36]:
# Rows cannot be removed from the test dataset, so the zero values of x, y
# and z are replaced by the median of the column instead.
def remove_column_zero(col):
    """Replace the zeros of one column of ``diamonds_test_processed`` in place.

    Parameters
    ----------
    col : str
        Name of the column of the module-level ``diamonds_test_processed``
        frame to clean.

    Returns
    -------
    None
        The frame is modified in place. Every entry equal to 0 is set to the
        median of the column's non-zero entries; a column without zeros is
        left untouched.
    """
    is_zero = diamonds_test_processed[col] == 0
    if not is_zero.any():
        return
    # Median of the valid measurements only, computed once. It used to be
    # recomputed for every zero found, over a column still containing zeros.
    replacement = diamonds_test_processed.loc[~is_zero, col].median()
    # One .loc assignment on the frame itself. The former chained form
    # `frame[col][i] = value` may write to a temporary object and leave the
    # frame unchanged (it always does under pandas copy-on-write).
    diamonds_test_processed.loc[is_zero, col] = replacement
remove_column_zero('x')
remove_column_zero('y')
remove_column_zero('z')
diamonds_test_processed.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,ratio_length_width,carat_log,shape
0,0.79,2,4,2,62.7,60.0,5.82,5.89,3.67,0.988115,-0.235722,6
1,1.2,4,0,4,61.0,57.0,6.81,6.89,4.18,0.988389,0.182322,3
2,1.57,3,2,2,62.2,61.0,7.38,7.32,4.57,1.008197,0.451076,6
3,0.9,2,4,2,63.8,54.0,6.09,6.13,3.9,0.993475,-0.105361,6
4,0.5,2,4,4,62.9,58.0,5.05,5.09,3.19,0.992141,-0.693147,6


# Dropping dimensionless diamonds

In [62]:
"""
diamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["x"] == 0].index)
diamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["y"] == 0].index)
diamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["z"] == 0].index)
"""

'\ndiamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["x"] == 0].index)\ndiamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["y"] == 0].index)\ndiamonds_test_processed = diamonds_test_processed.drop(diamonds_test_processed[diamonds_test_processed["z"] == 0].index)\n'

# Dropping the outliers

In [63]:
"""
diamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["depth"]<75)&(diamonds_test_processed["depth"]>45)]
diamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["table"]<80)&(diamonds_test_processed["table"]>40)]
diamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["x"]<30)]
diamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["y"]<30)]
diamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["z"]<30)&(diamonds_test_processed["z"]>2)]
"""

'\ndiamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["depth"]<75)&(diamonds_test_processed["depth"]>45)]\ndiamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["table"]<80)&(diamonds_test_processed["table"]>40)]\ndiamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["x"]<30)]\ndiamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["y"]<30)]\ndiamonds_test_processed = diamonds_test_processed[(diamonds_test_processed["z"]<30)&(diamonds_test_processed["z"]>2)]\n'

In [37]:
# Write the processed test frame to CSV; index=False leaves the row index out.
diamonds_test_processed.to_csv('../data/diamonds_test_processed.csv', index=False)